1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
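// For example, getPow2VectorType(<3 x s32>) yields <4 x s32>; a vector that
// already has a power-of-two element count is returned unchanged.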
51 
// Round the scalar size up to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
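// For example, getPow2ScalarType(s48) yields s64, and a type that is already
// a power of two in size (e.g. s32) is returned as-is.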
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
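// Illustration: s32, s64 and <2 x s32> satisfy this predicate (the scalar or
// element size is a 32-bit multiple and the total size fits in MaxSize),
// while s16 or <4 x s16> do not.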
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
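// For example, <3 x s16> (48 bits) is a small odd vector, but <3 x s32> is
// not (its elements are not smaller than 32 bits) and <4 x s16> is not (even
// element count, 64 bits total).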
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
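// For example, applied to <4 x s32> (128 bits) this produces <2 x s32>:
// Pieces = 2 and NewNumElts = (4 + 1) / 2 = 2. It is used below together
// with vectorWiderThan(0, 64) to break wide vectors into 64-bit chunks.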
110 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
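// For example, <3 x s16> (48 bits) is padded to <4 x s16> (64 bits): the next
// 32-bit multiple is 2 dwords, and (64 + 15) / 16 = 4 elements are needed to
// cover it.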
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
// Register types: vectors with 32-, 64-, 128- or 256-bit elements, vectors of
// 16-bit elements with an even element count (multiples of v2s16), and
// scalars that are a multiple of 32 bits, up to 1024 bits.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
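// Illustration: s96, <2 x s16> and <4 x s32> are register types, while
// <3 x s16> (odd count of 16-bit elements) and s24 (not a 32-bit multiple)
// are not.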
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
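// For example, a truncating store of an s64 value with a 16-bit memory size
// matches; a plain s32 store or any vector store does not.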
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S128 = LLT::scalar(128);
195   const LLT S256 = LLT::scalar(256);
196   const LLT S1024 = LLT::scalar(1024);
197 
198   const LLT V2S16 = LLT::vector(2, 16);
199   const LLT V4S16 = LLT::vector(4, 16);
200 
201   const LLT V2S32 = LLT::vector(2, 32);
202   const LLT V3S32 = LLT::vector(3, 32);
203   const LLT V4S32 = LLT::vector(4, 32);
204   const LLT V5S32 = LLT::vector(5, 32);
205   const LLT V6S32 = LLT::vector(6, 32);
206   const LLT V7S32 = LLT::vector(7, 32);
207   const LLT V8S32 = LLT::vector(8, 32);
208   const LLT V9S32 = LLT::vector(9, 32);
209   const LLT V10S32 = LLT::vector(10, 32);
210   const LLT V11S32 = LLT::vector(11, 32);
211   const LLT V12S32 = LLT::vector(12, 32);
212   const LLT V13S32 = LLT::vector(13, 32);
213   const LLT V14S32 = LLT::vector(14, 32);
214   const LLT V15S32 = LLT::vector(15, 32);
215   const LLT V16S32 = LLT::vector(16, 32);
216   const LLT V32S32 = LLT::vector(32, 32);
217 
218   const LLT V2S64 = LLT::vector(2, 64);
219   const LLT V3S64 = LLT::vector(3, 64);
220   const LLT V4S64 = LLT::vector(4, 64);
221   const LLT V5S64 = LLT::vector(5, 64);
222   const LLT V6S64 = LLT::vector(6, 64);
223   const LLT V7S64 = LLT::vector(7, 64);
224   const LLT V8S64 = LLT::vector(8, 64);
225   const LLT V16S64 = LLT::vector(16, 64);
226 
227   std::initializer_list<LLT> AllS32Vectors =
228     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
229      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
230   std::initializer_list<LLT> AllS64Vectors =
231     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
232 
233   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
234   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
235   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
236   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
237   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
238   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
239   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
240 
241   const LLT CodePtr = FlatPtr;
242 
243   const std::initializer_list<LLT> AddrSpaces64 = {
244     GlobalPtr, ConstantPtr, FlatPtr
245   };
246 
247   const std::initializer_list<LLT> AddrSpaces32 = {
248     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
249   };
250 
251   const std::initializer_list<LLT> FPTypesBase = {
252     S32, S64
253   };
254 
255   const std::initializer_list<LLT> FPTypes16 = {
256     S32, S64, S16
257   };
258 
259   const std::initializer_list<LLT> FPTypesPK16 = {
260     S32, S64, S16, V2S16
261   };
262 
263   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
264 
265   setAction({G_BRCOND, S1}, Legal); // VCC branches
266   setAction({G_BRCOND, S32}, Legal); // SCC branches
267 
268   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
269   // elements for v3s16
270   getActionDefinitionsBuilder(G_PHI)
271     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
272     .legalFor(AllS32Vectors)
273     .legalFor(AllS64Vectors)
274     .legalFor(AddrSpaces64)
275     .legalFor(AddrSpaces32)
276     .clampScalar(0, S32, S256)
277     .widenScalarToNextPow2(0, 32)
278     .clampMaxNumElements(0, S32, 16)
279     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
280     .legalIf(isPointer(0));
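  // Note: rules in a LegalizeRuleSet are tried in order and the first one
  // whose predicate matches decides the action, so the clamp/widen steps
  // above only fire for types not already covered by the legalFor lists, and
  // the trailing legalIf keeps PHIs of any pointer type legal.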
281 
282   if (ST.has16BitInsts()) {
283     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
284       .legalFor({S32, S16})
285       .clampScalar(0, S16, S32)
286       .scalarize(0)
287       .widenScalarToNextPow2(0, 32);
288   } else {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32})
291       .clampScalar(0, S32, S32)
292       .scalarize(0);
293   }
294 
295   // FIXME: Not really legal. Placeholder for custom lowering.
296   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
297     .legalFor({S32, S64})
298     .clampScalar(0, S32, S64)
299     .widenScalarToNextPow2(0, 32)
300     .scalarize(0);
301 
302   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
303     .legalFor({S32})
304     .clampScalar(0, S32, S32)
305     .scalarize(0);
306 
307   // Report legal for any types we can handle anywhere. For the cases only legal
308   // on the SALU, RegBankSelect will be able to re-legalize.
309   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
310     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
311     .clampScalar(0, S32, S64)
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
314     .widenScalarToNextPow2(0)
315     .scalarize(0);
316 
317   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
318                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
319     .legalFor({{S32, S1}, {S32, S32}})
320     .clampScalar(0, S32, S32)
321     .scalarize(0); // TODO: Implement.
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
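  // E.g. a bitcast between <2 x s16> and s32 is legal here since both are
  // register types; bitcasts involving non-register types fall through to
  // the generic lower() action.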
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
341     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344     .clampScalarOrElt(0, S32, S1024)
345     .legalIf(isMultiple32(0))
346     .widenScalarToNextPow2(0, 32)
347     .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
351     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
352   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
353 
354   auto &FPOpActions = getActionDefinitionsBuilder(
355     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
356     .legalFor({S32, S64});
357   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
358     .customFor({S32, S64});
359   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
360     .customFor({S32, S64});
361 
362   if (ST.has16BitInsts()) {
363     if (ST.hasVOP3PInsts())
364       FPOpActions.legalFor({S16, V2S16});
365     else
366       FPOpActions.legalFor({S16});
367 
368     TrigActions.customFor({S16});
369     FDIVActions.customFor({S16});
370   }
371 
372   auto &MinNumMaxNum = getActionDefinitionsBuilder({
373       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
374 
375   if (ST.hasVOP3PInsts()) {
376     MinNumMaxNum.customFor(FPTypesPK16)
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampMaxNumElements(0, S16, 2)
379       .clampScalar(0, S16, S64)
380       .scalarize(0);
381   } else if (ST.has16BitInsts()) {
382     MinNumMaxNum.customFor(FPTypes16)
383       .clampScalar(0, S16, S64)
384       .scalarize(0);
385   } else {
386     MinNumMaxNum.customFor(FPTypesBase)
387       .clampScalar(0, S32, S64)
388       .scalarize(0);
389   }
390 
391   if (ST.hasVOP3PInsts())
392     FPOpActions.clampMaxNumElements(0, S16, 2);
393 
394   FPOpActions
395     .scalarize(0)
396     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
397 
398   TrigActions
399     .scalarize(0)
400     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
401 
402   FDIVActions
403     .scalarize(0)
404     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
405 
406   getActionDefinitionsBuilder({G_FNEG, G_FABS})
407     .legalFor(FPTypesPK16)
408     .clampMaxNumElements(0, S16, 2)
409     .scalarize(0)
410     .clampScalar(0, S16, S64);
411 
412   if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
414       .legalFor({S32, S64, S16})
415       .scalarize(0)
416       .clampScalar(0, S16, S64);
417   } else {
418     getActionDefinitionsBuilder(G_FSQRT)
419       .legalFor({S32, S64})
420       .scalarize(0)
421       .clampScalar(0, S32, S64);
422 
423     if (ST.hasFractBug()) {
424       getActionDefinitionsBuilder(G_FFLOOR)
425         .customFor({S64})
426         .legalFor({S32, S64})
427         .scalarize(0)
428         .clampScalar(0, S32, S64);
429     } else {
430       getActionDefinitionsBuilder(G_FFLOOR)
431         .legalFor({S32, S64})
432         .scalarize(0)
433         .clampScalar(0, S32, S64);
434     }
435   }
436 
437   getActionDefinitionsBuilder(G_FPTRUNC)
438     .legalFor({{S32, S64}, {S16, S32}})
439     .scalarize(0);
440 
441   getActionDefinitionsBuilder(G_FPEXT)
442     .legalFor({{S64, S32}, {S32, S16}})
443     .lowerFor({{S64, S16}}) // FIXME: Implement
444     .scalarize(0);
445 
446   getActionDefinitionsBuilder(G_FSUB)
447       // Use actual fsub instruction
448       .legalFor({S32})
449       // Must use fadd + fneg
450       .lowerFor({S64, S16, V2S16})
451       .scalarize(0)
452       .clampScalar(0, S32, S64);
453 
454   // Whether this is legal depends on the floating point mode for the function.
455   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
456   if (ST.hasMadF16())
457     FMad.customFor({S32, S16});
458   else
459     FMad.customFor({S32});
460   FMad.scalarize(0)
461       .lower();
462 
463   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
464     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
465                {S32, S1}, {S64, S1}, {S16, S1}})
466     .scalarize(0)
467     .clampScalar(0, S32, S64)
468     .widenScalarToNextPow2(1, 32);
469 
470   // TODO: Split s1->s64 during regbankselect for VALU.
471   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
472     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
473     .lowerFor({{S32, S64}})
474     .lowerIf(typeIs(1, S1))
475     .customFor({{S64, S64}});
476   if (ST.has16BitInsts())
477     IToFP.legalFor({{S16, S16}});
478   IToFP.clampScalar(1, S32, S64)
479        .scalarize(0);
480 
481   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
482     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
483     .customFor({{S64, S64}});
484   if (ST.has16BitInsts())
485     FPToI.legalFor({{S16, S16}});
486   else
487     FPToI.minScalar(1, S32);
488 
489   FPToI.minScalar(0, S32)
490        .scalarize(0)
491        .lower();
492 
493   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
494     .scalarize(0)
495     .lower();
496 
497   if (ST.has16BitInsts()) {
498     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
499       .legalFor({S16, S32, S64})
500       .clampScalar(0, S16, S64)
501       .scalarize(0);
502   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
503     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
504       .legalFor({S32, S64})
505       .clampScalar(0, S32, S64)
506       .scalarize(0);
507   } else {
508     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
509       .legalFor({S32})
510       .customFor({S64})
511       .clampScalar(0, S32, S64)
512       .scalarize(0);
513   }
514 
515   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
516     .scalarize(0)
517     .alwaysLegal();
518 
519   auto &CmpBuilder =
520     getActionDefinitionsBuilder(G_ICMP)
521     // The compare output type differs based on the register bank of the output,
522     // so make both s1 and s32 legal.
523     //
524     // Scalar compares producing output in scc will be promoted to s32, as that
525     // is the allocatable register type that will be needed for the copy from
526     // scc. This will be promoted during RegBankSelect, and we assume something
527     // before that won't try to use s32 result types.
528     //
529     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
530     // bank.
531     .legalForCartesianProduct(
532       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
533     .legalForCartesianProduct(
534       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
535   if (ST.has16BitInsts()) {
536     CmpBuilder.legalFor({{S1, S16}});
537   }
538 
539   CmpBuilder
540     .widenScalarToNextPow2(1)
541     .clampScalar(1, S32, S64)
542     .scalarize(0)
543     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
544 
545   getActionDefinitionsBuilder(G_FCMP)
546     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
547     .widenScalarToNextPow2(1)
548     .clampScalar(1, S32, S64)
549     .scalarize(0);
550 
551   // FIXME: fpow has a selection pattern that should move to custom lowering.
552   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
553   if (ST.has16BitInsts())
554     Exp2Ops.legalFor({S32, S16});
555   else
556     Exp2Ops.legalFor({S32});
557   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
558   Exp2Ops.scalarize(0);
559 
560   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
561   if (ST.has16BitInsts())
562     ExpOps.customFor({{S32}, {S16}});
563   else
564     ExpOps.customFor({S32});
565   ExpOps.clampScalar(0, MinScalarFPTy, S32)
566         .scalarize(0);
567 
568   // The 64-bit versions produce 32-bit results, but only on the SALU.
569   getActionDefinitionsBuilder(G_CTPOP)
570     .legalFor({{S32, S32}, {S32, S64}})
571     .clampScalar(0, S32, S32)
572     .clampScalar(1, S32, S64)
573     .scalarize(0)
574     .widenScalarToNextPow2(0, 32)
575     .widenScalarToNextPow2(1, 32);
576 
  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but the generic
  // instructions are defined to produce the bitwidth.
580   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
581     .scalarize(0)
582     .clampScalar(0, S32, S32)
583     .clampScalar(1, S32, S64)
584     .widenScalarToNextPow2(0, 32)
585     .widenScalarToNextPow2(1, 32)
586     .lower();
587 
588   // The 64-bit versions produce 32-bit results, but only on the SALU.
589   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
590     .legalFor({{S32, S32}, {S32, S64}})
591     .clampScalar(0, S32, S32)
592     .clampScalar(1, S32, S64)
593     .scalarize(0)
594     .widenScalarToNextPow2(0, 32)
595     .widenScalarToNextPow2(1, 32);
596 
597   // TODO: Expand for > s32
598   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
599     .legalFor({S32})
600     .clampScalar(0, S32, S32)
601     .scalarize(0);
602 
603   if (ST.has16BitInsts()) {
604     if (ST.hasVOP3PInsts()) {
605       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
606         .legalFor({S32, S16, V2S16})
607         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608         .clampMaxNumElements(0, S16, 2)
609         .clampScalar(0, S16, S32)
610         .widenScalarToNextPow2(0)
611         .scalarize(0);
612     } else {
613       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
614         .legalFor({S32, S16})
615         .widenScalarToNextPow2(0)
616         .clampScalar(0, S16, S32)
617         .scalarize(0);
618     }
619   } else {
620     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
621       .legalFor({S32})
622       .clampScalar(0, S32, S32)
623       .widenScalarToNextPow2(0)
624       .scalarize(0);
625   }
626 
627   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
628     return [=](const LegalityQuery &Query) {
629       return Query.Types[TypeIdx0].getSizeInBits() <
630              Query.Types[TypeIdx1].getSizeInBits();
631     };
632   };
633 
634   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
635     return [=](const LegalityQuery &Query) {
636       return Query.Types[TypeIdx0].getSizeInBits() >
637              Query.Types[TypeIdx1].getSizeInBits();
638     };
639   };
640 
641   getActionDefinitionsBuilder(G_INTTOPTR)
642     // List the common cases
643     .legalForCartesianProduct(AddrSpaces64, {S64})
644     .legalForCartesianProduct(AddrSpaces32, {S32})
645     .scalarize(0)
646     // Accept any address space as long as the size matches
647     .legalIf(sameSize(0, 1))
648     .widenScalarIf(smallerThan(1, 0),
649       [](const LegalityQuery &Query) {
650         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
651       })
652     .narrowScalarIf(greaterThan(1, 0),
653       [](const LegalityQuery &Query) {
654         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
655       });
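  // For example, an inttoptr producing a 64-bit flat pointer from an s32
  // source widens the source to s64 first, while an s128 source is narrowed
  // to s64; the {FlatPtr, s64} form is then covered by the rules above.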
656 
657   getActionDefinitionsBuilder(G_PTRTOINT)
658     // List the common cases
659     .legalForCartesianProduct(AddrSpaces64, {S64})
660     .legalForCartesianProduct(AddrSpaces32, {S32})
661     .scalarize(0)
662     // Accept any address space as long as the size matches
663     .legalIf(sameSize(0, 1))
664     .widenScalarIf(smallerThan(0, 1),
665       [](const LegalityQuery &Query) {
666         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
667       })
668     .narrowScalarIf(
669       greaterThan(0, 1),
670       [](const LegalityQuery &Query) {
671         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
672       });
673 
674   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
675     .scalarize(0)
676     .custom();
677 
678   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
679   // handle some operations by just promoting the register during
680   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
681   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
682     switch (AS) {
683     // FIXME: Private element size.
684     case AMDGPUAS::PRIVATE_ADDRESS:
685       return 32;
686     // FIXME: Check subtarget
687     case AMDGPUAS::LOCAL_ADDRESS:
688       return ST.useDS128() ? 128 : 64;
689 
690     // Treat constant and global as identical. SMRD loads are sometimes usable
691     // for global loads (ideally constant address space should be eliminated)
692     // depending on the context. Legality cannot be context dependent, but
693     // RegBankSelect can split the load as necessary depending on the pointer
694     // register bank/uniformity and if the memory is invariant or not written in
695     // a kernel.
696     case AMDGPUAS::CONSTANT_ADDRESS:
697     case AMDGPUAS::GLOBAL_ADDRESS:
698       return IsLoad ? 512 : 128;
699     default:
700       return 128;
701     }
702   };
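  // Illustration of these limits: a 256-bit global load can stay as a single
  // access (loads are allowed up to 512 bits), a 256-bit global store is
  // split into 128-bit pieces, and LDS accesses are capped at 64 bits unless
  // the subtarget supports 128-bit DS operations (useDS128()).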
703 
704   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
705                                     bool IsLoad) -> bool {
706     const LLT DstTy = Query.Types[0];
707 
708     // Split vector extloads.
709     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
710     unsigned Align = Query.MMODescrs[0].AlignInBits;
711 
712     if (MemSize < DstTy.getSizeInBits())
713       MemSize = std::max(MemSize, Align);
714 
715     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
716       return true;
717 
718     const LLT PtrTy = Query.Types[1];
719     unsigned AS = PtrTy.getAddressSpace();
720     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
721       return true;
722 
723     // Catch weird sized loads that don't evenly divide into the access sizes
724     // TODO: May be able to widen depending on alignment etc.
725     unsigned NumRegs = (MemSize + 31) / 32;
726     if (NumRegs == 3) {
727       if (!ST.hasDwordx3LoadStores())
728         return true;
729     } else {
730       // If the alignment allows, these should have been widened.
731       if (!isPowerOf2_32(NumRegs))
732         return true;
733     }
734 
735     if (Align < MemSize) {
736       const SITargetLowering *TLI = ST.getTargetLowering();
737       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
738     }
739 
740     return false;
741   };
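  // Illustration: a 96-bit (3-dword) global access is split on subtargets
  // without dwordx3 load/store instructions, a 256-bit global store is split
  // because it exceeds the 128-bit store limit above, and an under-aligned
  // access is split unless the target reports misaligned accesses of that
  // size as acceptable.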
742 
743   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
744     unsigned Size = Query.Types[0].getSizeInBits();
745     if (isPowerOf2_32(Size))
746       return false;
747 
748     if (Size == 96 && ST.hasDwordx3LoadStores())
749       return false;
750 
751     unsigned AddrSpace = Query.Types[1].getAddressSpace();
752     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
753       return false;
754 
755     unsigned Align = Query.MMODescrs[0].AlignInBits;
756     unsigned RoundedSize = NextPowerOf2(Size);
757     return (Align >= RoundedSize);
758   };
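  // For example, a 96-bit load that is 128-bit aligned is widened to 128 bits
  // on subtargets without dwordx3 loads; with lower alignment it is instead
  // left to be split by needToSplitMemOp above.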
759 
760   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
761   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
762   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
763 
764   // TODO: Refine based on subtargets which support unaligned access or 128-bit
765   // LDS
766   // TODO: Unsupported flat for SI.
767 
768   for (unsigned Op : {G_LOAD, G_STORE}) {
769     const bool IsStore = Op == G_STORE;
770 
771     auto &Actions = getActionDefinitionsBuilder(Op);
772     // Whitelist the common cases.
773     // TODO: Loads to s16 on gfx9
774     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
775                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
776                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
777                                       {S128, GlobalPtr, 128, GlobalAlign32},
778                                       {S64, GlobalPtr, 64, GlobalAlign32},
779                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
780                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
781                                       {S32, GlobalPtr, 8, GlobalAlign8},
782                                       {S32, GlobalPtr, 16, GlobalAlign16},
783 
784                                       {S32, LocalPtr, 32, 32},
785                                       {S64, LocalPtr, 64, 32},
786                                       {V2S32, LocalPtr, 64, 32},
787                                       {S32, LocalPtr, 8, 8},
788                                       {S32, LocalPtr, 16, 16},
789                                       {V2S16, LocalPtr, 32, 32},
790 
791                                       {S32, PrivatePtr, 32, 32},
792                                       {S32, PrivatePtr, 8, 8},
793                                       {S32, PrivatePtr, 16, 16},
794                                       {V2S16, PrivatePtr, 32, 32},
795 
796                                       {S32, FlatPtr, 32, GlobalAlign32},
797                                       {S32, FlatPtr, 16, GlobalAlign16},
798                                       {S32, FlatPtr, 8, GlobalAlign8},
799                                       {V2S16, FlatPtr, 32, GlobalAlign32},
800 
801                                       {S32, ConstantPtr, 32, GlobalAlign32},
802                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
803                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
804                                       {S64, ConstantPtr, 64, GlobalAlign32},
805                                       {S128, ConstantPtr, 128, GlobalAlign32},
806                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
807     Actions
808         .customIf(typeIs(1, Constant32Ptr))
809         // Widen suitably aligned loads by loading extra elements.
810         .moreElementsIf([=](const LegalityQuery &Query) {
811             const LLT Ty = Query.Types[0];
812             return Op == G_LOAD && Ty.isVector() &&
813                    shouldWidenLoadResult(Query);
814           }, moreElementsToNextPow2(0))
815         .widenScalarIf([=](const LegalityQuery &Query) {
816             const LLT Ty = Query.Types[0];
817             return Op == G_LOAD && !Ty.isVector() &&
818                    shouldWidenLoadResult(Query);
819           }, widenScalarOrEltToNextPow2(0))
820         .narrowScalarIf(
821             [=](const LegalityQuery &Query) -> bool {
822               return !Query.Types[0].isVector() &&
823                      needToSplitMemOp(Query, Op == G_LOAD);
824             },
825             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
826               const LLT DstTy = Query.Types[0];
827               const LLT PtrTy = Query.Types[1];
828 
829               const unsigned DstSize = DstTy.getSizeInBits();
830               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
831 
832               // Split extloads.
833               if (DstSize > MemSize)
834                 return std::make_pair(0, LLT::scalar(MemSize));
835 
836               if (!isPowerOf2_32(DstSize)) {
837                 // We're probably decomposing an odd sized store. Try to split
838                 // to the widest type. TODO: Account for alignment. As-is it
839                 // should be OK, since the new parts will be further legalized.
840                 unsigned FloorSize = PowerOf2Floor(DstSize);
841                 return std::make_pair(0, LLT::scalar(FloorSize));
842               }
843 
844               if (DstSize > 32 && (DstSize % 32 != 0)) {
845                 // FIXME: Need a way to specify non-extload of larger size if
846                 // suitably aligned.
847                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
848               }
849 
850               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
851                                                      Op == G_LOAD);
852               if (MemSize > MaxSize)
853                 return std::make_pair(0, LLT::scalar(MaxSize));
854 
855               unsigned Align = Query.MMODescrs[0].AlignInBits;
856               return std::make_pair(0, LLT::scalar(Align));
857             })
858         .fewerElementsIf(
859             [=](const LegalityQuery &Query) -> bool {
860               return Query.Types[0].isVector() &&
861                      needToSplitMemOp(Query, Op == G_LOAD);
862             },
863             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
864               const LLT DstTy = Query.Types[0];
865               const LLT PtrTy = Query.Types[1];
866 
867               LLT EltTy = DstTy.getElementType();
868               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
869                                                      Op == G_LOAD);
870 
871               // FIXME: Handle widened to power of 2 results better. This ends
872               // up scalarizing.
873               // FIXME: 3 element stores scalarized on SI
874 
875               // Split if it's too large for the address space.
876               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
877                 unsigned NumElts = DstTy.getNumElements();
878                 unsigned EltSize = EltTy.getSizeInBits();
879 
880                 if (MaxSize % EltSize == 0) {
881                   return std::make_pair(
882                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
883                 }
884 
885                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
886 
887                 // FIXME: Refine when odd breakdowns handled
888                 // The scalars will need to be re-legalized.
889                 if (NumPieces == 1 || NumPieces >= NumElts ||
890                     NumElts % NumPieces != 0)
891                   return std::make_pair(0, EltTy);
892 
893                 return std::make_pair(0,
894                                       LLT::vector(NumElts / NumPieces, EltTy));
895               }
896 
897               // FIXME: We could probably handle weird extending loads better.
898               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
899               if (DstTy.getSizeInBits() > MemSize)
900                 return std::make_pair(0, EltTy);
901 
902               unsigned EltSize = EltTy.getSizeInBits();
903               unsigned DstSize = DstTy.getSizeInBits();
904               if (!isPowerOf2_32(DstSize)) {
905                 // We're probably decomposing an odd sized store. Try to split
906                 // to the widest type. TODO: Account for alignment. As-is it
907                 // should be OK, since the new parts will be further legalized.
908                 unsigned FloorSize = PowerOf2Floor(DstSize);
909                 return std::make_pair(
910                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
911               }
912 
913               // Need to split because of alignment.
914               unsigned Align = Query.MMODescrs[0].AlignInBits;
915               if (EltSize > Align &&
916                   (EltSize / Align < DstTy.getNumElements())) {
917                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
918               }
919 
920               // May need relegalization for the scalars.
921               return std::make_pair(0, EltTy);
922             })
923         .minScalar(0, S32);
924 
925     if (IsStore)
926       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
927 
928     // TODO: Need a bitcast lower option?
929     Actions
930         .legalIf([=](const LegalityQuery &Query) {
931           const LLT Ty0 = Query.Types[0];
932           unsigned Size = Ty0.getSizeInBits();
933           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
934           unsigned Align = Query.MMODescrs[0].AlignInBits;
935 
936           // FIXME: Widening store from alignment not valid.
937           if (MemSize < Size)
938             MemSize = std::max(MemSize, Align);
939 
940           // No extending vector loads.
941           if (Size > MemSize && Ty0.isVector())
942             return false;
943 
944           switch (MemSize) {
945           case 8:
946           case 16:
947             return Size == 32;
948           case 32:
949           case 64:
950           case 128:
951             return true;
952           case 96:
953             return ST.hasDwordx3LoadStores();
954           case 256:
955           case 512:
956             return true;
957           default:
958             return false;
959           }
960         })
961         .widenScalarToNextPow2(0)
962         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
963   }
964 
965   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
966                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
967                                                   {S32, GlobalPtr, 16, 2 * 8},
968                                                   {S32, LocalPtr, 8, 8},
969                                                   {S32, LocalPtr, 16, 16},
970                                                   {S32, PrivatePtr, 8, 8},
971                                                   {S32, PrivatePtr, 16, 16},
972                                                   {S32, ConstantPtr, 8, 8},
973                                                   {S32, ConstantPtr, 16, 2 * 8}});
974   if (ST.hasFlatAddressSpace()) {
975     ExtLoads.legalForTypesWithMemDesc(
976         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
977   }
978 
979   ExtLoads.clampScalar(0, S32, S32)
980           .widenScalarToNextPow2(0)
981           .unsupportedIfMemSizeNotPow2()
982           .lower();
983 
984   auto &Atomics = getActionDefinitionsBuilder(
985     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
986      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
987      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
988      G_ATOMICRMW_UMIN})
989     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
990                {S64, GlobalPtr}, {S64, LocalPtr}});
991   if (ST.hasFlatAddressSpace()) {
992     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
993   }
994 
995   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
996     .legalFor({{S32, LocalPtr}});
997 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1000   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1001     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1002                 {S32, FlatPtr}, {S64, FlatPtr}})
1003     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1004                {S32, RegionPtr}, {S64, RegionPtr}});
1005   // TODO: Pointer types, any 32-bit or 64-bit vector
1006 
1007   // Condition should be s32 for scalar, s1 for vector.
1008   getActionDefinitionsBuilder(G_SELECT)
1009     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1010           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1011           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1012     .clampScalar(0, S16, S64)
1013     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1014     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1015     .scalarize(1)
1016     .clampMaxNumElements(0, S32, 2)
1017     .clampMaxNumElements(0, LocalPtr, 2)
1018     .clampMaxNumElements(0, PrivatePtr, 2)
1019     .scalarize(0)
1020     .widenScalarToNextPow2(0)
1021     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1022 
1023   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1024   // be more flexible with the shift amount type.
1025   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1026     .legalFor({{S32, S32}, {S64, S32}});
1027   if (ST.has16BitInsts()) {
1028     if (ST.hasVOP3PInsts()) {
1029       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1030             .clampMaxNumElements(0, S16, 2);
1031     } else
1032       Shifts.legalFor({{S16, S32}, {S16, S16}});
1033 
1034     // TODO: Support 16-bit shift amounts
1035     Shifts.clampScalar(1, S32, S32);
1036     Shifts.clampScalar(0, S16, S64);
1037     Shifts.widenScalarToNextPow2(0, 16);
1038   } else {
1039     // Make sure we legalize the shift amount type first, as the general
1040     // expansion for the shifted type will produce much worse code if it hasn't
1041     // been truncated already.
1042     Shifts.clampScalar(1, S32, S32);
1043     Shifts.clampScalar(0, S32, S64);
1044     Shifts.widenScalarToNextPow2(0, 32);
1045   }
1046   Shifts.scalarize(0);
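  // E.g. shifting an s64 value by an s64 amount is legalized by first
  // clamping the amount type to s32, which makes the {s64, s32} form legal;
  // without 16-bit instructions an s16 shift is also promoted to an s32
  // shift.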
1047 
1048   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1049     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1050     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1051     unsigned IdxTypeIdx = 2;
1052 
1053     getActionDefinitionsBuilder(Op)
1054       .customIf([=](const LegalityQuery &Query) {
1055           const LLT EltTy = Query.Types[EltTypeIdx];
1056           const LLT VecTy = Query.Types[VecTypeIdx];
1057           const LLT IdxTy = Query.Types[IdxTypeIdx];
1058           return (EltTy.getSizeInBits() == 16 ||
1059                   EltTy.getSizeInBits() % 32 == 0) &&
1060                  VecTy.getSizeInBits() % 32 == 0 &&
1061                  VecTy.getSizeInBits() <= 1024 &&
1062                  IdxTy.getSizeInBits() == 32;
1063         })
1064       .clampScalar(EltTypeIdx, S32, S64)
1065       .clampScalar(VecTypeIdx, S32, S64)
1066       .clampScalar(IdxTypeIdx, S32, S32);
1067   }
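  // For example, extracting an s32 element from <4 x s32> with an s32 index
  // matches the customIf predicate and is handled by
  // legalizeExtractVectorElt(); a 64-bit index is first clamped to s32.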
1068 
1069   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1070     .unsupportedIf([=](const LegalityQuery &Query) {
1071         const LLT &EltTy = Query.Types[1].getElementType();
1072         return Query.Types[0] != EltTy;
1073       });
1074 
1075   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1076     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1077     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1078 
1079     // FIXME: Doesn't handle extract of illegal sizes.
1080     getActionDefinitionsBuilder(Op)
1081       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1082       // FIXME: Multiples of 16 should not be legal.
1083       .legalIf([=](const LegalityQuery &Query) {
1084           const LLT BigTy = Query.Types[BigTyIdx];
1085           const LLT LitTy = Query.Types[LitTyIdx];
1086           return (BigTy.getSizeInBits() % 32 == 0) &&
1087                  (LitTy.getSizeInBits() % 16 == 0);
1088         })
1089       .widenScalarIf(
1090         [=](const LegalityQuery &Query) {
1091           const LLT BigTy = Query.Types[BigTyIdx];
1092           return (BigTy.getScalarSizeInBits() < 16);
1093         },
1094         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1095       .widenScalarIf(
1096         [=](const LegalityQuery &Query) {
1097           const LLT LitTy = Query.Types[LitTyIdx];
1098           return (LitTy.getScalarSizeInBits() < 16);
1099         },
1100         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1101       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1102       .widenScalarToNextPow2(BigTyIdx, 32);
1103 
1104   }
1105 
1106   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1107     .legalForCartesianProduct(AllS32Vectors, {S32})
1108     .legalForCartesianProduct(AllS64Vectors, {S64})
1109     .clampNumElements(0, V16S32, V32S32)
1110     .clampNumElements(0, V2S64, V16S64)
1111     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1112 
1113   if (ST.hasScalarPackInsts()) {
1114     BuildVector
1115       // FIXME: Should probably widen s1 vectors straight to s32
1116       .minScalarOrElt(0, S16)
1117       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1118       .minScalar(1, S32);
1119 
1120     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1121       .legalFor({V2S16, S32})
1122       .lower();
1123     BuildVector.minScalarOrElt(0, S32);
1124   } else {
1125     BuildVector.customFor({V2S16, S16});
1126     BuildVector.minScalarOrElt(0, S32);
1127 
1128     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1129       .customFor({V2S16, S32})
1130       .lower();
1131   }
1132 
1133   BuildVector.legalIf(isRegisterType(0));
1134 
1135   // FIXME: Clamp maximum size
1136   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1137     .legalIf(isRegisterType(0));
1138 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1141   if (ST.hasVOP3PInsts()) {
1142     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1143       .customFor({V2S16, V2S16})
1144       .lower();
1145   } else
1146     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1147 
1148   // Merge/Unmerge
1149   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1150     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1151     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1152 
1153     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1154       const LLT &Ty = Query.Types[TypeIdx];
1155       if (Ty.isVector()) {
1156         const LLT &EltTy = Ty.getElementType();
1157         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1158           return true;
1159         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1160           return true;
1161       }
1162       return false;
1163     };
1164 
1165     auto &Builder = getActionDefinitionsBuilder(Op)
1166       // Try to widen to s16 first for small types.
1167       // TODO: Only do this on targets with legal s16 shifts
1168       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1169 
1170       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1171       .lowerFor({{S16, V2S16}})
1172       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1173       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1174                            elementTypeIs(1, S16)),
1175                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1179       .clampScalar(LitTyIdx, S32, S256)
1180       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1181       // Break up vectors with weird elements into scalars
1182       .fewerElementsIf(
1183         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1184         scalarize(0))
1185       .fewerElementsIf(
1186         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1187         scalarize(1))
1188       .clampScalar(BigTyIdx, S32, S1024);
1189 
1190     if (Op == G_MERGE_VALUES) {
1191       Builder.widenScalarIf(
1192         // TODO: Use 16-bit shifts if legal for 8-bit values?
1193         [=](const LegalityQuery &Query) {
1194           const LLT Ty = Query.Types[LitTyIdx];
1195           return Ty.getSizeInBits() < 32;
1196         },
1197         changeTo(LitTyIdx, S32));
1198     }
1199 
1200     Builder.widenScalarIf(
1201       [=](const LegalityQuery &Query) {
1202         const LLT Ty = Query.Types[BigTyIdx];
1203         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1204           Ty.getSizeInBits() % 16 != 0;
1205       },
1206       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1209         const LLT &Ty = Query.Types[BigTyIdx];
1210         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1211         if (NewSizeInBits >= 256) {
1212           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1213           if (RoundedTo < NewSizeInBits)
1214             NewSizeInBits = RoundedTo;
1215         }
1216         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1217       })
1218       .legalIf([=](const LegalityQuery &Query) {
1219           const LLT &BigTy = Query.Types[BigTyIdx];
1220           const LLT &LitTy = Query.Types[LitTyIdx];
1221 
1222           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1223             return false;
1224           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1225             return false;
1226 
1227           return BigTy.getSizeInBits() % 16 == 0 &&
1228                  LitTy.getSizeInBits() % 16 == 0 &&
1229                  BigTy.getSizeInBits() <= 1024;
1230         })
1231       // Any vectors left are the wrong size. Scalarize them.
1232       .scalarize(0)
1233       .scalarize(1);
1234   }
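  // Worked example for the widening rule above: a big type of s260 is neither
  // a power of two nor a multiple of 16 bits, so it is widened to s320 (the
  // next multiple of 64) rather than all the way to s512, after which it
  // satisfies the legalIf test since 320 is a 16-bit multiple below 1024.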
1235 
1236   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1237   // RegBankSelect.
1238   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1239     .legalFor({{S32}, {S64}});
1240 
1241   if (ST.hasVOP3PInsts()) {
1242     SextInReg.lowerFor({{V2S16}})
1243       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1244       // get more vector shift opportunities, since we'll get those when
1245       // expanded.
1246       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1247   } else if (ST.has16BitInsts()) {
1248     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1249   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1252     SextInReg.lowerFor({{S32}, {S64}});
1253   }
1254 
1255   SextInReg
1256     .scalarize(0)
1257     .clampScalar(0, S32, S64)
1258     .lower();
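  // With VOP3P, a <4 x s16> G_SEXT_INREG is first split into <2 x s16> pieces
  // and each piece is then lowered; the generic lowering expands sext_inreg
  // into a shift-left / arithmetic-shift-right pair.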
1259 
1260   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1261     .legalFor({S64});
1262 
1263   getActionDefinitionsBuilder({
1264       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1265       G_FCOPYSIGN,
1266 
1267       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1268       G_READ_REGISTER,
1269       G_WRITE_REGISTER,
1270 
1271       G_SADDO, G_SSUBO,
1272 
1273        // TODO: Implement
1274       G_FMINIMUM, G_FMAXIMUM
1275     }).lower();
1276 
1277   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1278         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1279         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1280     .unsupported();
1281 
1282   computeTables();
1283   verify(*ST.getInstrInfo());
1284 }
1285 
1286 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1287                                          MachineRegisterInfo &MRI,
1288                                          MachineIRBuilder &B,
1289                                          GISelChangeObserver &Observer) const {
1290   switch (MI.getOpcode()) {
1291   case TargetOpcode::G_ADDRSPACE_CAST:
1292     return legalizeAddrSpaceCast(MI, MRI, B);
1293   case TargetOpcode::G_FRINT:
1294     return legalizeFrint(MI, MRI, B);
1295   case TargetOpcode::G_FCEIL:
1296     return legalizeFceil(MI, MRI, B);
1297   case TargetOpcode::G_INTRINSIC_TRUNC:
1298     return legalizeIntrinsicTrunc(MI, MRI, B);
1299   case TargetOpcode::G_SITOFP:
1300     return legalizeITOFP(MI, MRI, B, true);
1301   case TargetOpcode::G_UITOFP:
1302     return legalizeITOFP(MI, MRI, B, false);
1303   case TargetOpcode::G_FPTOSI:
1304     return legalizeFPTOI(MI, MRI, B, true);
1305   case TargetOpcode::G_FPTOUI:
1306     return legalizeFPTOI(MI, MRI, B, false);
1307   case TargetOpcode::G_FMINNUM:
1308   case TargetOpcode::G_FMAXNUM:
1309   case TargetOpcode::G_FMINNUM_IEEE:
1310   case TargetOpcode::G_FMAXNUM_IEEE:
1311     return legalizeMinNumMaxNum(MI, MRI, B);
1312   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1313     return legalizeExtractVectorElt(MI, MRI, B);
1314   case TargetOpcode::G_INSERT_VECTOR_ELT:
1315     return legalizeInsertVectorElt(MI, MRI, B);
1316   case TargetOpcode::G_SHUFFLE_VECTOR:
1317     return legalizeShuffleVector(MI, MRI, B);
1318   case TargetOpcode::G_FSIN:
1319   case TargetOpcode::G_FCOS:
1320     return legalizeSinCos(MI, MRI, B);
1321   case TargetOpcode::G_GLOBAL_VALUE:
1322     return legalizeGlobalValue(MI, MRI, B);
1323   case TargetOpcode::G_LOAD:
1324     return legalizeLoad(MI, MRI, B, Observer);
1325   case TargetOpcode::G_FMAD:
1326     return legalizeFMad(MI, MRI, B);
1327   case TargetOpcode::G_FDIV:
1328     return legalizeFDIV(MI, MRI, B);
1329   case TargetOpcode::G_ATOMIC_CMPXCHG:
1330     return legalizeAtomicCmpXChg(MI, MRI, B);
1331   case TargetOpcode::G_FLOG:
1332     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1333   case TargetOpcode::G_FLOG10:
1334     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1335   case TargetOpcode::G_FEXP:
1336     return legalizeFExp(MI, B);
1337   case TargetOpcode::G_FFLOOR:
1338     return legalizeFFloor(MI, MRI, B);
1339   case TargetOpcode::G_BUILD_VECTOR:
1340     return legalizeBuildVector(MI, MRI, B);
1341   default:
1342     return false;
1343   }
1344 
1345   llvm_unreachable("expected switch to return");
1346 }
1347 
1348 Register AMDGPULegalizerInfo::getSegmentAperture(
1349   unsigned AS,
1350   MachineRegisterInfo &MRI,
1351   MachineIRBuilder &B) const {
1352   MachineFunction &MF = B.getMF();
1353   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1354   const LLT S32 = LLT::scalar(32);
1355 
1356   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1357 
1358   if (ST.hasApertureRegs()) {
1359     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1360     // getreg.
1361     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1362         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1363         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1364     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1365         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1366         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1367     unsigned Encoding =
1368         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1369         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1370         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1371 
1372     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1373 
1374     B.buildInstr(AMDGPU::S_GETREG_B32)
1375       .addDef(GetReg)
1376       .addImm(Encoding);
1377     MRI.setType(GetReg, S32);
1378 
1379     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1380     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1381   }
1382 
1383   Register QueuePtr = MRI.createGenericVirtualRegister(
1384     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1385 
1386   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1387   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1388     return Register();
1389 
1390   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1391   // private_segment_aperture_base_hi.
1392   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1393 
1394   // TODO: can we be smarter about machine pointer info?
1395   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1396   MachineMemOperand *MMO = MF.getMachineMemOperand(
1397     PtrInfo,
1398     MachineMemOperand::MOLoad |
1399     MachineMemOperand::MODereferenceable |
1400     MachineMemOperand::MOInvariant,
1401     4,
1402     MinAlign(64, StructOffset));
1403 
1404   Register LoadAddr;
1405 
1406   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1407   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1408 }
1409 
1410 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1411   MachineInstr &MI, MachineRegisterInfo &MRI,
1412   MachineIRBuilder &B) const {
1413   MachineFunction &MF = B.getMF();
1414 
1415   B.setInstr(MI);
1416 
1417   const LLT S32 = LLT::scalar(32);
1418   Register Dst = MI.getOperand(0).getReg();
1419   Register Src = MI.getOperand(1).getReg();
1420 
1421   LLT DstTy = MRI.getType(Dst);
1422   LLT SrcTy = MRI.getType(Src);
1423   unsigned DestAS = DstTy.getAddressSpace();
1424   unsigned SrcAS = SrcTy.getAddressSpace();
1425 
1426   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1427   // vector element.
1428   assert(!DstTy.isVector());
1429 
1430   const AMDGPUTargetMachine &TM
1431     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1432 
1433   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1434   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1435     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1436     return true;
1437   }
1438 
1439   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1440     // Truncate.
1441     B.buildExtract(Dst, Src, 0);
1442     MI.eraseFromParent();
1443     return true;
1444   }
1445 
1446   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1447     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1448     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1449 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1453     auto HighAddr = B.buildConstant(
1454       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1455     B.buildMerge(Dst, {Src, HighAddr});
1456     MI.eraseFromParent();
1457     return true;
1458   }
1459 
1460   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1461     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1462            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
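    // In pseudocode, the flat -> local/private cast built below is:
    //   dst = (src != flat_null) ? lo32(src) : segment_null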
1463     unsigned NullVal = TM.getNullPointerValue(DestAS);
1464 
1465     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1466     auto FlatNull = B.buildConstant(SrcTy, 0);
1467 
1468     // Extract low 32-bits of the pointer.
1469     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1470 
1471     auto CmpRes =
1472         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1473     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1474 
1475     MI.eraseFromParent();
1476     return true;
1477   }
1478 
1479   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1480     return false;
1481 
1482   if (!ST.hasFlatAddressSpace())
1483     return false;
1484 
1485   auto SegmentNull =
1486       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1487   auto FlatNull =
1488       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
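  // In pseudocode, the local/private -> flat cast built below is:
  //   dst = (src != segment_null) ? {lo: src, hi: aperture} : flat_null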
1489 
1490   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1491   if (!ApertureReg.isValid())
1492     return false;
1493 
1494   auto CmpRes =
1495       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1496 
1497   // Coerce the type of the low half of the result so we can use merge_values.
1498   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1499 
1500   // TODO: Should we allow mismatched types but matching sizes in merges to
1501   // avoid the ptrtoint?
1502   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1503   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1504 
1505   MI.eraseFromParent();
1506   return true;
1507 }
1508 
1509 bool AMDGPULegalizerInfo::legalizeFrint(
1510   MachineInstr &MI, MachineRegisterInfo &MRI,
1511   MachineIRBuilder &B) const {
1512   B.setInstr(MI);
1513 
1514   Register Src = MI.getOperand(1).getReg();
1515   LLT Ty = MRI.getType(Src);
1516   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1517 
1518   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1519   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
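  // C1 is 2^52: adding and then subtracting it (with src's sign copied onto
  // it) forces the fractional bits to be rounded away in the current rounding
  // mode. C2 is the largest magnitude below 2^52; anything larger is already
  // an integer, so the select below returns the source unchanged in that case.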
1520 
1521   auto C1 = B.buildFConstant(Ty, C1Val);
1522   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1523 
1524   // TODO: Should this propagate fast-math-flags?
1525   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1526   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1527 
1528   auto C2 = B.buildFConstant(Ty, C2Val);
1529   auto Fabs = B.buildFAbs(Ty, Src);
1530 
1531   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1532   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1533   return true;
1534 }
1535 
1536 bool AMDGPULegalizerInfo::legalizeFceil(
1537   MachineInstr &MI, MachineRegisterInfo &MRI,
1538   MachineIRBuilder &B) const {
1539   B.setInstr(MI);
1540 
1541   const LLT S1 = LLT::scalar(1);
1542   const LLT S64 = LLT::scalar(64);
1543 
1544   Register Src = MI.getOperand(1).getReg();
1545   assert(MRI.getType(Src) == S64);
1546 
1547   // result = trunc(src)
1548   // if (src > 0.0 && src != result)
1549   //   result += 1.0
1550 
1551   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1552 
1553   const auto Zero = B.buildFConstant(S64, 0.0);
1554   const auto One = B.buildFConstant(S64, 1.0);
1555   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1556   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1557   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1558   auto Add = B.buildSelect(S64, And, One, Zero);
1559 
1560   // TODO: Should this propagate fast-math-flags?
1561   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1562   return true;
1563 }
1564 
1565 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1566                                               MachineIRBuilder &B) {
1567   const unsigned FractBits = 52;
1568   const unsigned ExpBits = 11;
1569   LLT S32 = LLT::scalar(32);
1570 
1571   auto Const0 = B.buildConstant(S32, FractBits - 32);
1572   auto Const1 = B.buildConstant(S32, ExpBits);
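  // The biased exponent is the 11-bit field starting at bit 52 of the f64,
  // i.e. bit 20 (FractBits - 32) of the high word:
  //   exp = ubfe(hi, 20, 11) - 1023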
1573 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1577 
1578   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1579 }
1580 
1581 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1582   MachineInstr &MI, MachineRegisterInfo &MRI,
1583   MachineIRBuilder &B) const {
1584   B.setInstr(MI);
1585 
1586   const LLT S1 = LLT::scalar(1);
1587   const LLT S32 = LLT::scalar(32);
1588   const LLT S64 = LLT::scalar(64);
1589 
1590   Register Src = MI.getOperand(1).getReg();
1591   assert(MRI.getType(Src) == S64);
1592 
1593   // TODO: Should this use extract since the low half is unused?
1594   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1595   Register Hi = Unmerge.getReg(1);
1596 
1597   // Extract the upper half, since this is where we will find the sign and
1598   // exponent.
1599   auto Exp = extractF64Exponent(Hi, B);
1600 
1601   const unsigned FractBits = 52;
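  // The truncation is done with integer ops on the bit pattern: clear the
  // fraction bits that lie below the exponent. Exp < 0 keeps only the sign
  // (+/-0.0), and Exp > 51 means there are no fractional bits, so the source
  // is returned unchanged.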
1602 
1603   // Extract the sign bit.
1604   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1605   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1606 
1607   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1608 
1609   const auto Zero32 = B.buildConstant(S32, 0);
1610 
1611   // Extend back to 64-bits.
1612   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1613 
1614   auto Shr = B.buildAShr(S64, FractMask, Exp);
1615   auto Not = B.buildNot(S64, Shr);
1616   auto Tmp0 = B.buildAnd(S64, Src, Not);
1617   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1618 
1619   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1620   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1621 
1622   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1623   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1624   return true;
1625 }
1626 
1627 bool AMDGPULegalizerInfo::legalizeITOFP(
1628   MachineInstr &MI, MachineRegisterInfo &MRI,
1629   MachineIRBuilder &B, bool Signed) const {
1630   B.setInstr(MI);
1631 
1632   Register Dst = MI.getOperand(0).getReg();
1633   Register Src = MI.getOperand(1).getReg();
1634 
1635   const LLT S64 = LLT::scalar(64);
1636   const LLT S32 = LLT::scalar(32);
1637 
1638   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
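  // Expand the 64-bit integer to f64 conversion by converting the two 32-bit
  // halves separately:
  //   result = ldexp((s|u)itofp(hi), 32) + uitofp(lo)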
1639 
1640   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1641 
1642   auto CvtHi = Signed ?
1643     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1644     B.buildUITOFP(S64, Unmerge.getReg(1));
1645 
1646   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1647 
1648   auto ThirtyTwo = B.buildConstant(S32, 32);
1649   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1650     .addUse(CvtHi.getReg(0))
1651     .addUse(ThirtyTwo.getReg(0));
1652 
1653   // TODO: Should this propagate fast-math-flags?
1654   B.buildFAdd(Dst, LdExp, CvtLo);
1655   MI.eraseFromParent();
1656   return true;
1657 }
1658 
1659 // TODO: Copied from DAG implementation. Verify logic and document how this
1660 // actually works.
1661 bool AMDGPULegalizerInfo::legalizeFPTOI(
1662   MachineInstr &MI, MachineRegisterInfo &MRI,
1663   MachineIRBuilder &B, bool Signed) const {
1664   B.setInstr(MI);
1665 
1666   Register Dst = MI.getOperand(0).getReg();
1667   Register Src = MI.getOperand(1).getReg();
1668 
1669   const LLT S64 = LLT::scalar(64);
1670   const LLT S32 = LLT::scalar(32);
1671 
1672   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1673 
1674   unsigned Flags = MI.getFlags();
1675 
1676   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1677   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1678   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
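  // K0 = 2^-32 and K1 = -(2^32). The high word is fptoi(floor(trunc(x) *
  // 2^-32)); the fma below reconstructs the remainder trunc(x) - hi * 2^32,
  // which becomes the low word via fptoui.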
1679 
1680   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1681   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1682   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1683 
1684   auto Hi = Signed ?
1685     B.buildFPTOSI(S32, FloorMul) :
1686     B.buildFPTOUI(S32, FloorMul);
1687   auto Lo = B.buildFPTOUI(S32, Fma);
1688 
1689   B.buildMerge(Dst, { Lo, Hi });
1690   MI.eraseFromParent();
1691 
1692   return true;
1693 }
1694 
1695 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1696   MachineInstr &MI, MachineRegisterInfo &MRI,
1697   MachineIRBuilder &B) const {
1698   MachineFunction &MF = B.getMF();
1699   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1700 
1701   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1702                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1703 
1704   // With ieee_mode disabled, the instructions have the correct behavior
1705   // already for G_FMINNUM/G_FMAXNUM
1706   if (!MFI->getMode().IEEE)
1707     return !IsIEEEOp;
1708 
1709   if (IsIEEEOp)
1710     return true;
1711 
1712   MachineIRBuilder HelperBuilder(MI);
1713   GISelObserverWrapper DummyObserver;
1714   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1715   HelperBuilder.setInstr(MI);
1716   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1717 }
1718 
1719 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1720   MachineInstr &MI, MachineRegisterInfo &MRI,
1721   MachineIRBuilder &B) const {
1722   // TODO: Should move some of this into LegalizerHelper.
1723 
1724   // TODO: Promote dynamic indexing of s16 to s32
1725 
1726   // FIXME: Artifact combiner probably should have replaced the truncated
1727   // constant before this, so we shouldn't need
1728   // getConstantVRegValWithLookThrough.
1729   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1730     MI.getOperand(2).getReg(), MRI);
1731   if (!IdxVal) // Dynamic case will be selected to register indexing.
1732     return true;
1733 
1734   Register Dst = MI.getOperand(0).getReg();
1735   Register Vec = MI.getOperand(1).getReg();
1736 
1737   LLT VecTy = MRI.getType(Vec);
1738   LLT EltTy = VecTy.getElementType();
1739   assert(EltTy == MRI.getType(Dst));
1740 
1741   B.setInstr(MI);
1742 
1743   if (IdxVal->Value < VecTy.getNumElements())
1744     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1745   else
1746     B.buildUndef(Dst);
1747 
1748   MI.eraseFromParent();
1749   return true;
1750 }
1751 
1752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1753   MachineInstr &MI, MachineRegisterInfo &MRI,
1754   MachineIRBuilder &B) const {
1755   // TODO: Should move some of this into LegalizerHelper.
1756 
1757   // TODO: Promote dynamic indexing of s16 to s32
1758 
1759   // FIXME: Artifact combiner probably should have replaced the truncated
1760   // constant before this, so we shouldn't need
1761   // getConstantVRegValWithLookThrough.
1762   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1763     MI.getOperand(3).getReg(), MRI);
1764   if (!IdxVal) // Dynamic case will be selected to register indexing.
1765     return true;
1766 
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Vec = MI.getOperand(1).getReg();
1769   Register Ins = MI.getOperand(2).getReg();
1770 
1771   LLT VecTy = MRI.getType(Vec);
1772   LLT EltTy = VecTy.getElementType();
1773   assert(EltTy == MRI.getType(Ins));
1774 
1775   B.setInstr(MI);
1776 
1777   if (IdxVal->Value < VecTy.getNumElements())
1778     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1779   else
1780     B.buildUndef(Dst);
1781 
1782   MI.eraseFromParent();
1783   return true;
1784 }
1785 
1786 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1787   assert(Mask.size() == 2);
1788 
1789   // If one half is undef, the other is trivially in the same reg.
1790   if (Mask[0] == -1 || Mask[1] == -1)
1791     return true;
1792   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1793          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1794 }
1795 
1796 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1797   MachineInstr &MI, MachineRegisterInfo &MRI,
1798   MachineIRBuilder &B) const {
1799   const LLT V2S16 = LLT::vector(2, 16);
1800 
1801   Register Dst = MI.getOperand(0).getReg();
1802   Register Src0 = MI.getOperand(1).getReg();
1803   LLT DstTy = MRI.getType(Dst);
1804   LLT SrcTy = MRI.getType(Src0);
1805 
1806   if (SrcTy == V2S16 && DstTy == V2S16 &&
1807       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1808     return true;
1809 
1810   MachineIRBuilder HelperBuilder(MI);
1811   GISelObserverWrapper DummyObserver;
1812   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1813   HelperBuilder.setInstr(MI);
1814   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1815 }
1816 
1817 bool AMDGPULegalizerInfo::legalizeSinCos(
1818   MachineInstr &MI, MachineRegisterInfo &MRI,
1819   MachineIRBuilder &B) const {
1820   B.setInstr(MI);
1821 
1822   Register DstReg = MI.getOperand(0).getReg();
1823   Register SrcReg = MI.getOperand(1).getReg();
1824   LLT Ty = MRI.getType(DstReg);
1825   unsigned Flags = MI.getFlags();
1826 
1827   Register TrigVal;
1828   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
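  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi).
  // Subtargets with a reduced trig input range additionally need the scaled
  // value wrapped into [0, 1) with fract first.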
1829   if (ST.hasTrigReducedRange()) {
1830     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1831     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1832       .addUse(MulVal.getReg(0))
1833       .setMIFlags(Flags).getReg(0);
1834   } else
1835     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1836 
1837   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1838     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1839   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1840     .addUse(TrigVal)
1841     .setMIFlags(Flags);
1842   MI.eraseFromParent();
1843   return true;
1844 }
1845 
1846 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1847   Register DstReg, LLT PtrTy,
1848   MachineIRBuilder &B, const GlobalValue *GV,
1849   unsigned Offset, unsigned GAFlags) const {
1850   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1851   // to the following code sequence:
1852   //
1853   // For constant address space:
1854   //   s_getpc_b64 s[0:1]
1855   //   s_add_u32 s0, s0, $symbol
1856   //   s_addc_u32 s1, s1, 0
1857   //
1858   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1859   //   a fixup or relocation is emitted to replace $symbol with a literal
1860   //   constant, which is a pc-relative offset from the encoding of the $symbol
1861   //   operand to the global variable.
1862   //
1863   // For global address space:
1864   //   s_getpc_b64 s[0:1]
1865   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1866   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1867   //
1868   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1869   //   fixups or relocations are emitted to replace $symbol@*@lo and
1870   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1871   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1872   //   operand to the global variable.
1873   //
1874   // What we want here is an offset from the value returned by s_getpc
1875   // (which is the address of the s_add_u32 instruction) to the global
1876   // variable, but since the encoding of $symbol starts 4 bytes after the start
1877   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1878   // small. This requires us to add 4 to the global variable offset in order to
1879   // compute the correct address.
1880 
1881   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1882 
1883   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1884     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1885 
1886   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1887     .addDef(PCReg);
1888 
1889   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1890   if (GAFlags == SIInstrInfo::MO_NONE)
1891     MIB.addImm(0);
1892   else
1893     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1894 
1895   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1896 
1897   if (PtrTy.getSizeInBits() == 32)
1898     B.buildExtract(DstReg, PCReg, 0);
1899   return true;
1900  }
1901 
1902 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1903   MachineInstr &MI, MachineRegisterInfo &MRI,
1904   MachineIRBuilder &B) const {
1905   Register DstReg = MI.getOperand(0).getReg();
1906   LLT Ty = MRI.getType(DstReg);
1907   unsigned AS = Ty.getAddressSpace();
1908 
1909   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1910   MachineFunction &MF = B.getMF();
1911   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1912   B.setInstr(MI);
1913 
1914   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1915     if (!MFI->isEntryFunction()) {
1916       const Function &Fn = MF.getFunction();
1917       DiagnosticInfoUnsupported BadLDSDecl(
1918         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1919       Fn.getContext().diagnose(BadLDSDecl);
1920     }
1921 
1922     // TODO: We could emit code to handle the initialization somewhere.
1923     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1924       const SITargetLowering *TLI = ST.getTargetLowering();
1925       if (!TLI->shouldUseLDSConstAddress(GV)) {
1926         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1928       }
1929 
1930       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1931       MI.eraseFromParent();
1932       return true;
1933     }
1934 
1935     const Function &Fn = MF.getFunction();
1936     DiagnosticInfoUnsupported BadInit(
1937       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1938     Fn.getContext().diagnose(BadInit);
1939     return true;
1940   }
1941 
1942   const SITargetLowering *TLI = ST.getTargetLowering();
1943 
1944   if (TLI->shouldEmitFixup(GV)) {
1945     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1946     MI.eraseFromParent();
1947     return true;
1948   }
1949 
1950   if (TLI->shouldEmitPCReloc(GV)) {
1951     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1952     MI.eraseFromParent();
1953     return true;
1954   }
1955 
1956   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1957   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1958 
1959   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1960     MachinePointerInfo::getGOT(MF),
1961     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1962     MachineMemOperand::MOInvariant,
1963     8 /*Size*/, 8 /*Align*/);
1964 
1965   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1966 
1967   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1969     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1970     B.buildExtract(DstReg, Load, 0);
1971   } else
1972     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1973 
1974   MI.eraseFromParent();
1975   return true;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeLoad(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1981   B.setInstr(MI);
1982   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1983   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1984   Observer.changingInstr(MI);
1985   MI.getOperand(1).setReg(Cast.getReg(0));
1986   Observer.changedInstr(MI);
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeFMad(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B) const {
1993   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1994   assert(Ty.isScalar());
1995 
1996   MachineFunction &MF = B.getMF();
1997   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1998 
1999   // TODO: Always legal with future ftz flag.
2000   // FIXME: Do we need just output?
2001   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2002     return true;
2003   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2004     return true;
2005 
2006   MachineIRBuilder HelperBuilder(MI);
2007   GISelObserverWrapper DummyObserver;
2008   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2009   HelperBuilder.setMBB(*MI.getParent());
2010   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2014   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2015   Register DstReg = MI.getOperand(0).getReg();
2016   Register PtrReg = MI.getOperand(1).getReg();
2017   Register CmpVal = MI.getOperand(2).getReg();
2018   Register NewVal = MI.getOperand(3).getReg();
2019 
2020   assert(SITargetLowering::isFlatGlobalAddrSpace(
2021            MRI.getType(PtrReg).getAddressSpace()) &&
2022          "this should not have been custom lowered");
2023 
2024   LLT ValTy = MRI.getType(CmpVal);
2025   LLT VecTy = LLT::vector(2, ValTy);
2026 
2027   B.setInstr(MI);
2028   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2029 
2030   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2031     .addDef(DstReg)
2032     .addUse(PtrReg)
2033     .addUse(PackedVal)
2034     .setMemRefs(MI.memoperands());
2035 
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeFlog(
2041   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
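  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass 1/log2(e) = ln(2)
  // for G_FLOG and ln(2)/ln(10) = 1/log2(10) for G_FLOG10.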
2042   Register Dst = MI.getOperand(0).getReg();
2043   Register Src = MI.getOperand(1).getReg();
2044   LLT Ty = B.getMRI()->getType(Dst);
2045   unsigned Flags = MI.getFlags();
2046   B.setInstr(MI);
2047 
2048   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2049   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2050 
2051   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2052   MI.eraseFromParent();
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2057                                        MachineIRBuilder &B) const {
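  // exp(x) = exp2(x * log2(e)), so scale the input by log2(e) and emit
  // G_FEXP2.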
2058   Register Dst = MI.getOperand(0).getReg();
2059   Register Src = MI.getOperand(1).getReg();
2060   unsigned Flags = MI.getFlags();
2061   LLT Ty = B.getMRI()->getType(Dst);
2062   B.setInstr(MI);
2063 
2064   auto K = B.buildFConstant(Ty, numbers::log2e);
2065   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2066   B.buildFExp2(Dst, Mul, Flags);
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
2071 // Find a source register, ignoring any possible source modifiers.
2072 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2073   Register ModSrc = OrigSrc;
2074   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2075     ModSrc = SrcFNeg->getOperand(1).getReg();
2076     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2077       ModSrc = SrcFAbs->getOperand(1).getReg();
2078   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2079     ModSrc = SrcFAbs->getOperand(1).getReg();
2080   return ModSrc;
2081 }
2082 
2083 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2084                                          MachineRegisterInfo &MRI,
2085                                          MachineIRBuilder &B) const {
2086   B.setInstr(MI);
2087 
2088   const LLT S1 = LLT::scalar(1);
2089   const LLT S64 = LLT::scalar(64);
2090   Register Dst = MI.getOperand(0).getReg();
2091   Register OrigSrc = MI.getOperand(1).getReg();
2092   unsigned Flags = MI.getFlags();
2093   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2094          "this should not have been custom lowered");
2095 
2096   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2097   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2098   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2099   // V_FRACT bug is:
2100   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2101   //
2102   // Convert floor(x) to (x - fract(x))
2103 
2104   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2105     .addUse(OrigSrc)
2106     .setMIFlags(Flags);
2107 
2108   // Give source modifier matching some assistance before obscuring a foldable
2109   // pattern.
2110 
2111   // TODO: We can avoid the neg on the fract? The input sign to fract
2112   // shouldn't matter?
2113   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2114 
2115   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2116 
2117   Register Min = MRI.createGenericVirtualRegister(S64);
2118 
2119   // We don't need to concern ourselves with the snan handling difference, so
2120   // use the one which will directly select.
2121   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2122   if (MFI->getMode().IEEE)
2123     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2124   else
2125     B.buildFMinNum(Min, Fract, Const, Flags);
2126 
2127   Register CorrectedFract = Min;
2128   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2129     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2130     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2131   }
2132 
2133   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2134   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2135 
2136   MI.eraseFromParent();
2137   return true;
2138 }
2139 
2140 // Turn an illegal packed v2s16 build vector into bit operations.
2141 // TODO: This should probably be a bitcast action in LegalizerHelper.
2142 bool AMDGPULegalizerInfo::legalizeBuildVector(
2143   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2144   Register Dst = MI.getOperand(0).getReg();
2145   LLT DstTy = MRI.getType(Dst);
2146   const LLT S32 = LLT::scalar(32);
2147   const LLT V2S16 = LLT::vector(2, 16);
2148   (void)DstTy;
2149   (void)V2S16;
2150   assert(DstTy == V2S16);
2151 
2152   Register Src0 = MI.getOperand(1).getReg();
2153   Register Src1 = MI.getOperand(2).getReg();
2154   assert(MRI.getType(Src0) == LLT::scalar(16));
2155 
2156   B.setInstr(MI);
2157   auto Merge = B.buildMerge(S32, {Src0, Src1});
2158   B.buildBitcast(Dst, Merge);
2159 
2160   MI.eraseFromParent();
2161   return true;
2162 }
2163 
2164 // Return the use branch instruction, otherwise null if the usage is invalid.
2165 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2166                                        MachineRegisterInfo &MRI,
2167                                        MachineInstr *&Br) {
2168   Register CondDef = MI.getOperand(0).getReg();
2169   if (!MRI.hasOneNonDBGUse(CondDef))
2170     return nullptr;
2171 
2172   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2173   if (UseMI.getParent() != MI.getParent() ||
2174       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2175     return nullptr;
2176 
2177   // Make sure the cond br is followed by a G_BR
2178   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2179   if (Next != MI.getParent()->end()) {
2180     if (Next->getOpcode() != AMDGPU::G_BR)
2181       return nullptr;
2182     Br = &*Next;
2183   }
2184 
2185   return &UseMI;
2186 }
2187 
2188 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2189                                                 Register Reg, LLT Ty) const {
2190   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2191   if (LiveIn)
2192     return LiveIn;
2193 
2194   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2195   MRI.addLiveIn(Reg, NewReg);
2196   return NewReg;
2197 }
2198 
2199 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2200                                          const ArgDescriptor *Arg) const {
2201   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2202     return false; // TODO: Handle these
2203 
2204   assert(Arg->getRegister().isPhysical());
2205 
2206   MachineRegisterInfo &MRI = *B.getMRI();
2207 
2208   LLT Ty = MRI.getType(DstReg);
2209   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2210 
2211   if (Arg->isMasked()) {
2212     // TODO: Should we try to emit this once in the entry block?
2213     const LLT S32 = LLT::scalar(32);
2214     const unsigned Mask = Arg->getMask();
2215     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
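    // Masked arguments share a physical register with other values (e.g. the
    // packed workitem IDs); shift the field down and mask off the rest.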
2216 
2217     Register AndMaskSrc = LiveIn;
2218 
2219     if (Shift != 0) {
2220       auto ShiftAmt = B.buildConstant(S32, Shift);
2221       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2222     }
2223 
2224     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2225   } else
2226     B.buildCopy(DstReg, LiveIn);
2227 
  // Insert the argument copy if it doesn't already exist.
2229   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2230   if (!MRI.getVRegDef(LiveIn)) {
2231     // FIXME: Should have scoped insert pt
2232     MachineBasicBlock &OrigInsBB = B.getMBB();
2233     auto OrigInsPt = B.getInsertPt();
2234 
2235     MachineBasicBlock &EntryMBB = B.getMF().front();
2236     EntryMBB.addLiveIn(Arg->getRegister());
2237     B.setInsertPt(EntryMBB, EntryMBB.begin());
2238     B.buildCopy(LiveIn, Arg->getRegister());
2239 
2240     B.setInsertPt(OrigInsBB, OrigInsPt);
2241   }
2242 
2243   return true;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2247   MachineInstr &MI,
2248   MachineRegisterInfo &MRI,
2249   MachineIRBuilder &B,
2250   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2251   B.setInstr(MI);
2252 
2253   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2254 
2255   const ArgDescriptor *Arg;
2256   const TargetRegisterClass *RC;
2257   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2258   if (!Arg) {
2259     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2260     return false;
2261   }
2262 
2263   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2264     MI.eraseFromParent();
2265     return true;
2266   }
2267 
2268   return false;
2269 }
2270 
2271 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2272                                        MachineRegisterInfo &MRI,
2273                                        MachineIRBuilder &B) const {
2274   B.setInstr(MI);
2275   Register Dst = MI.getOperand(0).getReg();
2276   LLT DstTy = MRI.getType(Dst);
2277   LLT S16 = LLT::scalar(16);
2278   LLT S32 = LLT::scalar(32);
2279   LLT S64 = LLT::scalar(64);
2280 
2281   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2282     return true;
2283 
2284   if (DstTy == S16)
2285     return legalizeFDIV16(MI, MRI, B);
2286   if (DstTy == S32)
2287     return legalizeFDIV32(MI, MRI, B);
2288   if (DstTy == S64)
2289     return legalizeFDIV64(MI, MRI, B);
2290 
2291   return false;
2292 }
2293 
2294 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2295                                                  MachineRegisterInfo &MRI,
2296                                                  MachineIRBuilder &B) const {
2297   Register Res = MI.getOperand(0).getReg();
2298   Register LHS = MI.getOperand(1).getReg();
2299   Register RHS = MI.getOperand(2).getReg();
2300 
2301   uint16_t Flags = MI.getFlags();
2302 
2303   LLT ResTy = MRI.getType(Res);
2304   LLT S32 = LLT::scalar(32);
2305   LLT S64 = LLT::scalar(64);
2306 
2307   const MachineFunction &MF = B.getMF();
2308   bool Unsafe =
2309     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2310 
2311   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2312     return false;
2313 
2314   if (!Unsafe && ResTy == S32 &&
2315       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2316     return false;
2317 
2318   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2319     // 1 / x -> RCP(x)
2320     if (CLHS->isExactlyValue(1.0)) {
2321       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2322         .addUse(RHS)
2323         .setMIFlags(Flags);
2324 
2325       MI.eraseFromParent();
2326       return true;
2327     }
2328 
2329     // -1 / x -> RCP( FNEG(x) )
2330     if (CLHS->isExactlyValue(-1.0)) {
2331       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2332       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2333         .addUse(FNeg.getReg(0))
2334         .setMIFlags(Flags);
2335 
2336       MI.eraseFromParent();
2337       return true;
2338     }
2339   }
2340 
2341   // x / y -> x * (1.0 / y)
2342   if (Unsafe) {
2343     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2344       .addUse(RHS)
2345       .setMIFlags(Flags);
2346     B.buildFMul(Res, LHS, RCP, Flags);
2347 
2348     MI.eraseFromParent();
2349     return true;
2350   }
2351 
2352   return false;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2356                                          MachineRegisterInfo &MRI,
2357                                          MachineIRBuilder &B) const {
2358   B.setInstr(MI);
2359   Register Res = MI.getOperand(0).getReg();
2360   Register LHS = MI.getOperand(1).getReg();
2361   Register RHS = MI.getOperand(2).getReg();
2362 
2363   uint16_t Flags = MI.getFlags();
2364 
2365   LLT S16 = LLT::scalar(16);
2366   LLT S32 = LLT::scalar(32);
2367 
2368   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2369   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2370 
2371   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2372     .addUse(RHSExt.getReg(0))
2373     .setMIFlags(Flags);
2374 
2375   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2376   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2377 
2378   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2379     .addUse(RDst.getReg(0))
2380     .addUse(RHS)
2381     .addUse(LHS)
2382     .setMIFlags(Flags);
2383 
2384   MI.eraseFromParent();
2385   return true;
2386 }
2387 
2388 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2389 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2390 static void toggleSPDenormMode(bool Enable,
2391                                MachineIRBuilder &B,
2392                                const GCNSubtarget &ST,
2393                                AMDGPU::SIModeRegisterDefaults Mode) {
2394   // Set SP denorm mode to this value.
2395   unsigned SPDenormMode =
2396     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2397 
2398   if (ST.hasDenormModeInst()) {
2399     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2400     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2401 
2402     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2403     B.buildInstr(AMDGPU::S_DENORM_MODE)
2404       .addImm(NewDenormModeValue);
2405 
2406   } else {
2407     // Select FP32 bit field in mode register.
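    // The SP denorm controls are assumed to be the 2-bit field at offset 4 of
    // the MODE register, hence offset 4 and width 2 (WIDTH_M1 = 1) below.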
2408     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2409                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2410                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2411 
2412     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2413       .addImm(SPDenormMode)
2414       .addImm(SPDenormModeBitField);
2415   }
2416 }
2417 
2418 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2419                                          MachineRegisterInfo &MRI,
2420                                          MachineIRBuilder &B) const {
2421   B.setInstr(MI);
2422   Register Res = MI.getOperand(0).getReg();
2423   Register LHS = MI.getOperand(1).getReg();
2424   Register RHS = MI.getOperand(2).getReg();
2425   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2426   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2427 
2428   uint16_t Flags = MI.getFlags();
2429 
2430   LLT S32 = LLT::scalar(32);
2431   LLT S1 = LLT::scalar(1);
2432 
2433   auto One = B.buildFConstant(S32, 1.0f);
2434 
2435   auto DenominatorScaled =
2436     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2437       .addUse(RHS)
2438       .addUse(LHS)
2439       .addImm(1)
2440       .setMIFlags(Flags);
2441   auto NumeratorScaled =
2442     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2443       .addUse(LHS)
2444       .addUse(RHS)
2445       .addImm(0)
2446       .setMIFlags(Flags);
2447 
2448   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2449     .addUse(DenominatorScaled.getReg(0))
2450     .setMIFlags(Flags);
2451   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
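  // Newton-Raphson style refinement of the scaled operands
  // (d = DenominatorScaled, n = NumeratorScaled, r0 = ApproxRcp):
  //   e0 = 1 - d * r0      (Fma0)
  //   r1 = r0 + r0 * e0    (Fma1)
  //   q0 = n * r1          (Mul)
  //   e1 = n - d * q0      (Fma2)
  //   q1 = q0 + r1 * e1    (Fma3)
  //   e2 = n - d * q1      (Fma4, consumed by div_fmas/div_fixup below)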
2452 
2453   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2454   // aren't modeled as reading it.
2455   if (!Mode.allFP32Denormals())
2456     toggleSPDenormMode(true, B, ST, Mode);
2457 
2458   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2459   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2460   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2461   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2462   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2463   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2464 
2465   if (!Mode.allFP32Denormals())
2466     toggleSPDenormMode(false, B, ST, Mode);
2467 
2468   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2469     .addUse(Fma4.getReg(0))
2470     .addUse(Fma1.getReg(0))
2471     .addUse(Fma3.getReg(0))
2472     .addUse(NumeratorScaled.getReg(1))
2473     .setMIFlags(Flags);
2474 
2475   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2476     .addUse(Fmas.getReg(0))
2477     .addUse(RHS)
2478     .addUse(LHS)
2479     .setMIFlags(Flags);
2480 
2481   MI.eraseFromParent();
2482   return true;
2483 }
2484 
2485 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2486                                          MachineRegisterInfo &MRI,
2487                                          MachineIRBuilder &B) const {
2488   B.setInstr(MI);
2489   Register Res = MI.getOperand(0).getReg();
2490   Register LHS = MI.getOperand(1).getReg();
2491   Register RHS = MI.getOperand(2).getReg();
2492 
2493   uint16_t Flags = MI.getFlags();
2494 
2495   LLT S64 = LLT::scalar(64);
2496   LLT S1 = LLT::scalar(1);
2497 
2498   auto One = B.buildFConstant(S64, 1.0);
2499 
2500   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2501     .addUse(LHS)
2502     .addUse(RHS)
2503     .addImm(1)
2504     .setMIFlags(Flags);
2505 
2506   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
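  // Same idea as the f32 path: two refinements of the reciprocal, then a
  // quotient estimate and its error term (d = DivScale0, n = DivScale1,
  // r0 = Rcp):
  //   e0 = 1 - d * r0   (Fma0)    r1 = r0 + r0 * e0   (Fma1)
  //   e1 = 1 - d * r1   (Fma2)    r2 = r1 + r1 * e1   (Fma3)
  //   q  = n * r2       (Mul)     e2 = n - d * q      (Fma4)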
2507 
2508   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2509     .addUse(DivScale0.getReg(0))
2510     .setMIFlags(Flags);
2511 
2512   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2513   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2514   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2515 
2516   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2517     .addUse(LHS)
2518     .addUse(RHS)
2519     .addImm(0)
2520     .setMIFlags(Flags);
2521 
2522   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2524   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2525 
2526   Register Scale;
2527   if (!ST.hasUsableDivScaleConditionOutput()) {
2528     // Workaround a hardware bug on SI where the condition output from div_scale
2529     // is not usable.
2530 
2531     LLT S32 = LLT::scalar(32);
2532 
2533     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2534     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2535     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2536     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2537 
2538     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2539                               Scale1Unmerge.getReg(1));
2540     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2541                               Scale0Unmerge.getReg(1));
2542     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2543   } else {
2544     Scale = DivScale1.getReg(1);
2545   }
2546 
2547   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2548     .addUse(Fma4.getReg(0))
2549     .addUse(Fma3.getReg(0))
2550     .addUse(Mul.getReg(0))
2551     .addUse(Scale)
2552     .setMIFlags(Flags);
2553 
2554   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2555     .addUse(Fmas.getReg(0))
2556     .addUse(RHS)
2557     .addUse(LHS)
2558     .setMIFlags(Flags);
2559 
2560   MI.eraseFromParent();
2561   return true;
2562 }
2563 
2564 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2565                                                  MachineRegisterInfo &MRI,
2566                                                  MachineIRBuilder &B) const {
2567   B.setInstr(MI);
2568   Register Res = MI.getOperand(0).getReg();
2569   Register LHS = MI.getOperand(2).getReg();
2570   Register RHS = MI.getOperand(3).getReg();
2571   uint16_t Flags = MI.getFlags();
2572 
2573   LLT S32 = LLT::scalar(32);
2574   LLT S1 = LLT::scalar(1);
2575 
2576   auto Abs = B.buildFAbs(S32, RHS, Flags);
2577   const APFloat C0Val(1.0f);
2578 
2579   auto C0 = B.buildConstant(S32, 0x6f800000);
2580   auto C1 = B.buildConstant(S32, 0x2f800000);
2581   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
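  // If |RHS| is large (> 2^96, C0), prescale it by 2^-32 (C1) before the rcp
  // and multiply the final product by the same factor (otherwise by 1.0, C2),
  // so the quotient is unchanged while the intermediate values stay away from
  // the extremes of the f32 range.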
2582 
2583   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2584   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2585 
2586   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2587 
2588   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2589     .addUse(Mul0.getReg(0))
2590     .setMIFlags(Flags);
2591 
2592   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2593 
2594   B.buildFMul(Res, Sel, Mul1, Flags);
2595 
2596   MI.eraseFromParent();
2597   return true;
2598 }
2599 
2600 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2601                                                  MachineRegisterInfo &MRI,
2602                                                  MachineIRBuilder &B) const {
2603   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2604   if (!MFI->isEntryFunction()) {
2605     return legalizePreloadedArgIntrin(MI, MRI, B,
2606                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2607   }
2608 
2609   B.setInstr(MI);
2610 
2611   uint64_t Offset =
2612     ST.getTargetLowering()->getImplicitParameterOffset(
2613       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2614   Register DstReg = MI.getOperand(0).getReg();
2615   LLT DstTy = MRI.getType(DstReg);
2616   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2617 
2618   const ArgDescriptor *Arg;
2619   const TargetRegisterClass *RC;
2620   std::tie(Arg, RC)
2621     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2622   if (!Arg)
2623     return false;
2624 
2625   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2626   if (!loadInputValue(KernargPtrReg, B, Arg))
2627     return false;
2628 
2629   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2630   MI.eraseFromParent();
2631   return true;
2632 }
2633 
2634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2635                                               MachineRegisterInfo &MRI,
2636                                               MachineIRBuilder &B,
2637                                               unsigned AddrSpace) const {
2638   B.setInstr(MI);
2639   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
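  // A flat pointer lies in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base.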
2640   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2641   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2642   MI.eraseFromParent();
2643   return true;
2644 }
2645 
2646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2647 // offset (the offset that is included in bounds checking and swizzling, to be
2648 // split between the instruction's voffset and immoffset fields) and soffset
2649 // (the offset that is excluded from bounds checking and swizzling, to go in
2650 // the instruction's soffset field).  This function takes the first kind of
2651 // offset and figures out how to split it between voffset and immoffset.
2652 std::tuple<Register, unsigned, unsigned>
2653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2654                                         Register OrigOffset) const {
2655   const unsigned MaxImm = 4095;
2656   Register BaseReg;
2657   unsigned TotalConstOffset;
2658   MachineInstr *OffsetDef;
2659   const LLT S32 = LLT::scalar(32);
2660 
2661   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2662     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2663 
2664   unsigned ImmOffset = TotalConstOffset;
2665 
2666   // If the immediate value is too big for the immoffset field, put the value
2667   // and -4096 into the immoffset field so that the value that is copied/added
2668   // for the voffset field is a multiple of 4096, and it stands more chance
2669   // of being CSEd with the copy/add for another similar load/store.
2670   // However, do not do that rounding down to a multiple of 4096 if that is a
2671   // negative number, as it appears to be illegal to have a negative offset
2672   // in the vgpr, even if adding the immediate offset makes it positive.
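  // For example, a constant offset of 5000 is split into Overflow = 4096
  // (added into the base register) and ImmOffset = 904.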
2673   unsigned Overflow = ImmOffset & ~MaxImm;
2674   ImmOffset -= Overflow;
2675   if ((int32_t)Overflow < 0) {
2676     Overflow += ImmOffset;
2677     ImmOffset = 0;
2678   }
2679 
2680   if (Overflow != 0) {
2681     if (!BaseReg) {
2682       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2683     } else {
2684       auto OverflowVal = B.buildConstant(S32, Overflow);
2685       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2686     }
2687   }
2688 
2689   if (!BaseReg)
2690     BaseReg = B.buildConstant(S32, 0).getReg(0);
2691 
2692   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2693 }
2694 
2695 /// Handle register layout difference for f16 images for some subtargets.
2696 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2697                                              MachineRegisterInfo &MRI,
2698                                              Register Reg) const {
2699   if (!ST.hasUnpackedD16VMem())
2700     return Reg;
2701 
2702   const LLT S16 = LLT::scalar(16);
2703   const LLT S32 = LLT::scalar(32);
2704   LLT StoreVT = MRI.getType(Reg);
2705   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2706 
2707   auto Unmerge = B.buildUnmerge(S16, Reg);
2708 
2709   SmallVector<Register, 4> WideRegs;
2710   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2711     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2712 
2713   int NumElts = StoreVT.getNumElements();
2714 
2715   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2716 }
2717 
2718 Register AMDGPULegalizerInfo::fixStoreSourceType(
2719   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2720   MachineRegisterInfo *MRI = B.getMRI();
2721   LLT Ty = MRI->getType(VData);
2722 
2723   const LLT S16 = LLT::scalar(16);
2724 
2725   // Fixup illegal register types for i8 stores.
2726   if (Ty == LLT::scalar(8) || Ty == S16) {
2727     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2728     return AnyExt;
2729   }
2730 
2731   if (Ty.isVector()) {
2732     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2733       if (IsFormat)
2734         return handleD16VData(B, *MRI, VData);
2735     }
2736   }
2737 
2738   return VData;
2739 }
2740 
2741 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2742                                               MachineRegisterInfo &MRI,
2743                                               MachineIRBuilder &B,
2744                                               bool IsTyped,
2745                                               bool IsFormat) const {
2746   B.setInstr(MI);
2747 
2748   Register VData = MI.getOperand(1).getReg();
2749   LLT Ty = MRI.getType(VData);
2750   LLT EltTy = Ty.getScalarType();
2751   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2752   const LLT S32 = LLT::scalar(32);
2753 
2754   VData = fixStoreSourceType(B, VData, IsFormat);
2755   Register RSrc = MI.getOperand(2).getReg();
2756 
2757   MachineMemOperand *MMO = *MI.memoperands_begin();
2758   const int MemSize = MMO->getSize();
2759 
2760   unsigned ImmOffset;
2761   unsigned TotalOffset;
2762 
2763   // The typed intrinsics add an immediate after the registers.
2764   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2765 
2766   // The struct intrinsic variants add one additional operand over raw.
2767   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2768   Register VIndex;
2769   int OpOffset = 0;
2770   if (HasVIndex) {
2771     VIndex = MI.getOperand(3).getReg();
2772     OpOffset = 1;
2773   }
2774 
2775   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2776   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2777 
2778   unsigned Format = 0;
2779   if (IsTyped) {
2780     Format = MI.getOperand(5 + OpOffset).getImm();
2781     ++OpOffset;
2782   }
2783 
2784   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2785 
2786   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2787   if (TotalOffset != 0)
2788     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2789 
2790   unsigned Opc;
2791   if (IsTyped) {
2792     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2793                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2794   } else if (IsFormat) {
2795     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2796                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2797   } else {
2798     switch (MemSize) {
2799     case 1:
2800       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2801       break;
2802     case 2:
2803       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2804       break;
2805     default:
2806       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2807       break;
2808     }
2809   }
2810 
2811   if (!VIndex)
2812     VIndex = B.buildConstant(S32, 0).getReg(0);
2813 
2814   auto MIB = B.buildInstr(Opc)
2815     .addUse(VData)              // vdata
2816     .addUse(RSrc)               // rsrc
2817     .addUse(VIndex)             // vindex
2818     .addUse(VOffset)            // voffset
2819     .addUse(SOffset)            // soffset
2820     .addImm(ImmOffset);         // offset(imm)
2821 
2822   if (IsTyped)
2823     MIB.addImm(Format);
2824 
2825   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2826      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2827      .addMemOperand(MMO);
2828 
2829   MI.eraseFromParent();
2830   return true;
2831 }
2832 
2833 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2834                                              MachineRegisterInfo &MRI,
2835                                              MachineIRBuilder &B,
2836                                              bool IsFormat,
2837                                              bool IsTyped) const {
2838   B.setInstr(MI);
2839 
2840   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2841   MachineMemOperand *MMO = *MI.memoperands_begin();
2842   const int MemSize = MMO->getSize();
2843   const LLT S32 = LLT::scalar(32);
2844 
2845   Register Dst = MI.getOperand(0).getReg();
2846   Register RSrc = MI.getOperand(2).getReg();
2847 
2848   // The typed intrinsics add an immediate after the registers.
2849   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2850 
2851   // The struct intrinsic variants add one additional operand over raw.
2852   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2853   Register VIndex;
2854   int OpOffset = 0;
2855   if (HasVIndex) {
2856     VIndex = MI.getOperand(3).getReg();
2857     OpOffset = 1;
2858   }
2859 
2860   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2861   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2862 
2863   unsigned Format = 0;
2864   if (IsTyped) {
2865     Format = MI.getOperand(5 + OpOffset).getImm();
2866     ++OpOffset;
2867   }
2868 
2869   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2870   unsigned ImmOffset;
2871   unsigned TotalOffset;
2872 
2873   LLT Ty = MRI.getType(Dst);
2874   LLT EltTy = Ty.getScalarType();
2875   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2876   const bool Unpacked = ST.hasUnpackedD16VMem();
2877 
2878   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2879   if (TotalOffset != 0)
2880     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2881 
2882   unsigned Opc;
2883 
2884   if (IsTyped) {
2885     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2886                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2887   } else if (IsFormat) {
2888     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2889                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2890   } else {
2891     switch (MemSize) {
2892     case 1:
2893       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2894       break;
2895     case 2:
2896       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2897       break;
2898     default:
2899       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2900       break;
2901     }
2902   }
2903 
2904   Register LoadDstReg;
2905 
2906   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2907   LLT UnpackedTy = Ty.changeElementSize(32);
2908 
2909   if (IsExtLoad)
2910     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2911   else if (Unpacked && IsD16 && Ty.isVector())
2912     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2913   else
2914     LoadDstReg = Dst;
2915 
2916   if (!VIndex)
2917     VIndex = B.buildConstant(S32, 0).getReg(0);
2918 
2919   auto MIB = B.buildInstr(Opc)
2920     .addDef(LoadDstReg)         // vdata
2921     .addUse(RSrc)               // rsrc
2922     .addUse(VIndex)             // vindex
2923     .addUse(VOffset)            // voffset
2924     .addUse(SOffset)            // soffset
2925     .addImm(ImmOffset);         // offset(imm)
2926 
2927   if (IsTyped)
2928     MIB.addImm(Format);
2929 
2930   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2931      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2932      .addMemOperand(MMO);
2933 
2934   if (LoadDstReg != Dst) {
2935     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2936 
2937     // The result for an extending load was widened; truncate it back down.
2938     if (IsExtLoad)
2939       B.buildTrunc(Dst, LoadDstReg);
2940     else {
2941       // Repack to original 16-bit vector result
2942       // FIXME: G_TRUNC should work, but legalization currently fails
2943       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2944       SmallVector<Register, 4> Repack;
2945       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2946         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2947       B.buildMerge(Dst, Repack);
2948     }
2949   }
2950 
2951   MI.eraseFromParent();
2952   return true;
2953 }
2954 
2955 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2956                                                MachineIRBuilder &B,
2957                                                bool IsInc) const {
2958   B.setInstr(MI);
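  // Lower directly to the target pseudo, forwarding the pointer and value
  // operands and cloning the memory operands from the intrinsic.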
2959   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2960                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2961   B.buildInstr(Opc)
2962     .addDef(MI.getOperand(0).getReg())
2963     .addUse(MI.getOperand(2).getReg())
2964     .addUse(MI.getOperand(3).getReg())
2965     .cloneMemRefs(MI);
2966   MI.eraseFromParent();
2967   return true;
2968 }
2969 
2970 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2971   switch (IntrID) {
2972   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2973   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2974     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2975   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2976   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2977     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2978   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2979   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2980     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2981   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2982   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2983     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2984   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2985   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2986     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2987   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2988   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2989     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2990   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2991   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2992     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2993   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2994   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2995     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2996   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2997   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2998     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2999   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3000   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3001     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3002   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3003   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3004     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3005   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3006   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3007     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3008   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3009   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3011   default:
3012     llvm_unreachable("unhandled atomic opcode");
3013   }
3014 }
3015 
3016 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3017                                                MachineIRBuilder &B,
3018                                                Intrinsic::ID IID) const {
3019   B.setInstr(MI);
3020 
3021   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3022                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3023 
3024   Register Dst = MI.getOperand(0).getReg();
3025   Register VData = MI.getOperand(2).getReg();
3026 
3027   Register CmpVal;
3028   int OpOffset = 0;
3029 
3030   if (IsCmpSwap) {
3031     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3032     ++OpOffset;
3033   }
3034 
3035   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3036   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
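  // Operand order consumed below:
  //   dst, intrinsic-id, vdata, [cmp,] rsrc, [vindex,] voffset, soffset, aux
  // with the compare value only present for the cmpswap forms.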
3037 
3038   // The struct intrinsic variants add one additional operand over raw.
3039   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3040   Register VIndex;
3041   if (HasVIndex) {
3042     VIndex = MI.getOperand(4 + OpOffset).getReg();
3043     ++OpOffset;
3044   }
3045 
3046   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3047   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3048   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3049 
3050   MachineMemOperand *MMO = *MI.memoperands_begin();
3051 
3052   unsigned ImmOffset;
3053   unsigned TotalOffset;
3054   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3055   if (TotalOffset != 0)
3056     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3057 
3058   if (!VIndex)
3059     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3060 
3061   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3062     .addDef(Dst)
3063     .addUse(VData); // vdata
3064 
3065   if (IsCmpSwap)
3066     MIB.addReg(CmpVal);
3067 
3068   MIB.addUse(RSrc)               // rsrc
3069      .addUse(VIndex)             // vindex
3070      .addUse(VOffset)            // voffset
3071      .addUse(SOffset)            // soffset
3072      .addImm(ImmOffset)          // offset(imm)
3073      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3074      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3075      .addMemOperand(MMO);
3076 
3077   MI.eraseFromParent();
3078   return true;
3079 }
3080 
3081 // Produce a vector of s16 elements from s32 pieces.
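// e.g. four s32 pieces are each truncated to s16 and then reassembled into a
// <4 x s16> build_vector.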
3082 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3083                              ArrayRef<Register> UnmergeParts) {
3084   const LLT S16 = LLT::scalar(16);
3085 
3086   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3087   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3088     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3089 
3090   B.buildBuildVector(DstReg, RemergeParts);
3091 }
3092 
3093 /// Convert a set of s32 registers to a result vector with s16 elements.
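/// Each incoming s32 piece is reinterpreted as a <2 x s16> pair; e.g. two s32
/// registers become a <4 x s16> concat_vectors, while a <3 x s16> result is
/// recovered through the v6s16 widening path below.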
3094 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3095                                ArrayRef<Register> UnmergeParts) {
3096   MachineRegisterInfo &MRI = *B.getMRI();
3097   const LLT V2S16 = LLT::vector(2, 16);
3098   LLT TargetTy = MRI.getType(DstReg);
3099   int NumElts = UnmergeParts.size();
3100 
3101   if (NumElts == 1) {
3102     assert(TargetTy == V2S16);
3103     B.buildBitcast(DstReg, UnmergeParts[0]);
3104     return;
3105   }
3106 
3107   SmallVector<Register, 4> RemergeParts(NumElts);
3108   for (int I = 0; I != NumElts; ++I)
3109     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3110 
3111   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3112     B.buildConcatVectors(DstReg, RemergeParts);
3113     return;
3114   }
3115 
3116   const LLT V3S16 = LLT::vector(3, 16);
3117   const LLT V6S16 = LLT::vector(6, 16);
3118 
3119   // Widen to v6s16 and unpack v3 parts.
3120   assert(TargetTy == V3S16);
3121 
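  // Pad with an undef <2 x s16> so the concat forms a <6 x s16>, then unmerge
  // it into two <3 x s16> halves and keep only the low half.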
3122   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3123   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3124   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3125 }
3126 
3127 // FIXME: Just a vector trunc should be sufficient, but legalization is
3128 // currently broken.
3129 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3130                                   Register WideDstReg) {
3131   const LLT S32 = LLT::scalar(32);
3132   const LLT S16 = LLT::scalar(16);
3133 
3134   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3135 
3136   int NumOps = Unmerge->getNumOperands() - 1;
3137   SmallVector<Register, 4> RemergeParts(NumOps);
3138   for (int I = 0; I != NumOps; ++I)
3139     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3140 
3141   B.buildBuildVector(DstReg, RemergeParts);
3142 }
3143 
3144 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3145     MachineInstr &MI, MachineIRBuilder &B,
3146     GISelChangeObserver &Observer,
3147     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3148   bool IsTFE = MI.getNumExplicitDefs() == 2;
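  // TFE variants return a second result value, so the intrinsic carries two
  // explicit defs.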
3149 
3150   // We only need to process the operands of d16 image operations on subtargets
3151   // that use the unpacked register layout, or to repack the TFE result.
3152 
3153   // TODO: Need to handle a16 images too
3154   // TODO: Do we need to guard against already legalized intrinsics?
3155   if (!IsTFE && !ST.hasUnpackedD16VMem())
3156     return true;
3157 
3158   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3159     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3160 
3161   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3162     return true;
3163 
3164   B.setInstr(MI);
3165 
3166   MachineRegisterInfo *MRI = B.getMRI();
3167   const LLT S32 = LLT::scalar(32);
3168   const LLT S16 = LLT::scalar(16);
3169 
3170   if (BaseOpcode->Store) { // No TFE for stores?
3171     Register VData = MI.getOperand(1).getReg();
3172     LLT Ty = MRI->getType(VData);
3173     if (!Ty.isVector() || Ty.getElementType() != S16)
3174       return true;
3175 
3176     B.setInstr(MI);
3177 
3178     Observer.changingInstr(MI);
3179     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3180     Observer.changedInstr(MI);
3181     return true;
3182   }
3183 
3184   Register DstReg = MI.getOperand(0).getReg();
3185   LLT Ty = MRI->getType(DstReg);
3186   const LLT EltTy = Ty.getScalarType();
3187   const bool IsD16 = Ty.getScalarType() == S16;
3188   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3189 
3190   if (IsTFE) {
3191     // In the IR, TFE is supposed to be used with a two-element struct return
3192     // type. The instruction really returns these two values in one contiguous
3193     // register, with one additional dword beyond the loaded data. Rewrite the
3194     // return type to use a single register result.
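    // e.g. a {<4 x s32>, i32} struct return is rewritten to a single <5 x s32>
    // register, with the additional dword following the loaded data.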
3195     Register Dst1Reg = MI.getOperand(1).getReg();
3196     if (MRI->getType(Dst1Reg) != S32)
3197       return false;
3198 
3199     // TODO: Make sure the TFE operand bit is set.
3200 
3201     // The raw, dword-aligned data component of the load. The only legal cases
3202     // where this matters should be when using the packed D16 format, for
3203     // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3204     LLT RoundedTy;
3205     LLT TFETy;
3206 
3207     if (IsD16 && ST.hasUnpackedD16VMem()) {
3208       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3209       TFETy = LLT::vector(NumElts + 1, 32);
3210     } else {
3211       unsigned EltSize = Ty.getScalarSizeInBits();
3212       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3213       unsigned RoundedSize = 32 * RoundedElts;
3214       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3215       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3216     }
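    // e.g. for a packed <3 x s16> result this gives RoundedTy = <4 x s16> and
    // TFETy = <3 x s32>: two data dwords plus the extra dword.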
3217 
3218     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3219     Observer.changingInstr(MI);
3220 
3221     MI.getOperand(0).setReg(TFEReg);
3222     MI.RemoveOperand(1);
3223 
3224     Observer.changedInstr(MI);
3225 
3226     // Insert after the instruction.
3227     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3228 
3229     // Now figure out how to copy the new result register back into the old
3230     // result.
3231 
3232     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3233     int NumDataElts = TFETy.getNumElements() - 1;
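    // The final unmerge result is the extra dword and is written directly to
    // the original second def; the data dwords are given their own registers
    // below.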
3234 
3235     if (!Ty.isVector()) {
3236       // Simplest case is a trivial unmerge (plus a truncate for d16).
3237       UnmergeResults[0] = Ty == S32 ?
3238         DstReg : MRI->createGenericVirtualRegister(S32);
3239 
3240       B.buildUnmerge(UnmergeResults, TFEReg);
3241       if (Ty != S32)
3242         B.buildTrunc(DstReg, UnmergeResults[0]);
3243       return true;
3244     }
3245 
3246     // We have to repack into a new vector of some kind.
3247     for (int I = 0; I != NumDataElts; ++I)
3248       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3249     B.buildUnmerge(UnmergeResults, TFEReg);
3250 
3251     // Drop the final TFE element.
3252     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3253 
3254     if (EltTy == S32)
3255       B.buildBuildVector(DstReg, DataPart);
3256     else if (ST.hasUnpackedD16VMem())
3257       truncToS16Vector(B, DstReg, DataPart);
3258     else
3259       bitcastToS16Vector(B, DstReg, DataPart);
3260 
3261     return true;
3262   }
3263 
3264   // Must be an image load.
3265   if (!Ty.isVector() || Ty.getElementType() != S16)
3266     return true;
3267 
3268   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3269 
3270   LLT WidenedTy = Ty.changeElementType(S32);
3271   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3272 
3273   Observer.changingInstr(MI);
3274   MI.getOperand(0).setReg(WideDstReg);
3275   Observer.changedInstr(MI);
3276 
3277   repackUnpackedD16Load(B, DstReg, WideDstReg);
3278   return true;
3279 }
3280 
3281 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3282   MachineInstr &MI, MachineIRBuilder &B,
3283   GISelChangeObserver &Observer) const {
3284   Register Dst = MI.getOperand(0).getReg();
3285   LLT Ty = B.getMRI()->getType(Dst);
3286   unsigned Size = Ty.getSizeInBits();
3287   MachineFunction &MF = B.getMF();
3288 
3289   Observer.changingInstr(MI);
3290 
3291   // FIXME: We don't really need this intermediate instruction. The intrinsic
3292   // should be fixed to have a memory operand. Since it's readnone, we're not
3293   // allowed to add one.
3294   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3295   MI.RemoveOperand(1); // Remove intrinsic ID
3296 
3297   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3298   // TODO: Should this use datalayout alignment?
3299   const unsigned MemSize = (Size + 7) / 8;
3300   const unsigned MemAlign = 4;
3301   MachineMemOperand *MMO = MF.getMachineMemOperand(
3302     MachinePointerInfo(),
3303     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3304     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3305   MI.addMemOperand(MF, MMO);
3306 
3307   // There are no 96-bit result scalar loads, but widening to 128-bit should
3308   // always be legal. We may need to restore this to a 96-bit result if it turns
3309   // out this needs to be converted to a vector load during RegBankSelect.
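  // e.g. an s96 result is widened to s128, and a <3 x s32> result to <4 x s32>.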
3310   if (!isPowerOf2_32(Size)) {
3311     LegalizerHelper Helper(MF, *this, Observer, B);
3312     B.setInstr(MI);
3313 
3314     if (Ty.isVector())
3315       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3316     else
3317       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3318   }
3319 
3320   Observer.changedInstr(MI);
3321   return true;
3322 }
3323 
3324 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3325                                             MachineIRBuilder &B,
3326                                             GISelChangeObserver &Observer) const {
3327   MachineRegisterInfo &MRI = *B.getMRI();
3328 
3329   // Replace the use of the G_BRCOND with the exec-manipulating branch pseudos.
3330   auto IntrID = MI.getIntrinsicID();
3331   switch (IntrID) {
3332   case Intrinsic::amdgcn_if:
3333   case Intrinsic::amdgcn_else: {
3334     MachineInstr *Br = nullptr;
3335     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3336       const SIRegisterInfo *TRI
3337         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3338 
3339       B.setInstr(*BrCond);
3340       Register Def = MI.getOperand(1).getReg();
3341       Register Use = MI.getOperand(3).getReg();
3342 
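      // If an unconditional G_BR follows the G_BRCOND, the pseudo should branch
      // to that G_BR's target, and the G_BR itself is redirected to the
      // original conditional target below.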
3343       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3344       if (Br)
3345         BrTarget = Br->getOperand(0).getMBB();
3346 
3347       if (IntrID == Intrinsic::amdgcn_if) {
3348         B.buildInstr(AMDGPU::SI_IF)
3349           .addDef(Def)
3350           .addUse(Use)
3351           .addMBB(BrTarget);
3352       } else {
3353         B.buildInstr(AMDGPU::SI_ELSE)
3354           .addDef(Def)
3355           .addUse(Use)
3356           .addMBB(BrTarget)
3357           .addImm(0);
3358       }
3359 
3360       if (Br)
3361         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3362 
3363       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3364       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3365       MI.eraseFromParent();
3366       BrCond->eraseFromParent();
3367       return true;
3368     }
3369 
3370     return false;
3371   }
3372   case Intrinsic::amdgcn_loop: {
3373     MachineInstr *Br = nullptr;
3374     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3375       const SIRegisterInfo *TRI
3376         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3377 
3378       B.setInstr(*BrCond);
3379 
3380       // FIXME: Need to adjust branch targets based on unconditional branch.
3381       Register Reg = MI.getOperand(2).getReg();
3382       B.buildInstr(AMDGPU::SI_LOOP)
3383         .addUse(Reg)
3384         .addMBB(BrCond->getOperand(1).getMBB());
3385       MI.eraseFromParent();
3386       BrCond->eraseFromParent();
3387       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3388       return true;
3389     }
3390 
3391     return false;
3392   }
3393   case Intrinsic::amdgcn_kernarg_segment_ptr:
3394     return legalizePreloadedArgIntrin(
3395       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3396   case Intrinsic::amdgcn_implicitarg_ptr:
3397     return legalizeImplicitArgPtr(MI, MRI, B);
3398   case Intrinsic::amdgcn_workitem_id_x:
3399     return legalizePreloadedArgIntrin(MI, MRI, B,
3400                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3401   case Intrinsic::amdgcn_workitem_id_y:
3402     return legalizePreloadedArgIntrin(MI, MRI, B,
3403                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3404   case Intrinsic::amdgcn_workitem_id_z:
3405     return legalizePreloadedArgIntrin(MI, MRI, B,
3406                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3407   case Intrinsic::amdgcn_workgroup_id_x:
3408     return legalizePreloadedArgIntrin(MI, MRI, B,
3409                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3410   case Intrinsic::amdgcn_workgroup_id_y:
3411     return legalizePreloadedArgIntrin(MI, MRI, B,
3412                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3413   case Intrinsic::amdgcn_workgroup_id_z:
3414     return legalizePreloadedArgIntrin(MI, MRI, B,
3415                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3416   case Intrinsic::amdgcn_dispatch_ptr:
3417     return legalizePreloadedArgIntrin(MI, MRI, B,
3418                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3419   case Intrinsic::amdgcn_queue_ptr:
3420     return legalizePreloadedArgIntrin(MI, MRI, B,
3421                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3422   case Intrinsic::amdgcn_implicit_buffer_ptr:
3423     return legalizePreloadedArgIntrin(
3424       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3425   case Intrinsic::amdgcn_dispatch_id:
3426     return legalizePreloadedArgIntrin(MI, MRI, B,
3427                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3428   case Intrinsic::amdgcn_fdiv_fast:
3429     return legalizeFDIVFastIntrin(MI, MRI, B);
3430   case Intrinsic::amdgcn_is_shared:
3431     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3432   case Intrinsic::amdgcn_is_private:
3433     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3434   case Intrinsic::amdgcn_wavefrontsize: {
3435     B.setInstr(MI);
3436     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3437     MI.eraseFromParent();
3438     return true;
3439   }
3440   case Intrinsic::amdgcn_s_buffer_load:
3441     return legalizeSBufferLoad(MI, B, Observer);
3442   case Intrinsic::amdgcn_raw_buffer_store:
3443   case Intrinsic::amdgcn_struct_buffer_store:
3444     return legalizeBufferStore(MI, MRI, B, false, false);
3445   case Intrinsic::amdgcn_raw_buffer_store_format:
3446   case Intrinsic::amdgcn_struct_buffer_store_format:
3447     return legalizeBufferStore(MI, MRI, B, false, true);
3448   case Intrinsic::amdgcn_raw_tbuffer_store:
3449   case Intrinsic::amdgcn_struct_tbuffer_store:
3450     return legalizeBufferStore(MI, MRI, B, true, true);
3451   case Intrinsic::amdgcn_raw_buffer_load:
3452   case Intrinsic::amdgcn_struct_buffer_load:
3453     return legalizeBufferLoad(MI, MRI, B, false, false);
3454   case Intrinsic::amdgcn_raw_buffer_load_format:
3455   case Intrinsic::amdgcn_struct_buffer_load_format:
3456     return legalizeBufferLoad(MI, MRI, B, true, false);
3457   case Intrinsic::amdgcn_raw_tbuffer_load:
3458   case Intrinsic::amdgcn_struct_tbuffer_load:
3459     return legalizeBufferLoad(MI, MRI, B, true, true);
3460   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3461   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3462   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3463   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3464   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3466   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3467   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3468   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3469   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3470   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3471   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3472   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3473   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3474   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3475   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3476   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3477   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3478   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3479   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3480   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3482   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3483   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3484   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3486     return legalizeBufferAtomic(MI, B, IntrID);
3487   case Intrinsic::amdgcn_atomic_inc:
3488     return legalizeAtomicIncDec(MI, B, true);
3489   case Intrinsic::amdgcn_atomic_dec:
3490     return legalizeAtomicIncDec(MI, B, false);
3491   default: {
3492     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3493             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3494       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3495     return true;
3496   }
3497   }
3498 
3499   return true;
3500 }
3501