1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two elements
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two bits
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S128 = LLT::scalar(128);
195   const LLT S256 = LLT::scalar(256);
196   const LLT S1024 = LLT::scalar(1024);
197 
198   const LLT V2S16 = LLT::vector(2, 16);
199   const LLT V4S16 = LLT::vector(4, 16);
200 
201   const LLT V2S32 = LLT::vector(2, 32);
202   const LLT V3S32 = LLT::vector(3, 32);
203   const LLT V4S32 = LLT::vector(4, 32);
204   const LLT V5S32 = LLT::vector(5, 32);
205   const LLT V6S32 = LLT::vector(6, 32);
206   const LLT V7S32 = LLT::vector(7, 32);
207   const LLT V8S32 = LLT::vector(8, 32);
208   const LLT V9S32 = LLT::vector(9, 32);
209   const LLT V10S32 = LLT::vector(10, 32);
210   const LLT V11S32 = LLT::vector(11, 32);
211   const LLT V12S32 = LLT::vector(12, 32);
212   const LLT V13S32 = LLT::vector(13, 32);
213   const LLT V14S32 = LLT::vector(14, 32);
214   const LLT V15S32 = LLT::vector(15, 32);
215   const LLT V16S32 = LLT::vector(16, 32);
216   const LLT V32S32 = LLT::vector(32, 32);
217 
218   const LLT V2S64 = LLT::vector(2, 64);
219   const LLT V3S64 = LLT::vector(3, 64);
220   const LLT V4S64 = LLT::vector(4, 64);
221   const LLT V5S64 = LLT::vector(5, 64);
222   const LLT V6S64 = LLT::vector(6, 64);
223   const LLT V7S64 = LLT::vector(7, 64);
224   const LLT V8S64 = LLT::vector(8, 64);
225   const LLT V16S64 = LLT::vector(16, 64);
226 
227   std::initializer_list<LLT> AllS32Vectors =
228     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
229      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
230   std::initializer_list<LLT> AllS64Vectors =
231     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
232 
233   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
234   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
235   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
236   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
237   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
238   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
239   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
240 
241   const LLT CodePtr = FlatPtr;
242 
243   const std::initializer_list<LLT> AddrSpaces64 = {
244     GlobalPtr, ConstantPtr, FlatPtr
245   };
246 
247   const std::initializer_list<LLT> AddrSpaces32 = {
248     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
249   };
250 
251   const std::initializer_list<LLT> FPTypesBase = {
252     S32, S64
253   };
254 
255   const std::initializer_list<LLT> FPTypes16 = {
256     S32, S64, S16
257   };
258 
259   const std::initializer_list<LLT> FPTypesPK16 = {
260     S32, S64, S16, V2S16
261   };
262 
263   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
264 
265   setAction({G_BRCOND, S1}, Legal); // VCC branches
266   setAction({G_BRCOND, S32}, Legal); // SCC branches
267 
268   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
269   // elements for v3s16
270   getActionDefinitionsBuilder(G_PHI)
271     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
272     .legalFor(AllS32Vectors)
273     .legalFor(AllS64Vectors)
274     .legalFor(AddrSpaces64)
275     .legalFor(AddrSpaces32)
276     .clampScalar(0, S32, S256)
277     .widenScalarToNextPow2(0, 32)
278     .clampMaxNumElements(0, S32, 16)
279     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
280     .legalIf(isPointer(0));
281 
282   if (ST.has16BitInsts()) {
283     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
284       .legalFor({S32, S16})
285       .clampScalar(0, S16, S32)
286       .scalarize(0)
287       .widenScalarToNextPow2(0, 32);
288   } else {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32})
291       .clampScalar(0, S32, S32)
292       .scalarize(0);
293   }
294 
295   // FIXME: Not really legal. Placeholder for custom lowering.
296   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
297     .legalFor({S32, S64})
298     .clampScalar(0, S32, S64)
299     .widenScalarToNextPow2(0, 32)
300     .scalarize(0);
301 
302   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
303     .legalFor({S32})
304     .clampScalar(0, S32, S32)
305     .scalarize(0);
306 
307   // Report legal for any types we can handle anywhere. For the cases only legal
308   // on the SALU, RegBankSelect will be able to re-legalize.
309   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
310     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
311     .clampScalar(0, S32, S64)
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
314     .widenScalarToNextPow2(0)
315     .scalarize(0);
316 
317   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
318                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
319     .legalFor({{S32, S1}, {S32, S32}})
320     .clampScalar(0, S32, S32)
321     .scalarize(0); // TODO: Implement.
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
341     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344     .clampScalarOrElt(0, S32, S1024)
345     .legalIf(isMultiple32(0))
346     .widenScalarToNextPow2(0, 32)
347     .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
351     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
352   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
353 
354   auto &FPOpActions = getActionDefinitionsBuilder(
355     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
356     .legalFor({S32, S64});
357   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
358     .customFor({S32, S64});
359   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
360     .customFor({S32, S64});
361 
362   if (ST.has16BitInsts()) {
363     if (ST.hasVOP3PInsts())
364       FPOpActions.legalFor({S16, V2S16});
365     else
366       FPOpActions.legalFor({S16});
367 
368     TrigActions.customFor({S16});
369     FDIVActions.customFor({S16});
370   }
371 
372   auto &MinNumMaxNum = getActionDefinitionsBuilder({
373       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
374 
375   if (ST.hasVOP3PInsts()) {
376     MinNumMaxNum.customFor(FPTypesPK16)
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampMaxNumElements(0, S16, 2)
379       .clampScalar(0, S16, S64)
380       .scalarize(0);
381   } else if (ST.has16BitInsts()) {
382     MinNumMaxNum.customFor(FPTypes16)
383       .clampScalar(0, S16, S64)
384       .scalarize(0);
385   } else {
386     MinNumMaxNum.customFor(FPTypesBase)
387       .clampScalar(0, S32, S64)
388       .scalarize(0);
389   }
390 
391   if (ST.hasVOP3PInsts())
392     FPOpActions.clampMaxNumElements(0, S16, 2);
393 
394   FPOpActions
395     .scalarize(0)
396     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
397 
398   TrigActions
399     .scalarize(0)
400     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
401 
402   FDIVActions
403     .scalarize(0)
404     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
405 
406   getActionDefinitionsBuilder({G_FNEG, G_FABS})
407     .legalFor(FPTypesPK16)
408     .clampMaxNumElements(0, S16, 2)
409     .scalarize(0)
410     .clampScalar(0, S16, S64);
411 
412   if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
414       .legalFor({S32, S64, S16})
415       .scalarize(0)
416       .clampScalar(0, S16, S64);
417   } else {
418     getActionDefinitionsBuilder(G_FSQRT)
419       .legalFor({S32, S64})
420       .scalarize(0)
421       .clampScalar(0, S32, S64);
422 
423     if (ST.hasFractBug()) {
424       getActionDefinitionsBuilder(G_FFLOOR)
425         .customFor({S64})
426         .legalFor({S32, S64})
427         .scalarize(0)
428         .clampScalar(0, S32, S64);
429     } else {
430       getActionDefinitionsBuilder(G_FFLOOR)
431         .legalFor({S32, S64})
432         .scalarize(0)
433         .clampScalar(0, S32, S64);
434     }
435   }
436 
437   getActionDefinitionsBuilder(G_FPTRUNC)
438     .legalFor({{S32, S64}, {S16, S32}})
439     .scalarize(0);
440 
441   getActionDefinitionsBuilder(G_FPEXT)
442     .legalFor({{S64, S32}, {S32, S16}})
443     .lowerFor({{S64, S16}}) // FIXME: Implement
444     .scalarize(0);
445 
446   getActionDefinitionsBuilder(G_FSUB)
447       // Use actual fsub instruction
448       .legalFor({S32})
449       // Must use fadd + fneg
450       .lowerFor({S64, S16, V2S16})
451       .scalarize(0)
452       .clampScalar(0, S32, S64);
453 
454   // Whether this is legal depends on the floating point mode for the function.
455   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
456   if (ST.hasMadF16())
457     FMad.customFor({S32, S16});
458   else
459     FMad.customFor({S32});
460   FMad.scalarize(0)
461       .lower();
462 
463   getActionDefinitionsBuilder(G_TRUNC)
464     .alwaysLegal();
465 
466   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
467     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
468                {S32, S1}, {S64, S1}, {S16, S1}})
469     .scalarize(0)
470     .clampScalar(0, S32, S64)
471     .widenScalarToNextPow2(1, 32);
472 
473   // TODO: Split s1->s64 during regbankselect for VALU.
474   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
475     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
476     .lowerFor({{S32, S64}})
477     .lowerIf(typeIs(1, S1))
478     .customFor({{S64, S64}});
479   if (ST.has16BitInsts())
480     IToFP.legalFor({{S16, S16}});
481   IToFP.clampScalar(1, S32, S64)
482        .scalarize(0);
483 
484   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
485     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
486     .customFor({{S64, S64}});
487   if (ST.has16BitInsts())
488     FPToI.legalFor({{S16, S16}});
489   else
490     FPToI.minScalar(1, S32);
491 
492   FPToI.minScalar(0, S32)
493        .scalarize(0)
494        .lower();
495 
496   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
497     .scalarize(0)
498     .lower();
499 
500   if (ST.has16BitInsts()) {
501     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
502       .legalFor({S16, S32, S64})
503       .clampScalar(0, S16, S64)
504       .scalarize(0);
505   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
506     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
507       .legalFor({S32, S64})
508       .clampScalar(0, S32, S64)
509       .scalarize(0);
510   } else {
511     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
512       .legalFor({S32})
513       .customFor({S64})
514       .clampScalar(0, S32, S64)
515       .scalarize(0);
516   }
517 
518   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
519     .scalarize(0)
520     .alwaysLegal();
521 
522   auto &CmpBuilder =
523     getActionDefinitionsBuilder(G_ICMP)
524     // The compare output type differs based on the register bank of the output,
525     // so make both s1 and s32 legal.
526     //
527     // Scalar compares producing output in scc will be promoted to s32, as that
528     // is the allocatable register type that will be needed for the copy from
529     // scc. This will be promoted during RegBankSelect, and we assume something
530     // before that won't try to use s32 result types.
531     //
532     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
533     // bank.
534     .legalForCartesianProduct(
535       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
536     .legalForCartesianProduct(
537       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
538   if (ST.has16BitInsts()) {
539     CmpBuilder.legalFor({{S1, S16}});
540   }
541 
542   CmpBuilder
543     .widenScalarToNextPow2(1)
544     .clampScalar(1, S32, S64)
545     .scalarize(0)
546     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
547 
548   getActionDefinitionsBuilder(G_FCMP)
549     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
550     .widenScalarToNextPow2(1)
551     .clampScalar(1, S32, S64)
552     .scalarize(0);
553 
554   // FIXME: fpow has a selection pattern that should move to custom lowering.
555   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
556   if (ST.has16BitInsts())
557     Exp2Ops.legalFor({S32, S16});
558   else
559     Exp2Ops.legalFor({S32});
560   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
561   Exp2Ops.scalarize(0);
562 
563   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
564   if (ST.has16BitInsts())
565     ExpOps.customFor({{S32}, {S16}});
566   else
567     ExpOps.customFor({S32});
568   ExpOps.clampScalar(0, MinScalarFPTy, S32)
569         .scalarize(0);
570 
571   // The 64-bit versions produce 32-bit results, but only on the SALU.
572   getActionDefinitionsBuilder(G_CTPOP)
573     .legalFor({{S32, S32}, {S32, S64}})
574     .clampScalar(0, S32, S32)
575     .clampScalar(1, S32, S64)
576     .scalarize(0)
577     .widenScalarToNextPow2(0, 32)
578     .widenScalarToNextPow2(1, 32);
579 
580   // The hardware instructions return a different result on 0 than the generic
581   // instructions expect. The hardware produces -1, but these produce the
582   // bitwidth.
583   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
584     .scalarize(0)
585     .clampScalar(0, S32, S32)
586     .clampScalar(1, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .widenScalarToNextPow2(1, 32)
589     .lower();
590 
591   // The 64-bit versions produce 32-bit results, but only on the SALU.
592   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
593     .legalFor({{S32, S32}, {S32, S64}})
594     .clampScalar(0, S32, S32)
595     .clampScalar(1, S32, S64)
596     .scalarize(0)
597     .widenScalarToNextPow2(0, 32)
598     .widenScalarToNextPow2(1, 32);
599 
600   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
601     .legalFor({S32})
602     .clampScalar(0, S32, S32)
603     .scalarize(0);
604 
605   if (ST.has16BitInsts()) {
606     if (ST.hasVOP3PInsts()) {
607       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
608         .legalFor({S32, S16, V2S16})
609         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610         .clampMaxNumElements(0, S16, 2)
611         .clampScalar(0, S16, S32)
612         .widenScalarToNextPow2(0)
613         .scalarize(0);
614     } else {
615       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
616         .legalFor({S32, S16})
617         .widenScalarToNextPow2(0)
618         .clampScalar(0, S16, S32)
619         .scalarize(0);
620     }
621   } else {
622     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
623       .legalFor({S32})
624       .clampScalar(0, S32, S32)
625       .widenScalarToNextPow2(0)
626       .scalarize(0);
627   }
628 
629   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
630     return [=](const LegalityQuery &Query) {
631       return Query.Types[TypeIdx0].getSizeInBits() <
632              Query.Types[TypeIdx1].getSizeInBits();
633     };
634   };
635 
636   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
637     return [=](const LegalityQuery &Query) {
638       return Query.Types[TypeIdx0].getSizeInBits() >
639              Query.Types[TypeIdx1].getSizeInBits();
640     };
641   };
642 
643   getActionDefinitionsBuilder(G_INTTOPTR)
644     // List the common cases
645     .legalForCartesianProduct(AddrSpaces64, {S64})
646     .legalForCartesianProduct(AddrSpaces32, {S32})
647     .scalarize(0)
648     // Accept any address space as long as the size matches
649     .legalIf(sameSize(0, 1))
650     .widenScalarIf(smallerThan(1, 0),
651       [](const LegalityQuery &Query) {
652         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
653       })
654     .narrowScalarIf(greaterThan(1, 0),
655       [](const LegalityQuery &Query) {
656         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
657       });
658 
659   getActionDefinitionsBuilder(G_PTRTOINT)
660     // List the common cases
661     .legalForCartesianProduct(AddrSpaces64, {S64})
662     .legalForCartesianProduct(AddrSpaces32, {S32})
663     .scalarize(0)
664     // Accept any address space as long as the size matches
665     .legalIf(sameSize(0, 1))
666     .widenScalarIf(smallerThan(0, 1),
667       [](const LegalityQuery &Query) {
668         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
669       })
670     .narrowScalarIf(
671       greaterThan(0, 1),
672       [](const LegalityQuery &Query) {
673         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
674       });
675 
676   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
677     .scalarize(0)
678     .custom();
679 
680   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
681   // handle some operations by just promoting the register during
682   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
683   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
684     switch (AS) {
685     // FIXME: Private element size.
686     case AMDGPUAS::PRIVATE_ADDRESS:
687       return 32;
688     // FIXME: Check subtarget
689     case AMDGPUAS::LOCAL_ADDRESS:
690       return ST.useDS128() ? 128 : 64;
691 
692     // Treat constant and global as identical. SMRD loads are sometimes usable
693     // for global loads (ideally constant address space should be eliminated)
694     // depending on the context. Legality cannot be context dependent, but
695     // RegBankSelect can split the load as necessary depending on the pointer
696     // register bank/uniformity and if the memory is invariant or not written in
697     // a kernel.
698     case AMDGPUAS::CONSTANT_ADDRESS:
699     case AMDGPUAS::GLOBAL_ADDRESS:
700       return IsLoad ? 512 : 128;
701     default:
702       return 128;
703     }
704   };
705 
706   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
707                                     bool IsLoad) -> bool {
708     const LLT DstTy = Query.Types[0];
709 
710     // Split vector extloads.
711     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
712     unsigned Align = Query.MMODescrs[0].AlignInBits;
713 
714     if (MemSize < DstTy.getSizeInBits())
715       MemSize = std::max(MemSize, Align);
716 
717     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
718       return true;
719 
720     const LLT PtrTy = Query.Types[1];
721     unsigned AS = PtrTy.getAddressSpace();
722     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
723       return true;
724 
725     // Catch weird sized loads that don't evenly divide into the access sizes
726     // TODO: May be able to widen depending on alignment etc.
727     unsigned NumRegs = (MemSize + 31) / 32;
728     if (NumRegs == 3) {
729       if (!ST.hasDwordx3LoadStores())
730         return true;
731     } else {
732       // If the alignment allows, these should have been widened.
733       if (!isPowerOf2_32(NumRegs))
734         return true;
735     }
736 
737     if (Align < MemSize) {
738       const SITargetLowering *TLI = ST.getTargetLowering();
739       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
740     }
741 
742     return false;
743   };
744 
745   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
746     unsigned Size = Query.Types[0].getSizeInBits();
747     if (isPowerOf2_32(Size))
748       return false;
749 
750     if (Size == 96 && ST.hasDwordx3LoadStores())
751       return false;
752 
753     unsigned AddrSpace = Query.Types[1].getAddressSpace();
754     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
755       return false;
756 
757     unsigned Align = Query.MMODescrs[0].AlignInBits;
758     unsigned RoundedSize = NextPowerOf2(Size);
759     return (Align >= RoundedSize);
760   };
761 
762   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
763   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
764   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
765 
766   // TODO: Refine based on subtargets which support unaligned access or 128-bit
767   // LDS
768   // TODO: Unsupported flat for SI.
769 
770   for (unsigned Op : {G_LOAD, G_STORE}) {
771     const bool IsStore = Op == G_STORE;
772 
773     auto &Actions = getActionDefinitionsBuilder(Op);
774     // Whitelist the common cases.
775     // TODO: Loads to s16 on gfx9
776     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
777                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
778                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
779                                       {S128, GlobalPtr, 128, GlobalAlign32},
780                                       {S64, GlobalPtr, 64, GlobalAlign32},
781                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
782                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
783                                       {S32, GlobalPtr, 8, GlobalAlign8},
784                                       {S32, GlobalPtr, 16, GlobalAlign16},
785 
786                                       {S32, LocalPtr, 32, 32},
787                                       {S64, LocalPtr, 64, 32},
788                                       {V2S32, LocalPtr, 64, 32},
789                                       {S32, LocalPtr, 8, 8},
790                                       {S32, LocalPtr, 16, 16},
791                                       {V2S16, LocalPtr, 32, 32},
792 
793                                       {S32, PrivatePtr, 32, 32},
794                                       {S32, PrivatePtr, 8, 8},
795                                       {S32, PrivatePtr, 16, 16},
796                                       {V2S16, PrivatePtr, 32, 32},
797 
798                                       {S32, FlatPtr, 32, GlobalAlign32},
799                                       {S32, FlatPtr, 16, GlobalAlign16},
800                                       {S32, FlatPtr, 8, GlobalAlign8},
801                                       {V2S16, FlatPtr, 32, GlobalAlign32},
802 
803                                       {S32, ConstantPtr, 32, GlobalAlign32},
804                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
805                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
806                                       {S64, ConstantPtr, 64, GlobalAlign32},
807                                       {S128, ConstantPtr, 128, GlobalAlign32},
808                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
809     Actions
810         .customIf(typeIs(1, Constant32Ptr))
811         // Widen suitably aligned loads by loading extra elements.
812         .moreElementsIf([=](const LegalityQuery &Query) {
813             const LLT Ty = Query.Types[0];
814             return Op == G_LOAD && Ty.isVector() &&
815                    shouldWidenLoadResult(Query);
816           }, moreElementsToNextPow2(0))
817         .widenScalarIf([=](const LegalityQuery &Query) {
818             const LLT Ty = Query.Types[0];
819             return Op == G_LOAD && !Ty.isVector() &&
820                    shouldWidenLoadResult(Query);
821           }, widenScalarOrEltToNextPow2(0))
822         .narrowScalarIf(
823             [=](const LegalityQuery &Query) -> bool {
824               return !Query.Types[0].isVector() &&
825                      needToSplitMemOp(Query, Op == G_LOAD);
826             },
827             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
828               const LLT DstTy = Query.Types[0];
829               const LLT PtrTy = Query.Types[1];
830 
831               const unsigned DstSize = DstTy.getSizeInBits();
832               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
833 
834               // Split extloads.
835               if (DstSize > MemSize)
836                 return std::make_pair(0, LLT::scalar(MemSize));
837 
838               if (!isPowerOf2_32(DstSize)) {
839                 // We're probably decomposing an odd sized store. Try to split
840                 // to the widest type. TODO: Account for alignment. As-is it
841                 // should be OK, since the new parts will be further legalized.
842                 unsigned FloorSize = PowerOf2Floor(DstSize);
843                 return std::make_pair(0, LLT::scalar(FloorSize));
844               }
845 
846               if (DstSize > 32 && (DstSize % 32 != 0)) {
847                 // FIXME: Need a way to specify non-extload of larger size if
848                 // suitably aligned.
849                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
850               }
851 
852               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
853                                                      Op == G_LOAD);
854               if (MemSize > MaxSize)
855                 return std::make_pair(0, LLT::scalar(MaxSize));
856 
857               unsigned Align = Query.MMODescrs[0].AlignInBits;
858               return std::make_pair(0, LLT::scalar(Align));
859             })
860         .fewerElementsIf(
861             [=](const LegalityQuery &Query) -> bool {
862               return Query.Types[0].isVector() &&
863                      needToSplitMemOp(Query, Op == G_LOAD);
864             },
865             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
866               const LLT DstTy = Query.Types[0];
867               const LLT PtrTy = Query.Types[1];
868 
869               LLT EltTy = DstTy.getElementType();
870               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
871                                                      Op == G_LOAD);
872 
873               // FIXME: Handle widened to power of 2 results better. This ends
874               // up scalarizing.
875               // FIXME: 3 element stores scalarized on SI
876 
877               // Split if it's too large for the address space.
878               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
879                 unsigned NumElts = DstTy.getNumElements();
880                 unsigned EltSize = EltTy.getSizeInBits();
881 
882                 if (MaxSize % EltSize == 0) {
883                   return std::make_pair(
884                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
885                 }
886 
887                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
888 
889                 // FIXME: Refine when odd breakdowns handled
890                 // The scalars will need to be re-legalized.
891                 if (NumPieces == 1 || NumPieces >= NumElts ||
892                     NumElts % NumPieces != 0)
893                   return std::make_pair(0, EltTy);
894 
895                 return std::make_pair(0,
896                                       LLT::vector(NumElts / NumPieces, EltTy));
897               }
898 
899               // FIXME: We could probably handle weird extending loads better.
900               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
901               if (DstTy.getSizeInBits() > MemSize)
902                 return std::make_pair(0, EltTy);
903 
904               unsigned EltSize = EltTy.getSizeInBits();
905               unsigned DstSize = DstTy.getSizeInBits();
906               if (!isPowerOf2_32(DstSize)) {
907                 // We're probably decomposing an odd sized store. Try to split
908                 // to the widest type. TODO: Account for alignment. As-is it
909                 // should be OK, since the new parts will be further legalized.
910                 unsigned FloorSize = PowerOf2Floor(DstSize);
911                 return std::make_pair(
912                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
913               }
914 
915               // Need to split because of alignment.
916               unsigned Align = Query.MMODescrs[0].AlignInBits;
917               if (EltSize > Align &&
918                   (EltSize / Align < DstTy.getNumElements())) {
919                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
920               }
921 
922               // May need relegalization for the scalars.
923               return std::make_pair(0, EltTy);
924             })
925         .minScalar(0, S32);
926 
927     if (IsStore)
928       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
929 
930     // TODO: Need a bitcast lower option?
931     Actions
932         .legalIf([=](const LegalityQuery &Query) {
933           const LLT Ty0 = Query.Types[0];
934           unsigned Size = Ty0.getSizeInBits();
935           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
936           unsigned Align = Query.MMODescrs[0].AlignInBits;
937 
938           // FIXME: Widening store from alignment not valid.
939           if (MemSize < Size)
940             MemSize = std::max(MemSize, Align);
941 
942           // No extending vector loads.
943           if (Size > MemSize && Ty0.isVector())
944             return false;
945 
946           switch (MemSize) {
947           case 8:
948           case 16:
949             return Size == 32;
950           case 32:
951           case 64:
952           case 128:
953             return true;
954           case 96:
955             return ST.hasDwordx3LoadStores();
956           case 256:
957           case 512:
958             return true;
959           default:
960             return false;
961           }
962         })
963         .widenScalarToNextPow2(0)
964         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
965   }
966 
967   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
968                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
969                                                   {S32, GlobalPtr, 16, 2 * 8},
970                                                   {S32, LocalPtr, 8, 8},
971                                                   {S32, LocalPtr, 16, 16},
972                                                   {S32, PrivatePtr, 8, 8},
973                                                   {S32, PrivatePtr, 16, 16},
974                                                   {S32, ConstantPtr, 8, 8},
975                                                   {S32, ConstantPtr, 16, 2 * 8}});
976   if (ST.hasFlatAddressSpace()) {
977     ExtLoads.legalForTypesWithMemDesc(
978         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
979   }
980 
981   ExtLoads.clampScalar(0, S32, S32)
982           .widenScalarToNextPow2(0)
983           .unsupportedIfMemSizeNotPow2()
984           .lower();
985 
986   auto &Atomics = getActionDefinitionsBuilder(
987     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
988      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
989      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
990      G_ATOMICRMW_UMIN})
991     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
992                {S64, GlobalPtr}, {S64, LocalPtr}});
993   if (ST.hasFlatAddressSpace()) {
994     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
995   }
996 
997   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
998     .legalFor({{S32, LocalPtr}});
999 
1000   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1001   // demarshalling
1002   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1003     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1004                 {S32, FlatPtr}, {S64, FlatPtr}})
1005     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1006                {S32, RegionPtr}, {S64, RegionPtr}});
1007   // TODO: Pointer types, any 32-bit or 64-bit vector
1008 
1009   // Condition should be s32 for scalar, s1 for vector.
1010   getActionDefinitionsBuilder(G_SELECT)
1011     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1012           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1013           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1014     .clampScalar(0, S16, S64)
1015     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1016     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1017     .scalarize(1)
1018     .clampMaxNumElements(0, S32, 2)
1019     .clampMaxNumElements(0, LocalPtr, 2)
1020     .clampMaxNumElements(0, PrivatePtr, 2)
1021     .scalarize(0)
1022     .widenScalarToNextPow2(0)
1023     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1024 
1025   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1026   // be more flexible with the shift amount type.
1027   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1028     .legalFor({{S32, S32}, {S64, S32}});
1029   if (ST.has16BitInsts()) {
1030     if (ST.hasVOP3PInsts()) {
1031       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1032             .clampMaxNumElements(0, S16, 2);
1033     } else
1034       Shifts.legalFor({{S16, S32}, {S16, S16}});
1035 
1036     // TODO: Support 16-bit shift amounts
1037     Shifts.clampScalar(1, S32, S32);
1038     Shifts.clampScalar(0, S16, S64);
1039     Shifts.widenScalarToNextPow2(0, 16);
1040   } else {
1041     // Make sure we legalize the shift amount type first, as the general
1042     // expansion for the shifted type will produce much worse code if it hasn't
1043     // been truncated already.
1044     Shifts.clampScalar(1, S32, S32);
1045     Shifts.clampScalar(0, S32, S64);
1046     Shifts.widenScalarToNextPow2(0, 32);
1047   }
1048   Shifts.scalarize(0);
1049 
1050   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1051     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1052     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1053     unsigned IdxTypeIdx = 2;
1054 
1055     getActionDefinitionsBuilder(Op)
1056       .customIf([=](const LegalityQuery &Query) {
1057           const LLT EltTy = Query.Types[EltTypeIdx];
1058           const LLT VecTy = Query.Types[VecTypeIdx];
1059           const LLT IdxTy = Query.Types[IdxTypeIdx];
1060           return (EltTy.getSizeInBits() == 16 ||
1061                   EltTy.getSizeInBits() % 32 == 0) &&
1062                  VecTy.getSizeInBits() % 32 == 0 &&
1063                  VecTy.getSizeInBits() <= 1024 &&
1064                  IdxTy.getSizeInBits() == 32;
1065         })
1066       .clampScalar(EltTypeIdx, S32, S64)
1067       .clampScalar(VecTypeIdx, S32, S64)
1068       .clampScalar(IdxTypeIdx, S32, S32);
1069   }
1070 
1071   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1072     .unsupportedIf([=](const LegalityQuery &Query) {
1073         const LLT &EltTy = Query.Types[1].getElementType();
1074         return Query.Types[0] != EltTy;
1075       });
1076 
1077   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1078     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1079     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1080 
1081     // FIXME: Doesn't handle extract of illegal sizes.
1082     getActionDefinitionsBuilder(Op)
1083       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1084       // FIXME: Multiples of 16 should not be legal.
1085       .legalIf([=](const LegalityQuery &Query) {
1086           const LLT BigTy = Query.Types[BigTyIdx];
1087           const LLT LitTy = Query.Types[LitTyIdx];
1088           return (BigTy.getSizeInBits() % 32 == 0) &&
1089                  (LitTy.getSizeInBits() % 16 == 0);
1090         })
1091       .widenScalarIf(
1092         [=](const LegalityQuery &Query) {
1093           const LLT BigTy = Query.Types[BigTyIdx];
1094           return (BigTy.getScalarSizeInBits() < 16);
1095         },
1096         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1097       .widenScalarIf(
1098         [=](const LegalityQuery &Query) {
1099           const LLT LitTy = Query.Types[LitTyIdx];
1100           return (LitTy.getScalarSizeInBits() < 16);
1101         },
1102         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1103       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1104       .widenScalarToNextPow2(BigTyIdx, 32);
1105 
1106   }
1107 
1108   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1109     .legalForCartesianProduct(AllS32Vectors, {S32})
1110     .legalForCartesianProduct(AllS64Vectors, {S64})
1111     .clampNumElements(0, V16S32, V32S32)
1112     .clampNumElements(0, V2S64, V16S64)
1113     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1114 
1115   if (ST.hasScalarPackInsts()) {
1116     BuildVector
1117       // FIXME: Should probably widen s1 vectors straight to s32
1118       .minScalarOrElt(0, S16)
1119       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1120       .minScalar(1, S32);
1121 
1122     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1123       .legalFor({V2S16, S32})
1124       .lower();
1125     BuildVector.minScalarOrElt(0, S32);
1126   } else {
1127     BuildVector.customFor({V2S16, S16});
1128     BuildVector.minScalarOrElt(0, S32);
1129 
1130     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1131       .customFor({V2S16, S32})
1132       .lower();
1133   }
1134 
1135   BuildVector.legalIf(isRegisterType(0));
1136 
1137   // FIXME: Clamp maximum size
1138   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1139     .legalIf(isRegisterType(0));
1140 
1141   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1142   // pre-legalize.
1143   if (ST.hasVOP3PInsts()) {
1144     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1145       .customFor({V2S16, V2S16})
1146       .lower();
1147   } else
1148     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1149 
1150   // Merge/Unmerge
1151   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1152     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1153     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1154 
1155     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1156       const LLT &Ty = Query.Types[TypeIdx];
1157       if (Ty.isVector()) {
1158         const LLT &EltTy = Ty.getElementType();
1159         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1160           return true;
1161         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1162           return true;
1163       }
1164       return false;
1165     };
1166 
1167     auto &Builder = getActionDefinitionsBuilder(Op)
1168       // Try to widen to s16 first for small types.
1169       // TODO: Only do this on targets with legal s16 shifts
1170       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1171 
1172       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1173       .lowerFor({{S16, V2S16}})
1174       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1175       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1176                            elementTypeIs(1, S16)),
1177                        changeTo(1, V2S16))
1178       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1179       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1180       // valid.
1181       .clampScalar(LitTyIdx, S32, S256)
1182       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1183       // Break up vectors with weird elements into scalars
1184       .fewerElementsIf(
1185         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1186         scalarize(0))
1187       .fewerElementsIf(
1188         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1189         scalarize(1))
1190       .clampScalar(BigTyIdx, S32, S1024);
1191 
1192     if (Op == G_MERGE_VALUES) {
1193       Builder.widenScalarIf(
1194         // TODO: Use 16-bit shifts if legal for 8-bit values?
1195         [=](const LegalityQuery &Query) {
1196           const LLT Ty = Query.Types[LitTyIdx];
1197           return Ty.getSizeInBits() < 32;
1198         },
1199         changeTo(LitTyIdx, S32));
1200     }
1201 
1202     Builder.widenScalarIf(
1203       [=](const LegalityQuery &Query) {
1204         const LLT Ty = Query.Types[BigTyIdx];
1205         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1206           Ty.getSizeInBits() % 16 != 0;
1207       },
1208       [=](const LegalityQuery &Query) {
1209         // Pick the next power of 2, or a multiple of 64 over 128.
1210         // Whichever is smaller.
1211         const LLT &Ty = Query.Types[BigTyIdx];
1212         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1213         if (NewSizeInBits >= 256) {
1214           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1215           if (RoundedTo < NewSizeInBits)
1216             NewSizeInBits = RoundedTo;
1217         }
1218         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1219       })
1220       .legalIf([=](const LegalityQuery &Query) {
1221           const LLT &BigTy = Query.Types[BigTyIdx];
1222           const LLT &LitTy = Query.Types[LitTyIdx];
1223 
1224           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1225             return false;
1226           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1227             return false;
1228 
1229           return BigTy.getSizeInBits() % 16 == 0 &&
1230                  LitTy.getSizeInBits() % 16 == 0 &&
1231                  BigTy.getSizeInBits() <= 1024;
1232         })
1233       // Any vectors left are the wrong size. Scalarize them.
1234       .scalarize(0)
1235       .scalarize(1);
1236   }
1237 
1238   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1239   // RegBankSelect.
1240   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1241     .legalFor({{S32}, {S64}});
1242 
1243   if (ST.hasVOP3PInsts()) {
1244     SextInReg.lowerFor({{V2S16}})
1245       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1246       // get more vector shift opportunities, since we'll get those when
1247       // expanded.
1248       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1249   } else if (ST.has16BitInsts()) {
1250     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1251   } else {
1252     // Prefer to promote to s32 before lowering if we don't have 16-bit
1253     // shifts. This avoid a lot of intermediate truncate and extend operations.
1254     SextInReg.lowerFor({{S32}, {S64}});
1255   }
1256 
1257   SextInReg
1258     .scalarize(0)
1259     .clampScalar(0, S32, S64)
1260     .lower();
1261 
1262   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1263     .legalFor({S64});
1264 
1265   getActionDefinitionsBuilder({
1266       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1267       G_FCOPYSIGN,
1268 
1269       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1270       G_READ_REGISTER,
1271       G_WRITE_REGISTER,
1272 
1273       G_SADDO, G_SSUBO,
1274 
1275        // TODO: Implement
1276       G_FMINIMUM, G_FMAXIMUM
1277     }).lower();
1278 
1279   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1280         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1281         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1282     .unsupported();
1283 
1284   computeTables();
1285   verify(*ST.getInstrInfo());
1286 }
1287 
1288 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1289                                          MachineRegisterInfo &MRI,
1290                                          MachineIRBuilder &B,
1291                                          GISelChangeObserver &Observer) const {
1292   switch (MI.getOpcode()) {
1293   case TargetOpcode::G_ADDRSPACE_CAST:
1294     return legalizeAddrSpaceCast(MI, MRI, B);
1295   case TargetOpcode::G_FRINT:
1296     return legalizeFrint(MI, MRI, B);
1297   case TargetOpcode::G_FCEIL:
1298     return legalizeFceil(MI, MRI, B);
1299   case TargetOpcode::G_INTRINSIC_TRUNC:
1300     return legalizeIntrinsicTrunc(MI, MRI, B);
1301   case TargetOpcode::G_SITOFP:
1302     return legalizeITOFP(MI, MRI, B, true);
1303   case TargetOpcode::G_UITOFP:
1304     return legalizeITOFP(MI, MRI, B, false);
1305   case TargetOpcode::G_FPTOSI:
1306     return legalizeFPTOI(MI, MRI, B, true);
1307   case TargetOpcode::G_FPTOUI:
1308     return legalizeFPTOI(MI, MRI, B, false);
1309   case TargetOpcode::G_FMINNUM:
1310   case TargetOpcode::G_FMAXNUM:
1311   case TargetOpcode::G_FMINNUM_IEEE:
1312   case TargetOpcode::G_FMAXNUM_IEEE:
1313     return legalizeMinNumMaxNum(MI, MRI, B);
1314   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1315     return legalizeExtractVectorElt(MI, MRI, B);
1316   case TargetOpcode::G_INSERT_VECTOR_ELT:
1317     return legalizeInsertVectorElt(MI, MRI, B);
1318   case TargetOpcode::G_SHUFFLE_VECTOR:
1319     return legalizeShuffleVector(MI, MRI, B);
1320   case TargetOpcode::G_FSIN:
1321   case TargetOpcode::G_FCOS:
1322     return legalizeSinCos(MI, MRI, B);
1323   case TargetOpcode::G_GLOBAL_VALUE:
1324     return legalizeGlobalValue(MI, MRI, B);
1325   case TargetOpcode::G_LOAD:
1326     return legalizeLoad(MI, MRI, B, Observer);
1327   case TargetOpcode::G_FMAD:
1328     return legalizeFMad(MI, MRI, B);
1329   case TargetOpcode::G_FDIV:
1330     return legalizeFDIV(MI, MRI, B);
1331   case TargetOpcode::G_ATOMIC_CMPXCHG:
1332     return legalizeAtomicCmpXChg(MI, MRI, B);
1333   case TargetOpcode::G_FLOG:
1334     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1335   case TargetOpcode::G_FLOG10:
1336     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1337   case TargetOpcode::G_FEXP:
1338     return legalizeFExp(MI, B);
1339   case TargetOpcode::G_FFLOOR:
1340     return legalizeFFloor(MI, MRI, B);
1341   case TargetOpcode::G_BUILD_VECTOR:
1342     return legalizeBuildVector(MI, MRI, B);
1343   default:
1344     return false;
1345   }
1346 
1347   llvm_unreachable("expected switch to return");
1348 }
1349 
1350 Register AMDGPULegalizerInfo::getSegmentAperture(
1351   unsigned AS,
1352   MachineRegisterInfo &MRI,
1353   MachineIRBuilder &B) const {
1354   MachineFunction &MF = B.getMF();
1355   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1356   const LLT S32 = LLT::scalar(32);
1357 
1358   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1359 
1360   if (ST.hasApertureRegs()) {
1361     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1362     // getreg.
1363     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1364         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1365         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1366     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1367         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1368         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1369     unsigned Encoding =
1370         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1371         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1372         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1373 
1374     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1375 
1376     B.buildInstr(AMDGPU::S_GETREG_B32)
1377       .addDef(GetReg)
1378       .addImm(Encoding);
1379     MRI.setType(GetReg, S32);
1380 
1381     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1382     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1383   }
1384 
1385   Register QueuePtr = MRI.createGenericVirtualRegister(
1386     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1387 
1388   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1389   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1390     return Register();
1391 
1392   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1393   // private_segment_aperture_base_hi.
1394   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1395 
1396   // TODO: can we be smarter about machine pointer info?
1397   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1398   MachineMemOperand *MMO = MF.getMachineMemOperand(
1399     PtrInfo,
1400     MachineMemOperand::MOLoad |
1401     MachineMemOperand::MODereferenceable |
1402     MachineMemOperand::MOInvariant,
1403     4,
1404     MinAlign(64, StructOffset));
1405 
1406   Register LoadAddr;
1407 
1408   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1409   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1410 }
1411 
1412 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1413   MachineInstr &MI, MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B) const {
1415   MachineFunction &MF = B.getMF();
1416 
1417   B.setInstr(MI);
1418 
1419   const LLT S32 = LLT::scalar(32);
1420   Register Dst = MI.getOperand(0).getReg();
1421   Register Src = MI.getOperand(1).getReg();
1422 
1423   LLT DstTy = MRI.getType(Dst);
1424   LLT SrcTy = MRI.getType(Src);
1425   unsigned DestAS = DstTy.getAddressSpace();
1426   unsigned SrcAS = SrcTy.getAddressSpace();
1427 
1428   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1429   // vector element.
1430   assert(!DstTy.isVector());
1431 
1432   const AMDGPUTargetMachine &TM
1433     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1434 
1435   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1436   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1437     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1438     return true;
1439   }
1440 
1441   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1442     // Truncate.
1443     B.buildExtract(Dst, Src, 0);
1444     MI.eraseFromParent();
1445     return true;
1446   }
1447 
1448   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1449     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1450     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1451 
1452     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1453     // another. Merge operands are required to be the same type, but creating an
1454     // extra ptrtoint would be kind of pointless.
1455     auto HighAddr = B.buildConstant(
1456       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1457     B.buildMerge(Dst, {Src, HighAddr});
1458     MI.eraseFromParent();
1459     return true;
1460   }
1461 
1462   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1463     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1464            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1465     unsigned NullVal = TM.getNullPointerValue(DestAS);
1466 
1467     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1468     auto FlatNull = B.buildConstant(SrcTy, 0);
1469 
1470     // Extract low 32-bits of the pointer.
1471     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1472 
1473     auto CmpRes =
1474         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1475     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1476 
1477     MI.eraseFromParent();
1478     return true;
1479   }
1480 
1481   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1482     return false;
1483 
1484   if (!ST.hasFlatAddressSpace())
1485     return false;
1486 
1487   auto SegmentNull =
1488       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1489   auto FlatNull =
1490       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1491 
1492   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1493   if (!ApertureReg.isValid())
1494     return false;
1495 
1496   auto CmpRes =
1497       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1498 
1499   // Coerce the type of the low half of the result so we can use merge_values.
1500   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1501 
1502   // TODO: Should we allow mismatched types but matching sizes in merges to
1503   // avoid the ptrtoint?
1504   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1505   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1506 
1507   MI.eraseFromParent();
1508   return true;
1509 }
1510 
1511 bool AMDGPULegalizerInfo::legalizeFrint(
1512   MachineInstr &MI, MachineRegisterInfo &MRI,
1513   MachineIRBuilder &B) const {
1514   B.setInstr(MI);
1515 
1516   Register Src = MI.getOperand(1).getReg();
1517   LLT Ty = MRI.getType(Src);
1518   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1519 
1520   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1521   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1522 
1523   auto C1 = B.buildFConstant(Ty, C1Val);
1524   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1525 
1526   // TODO: Should this propagate fast-math-flags?
1527   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1528   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1529 
1530   auto C2 = B.buildFConstant(Ty, C2Val);
1531   auto Fabs = B.buildFAbs(Ty, Src);
1532 
1533   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1534   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1535   return true;
1536 }
1537 
1538 bool AMDGPULegalizerInfo::legalizeFceil(
1539   MachineInstr &MI, MachineRegisterInfo &MRI,
1540   MachineIRBuilder &B) const {
1541   B.setInstr(MI);
1542 
1543   const LLT S1 = LLT::scalar(1);
1544   const LLT S64 = LLT::scalar(64);
1545 
1546   Register Src = MI.getOperand(1).getReg();
1547   assert(MRI.getType(Src) == S64);
1548 
1549   // result = trunc(src)
1550   // if (src > 0.0 && src != result)
1551   //   result += 1.0
1552 
1553   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1554 
1555   const auto Zero = B.buildFConstant(S64, 0.0);
1556   const auto One = B.buildFConstant(S64, 1.0);
1557   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1558   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1559   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1560   auto Add = B.buildSelect(S64, And, One, Zero);
1561 
1562   // TODO: Should this propagate fast-math-flags?
1563   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1564   return true;
1565 }
1566 
1567 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1568                                               MachineIRBuilder &B) {
1569   const unsigned FractBits = 52;
1570   const unsigned ExpBits = 11;
1571   LLT S32 = LLT::scalar(32);
1572 
1573   auto Const0 = B.buildConstant(S32, FractBits - 32);
1574   auto Const1 = B.buildConstant(S32, ExpBits);
1575 
1576   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1577     .addUse(Const0.getReg(0))
1578     .addUse(Const1.getReg(0));
1579 
1580   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1581 }
1582 
1583 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1584   MachineInstr &MI, MachineRegisterInfo &MRI,
1585   MachineIRBuilder &B) const {
1586   B.setInstr(MI);
1587 
1588   const LLT S1 = LLT::scalar(1);
1589   const LLT S32 = LLT::scalar(32);
1590   const LLT S64 = LLT::scalar(64);
1591 
1592   Register Src = MI.getOperand(1).getReg();
1593   assert(MRI.getType(Src) == S64);
1594 
1595   // TODO: Should this use extract since the low half is unused?
1596   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1597   Register Hi = Unmerge.getReg(1);
1598 
1599   // Extract the upper half, since this is where we will find the sign and
1600   // exponent.
1601   auto Exp = extractF64Exponent(Hi, B);
1602 
1603   const unsigned FractBits = 52;
1604 
1605   // Extract the sign bit.
1606   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1607   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1608 
1609   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1610 
1611   const auto Zero32 = B.buildConstant(S32, 0);
1612 
1613   // Extend back to 64-bits.
1614   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1615 
1616   auto Shr = B.buildAShr(S64, FractMask, Exp);
1617   auto Not = B.buildNot(S64, Shr);
1618   auto Tmp0 = B.buildAnd(S64, Src, Not);
1619   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1620 
1621   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1622   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1623 
1624   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1625   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1626   return true;
1627 }
1628 
1629 bool AMDGPULegalizerInfo::legalizeITOFP(
1630   MachineInstr &MI, MachineRegisterInfo &MRI,
1631   MachineIRBuilder &B, bool Signed) const {
1632   B.setInstr(MI);
1633 
1634   Register Dst = MI.getOperand(0).getReg();
1635   Register Src = MI.getOperand(1).getReg();
1636 
1637   const LLT S64 = LLT::scalar(64);
1638   const LLT S32 = LLT::scalar(32);
1639 
1640   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1641 
1642   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1643 
1644   auto CvtHi = Signed ?
1645     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1646     B.buildUITOFP(S64, Unmerge.getReg(1));
1647 
1648   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1649 
1650   auto ThirtyTwo = B.buildConstant(S32, 32);
1651   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1652     .addUse(CvtHi.getReg(0))
1653     .addUse(ThirtyTwo.getReg(0));
1654 
1655   // TODO: Should this propagate fast-math-flags?
1656   B.buildFAdd(Dst, LdExp, CvtLo);
1657   MI.eraseFromParent();
1658   return true;
1659 }
1660 
1661 // TODO: Copied from DAG implementation. Verify logic and document how this
1662 // actually works.
1663 bool AMDGPULegalizerInfo::legalizeFPTOI(
1664   MachineInstr &MI, MachineRegisterInfo &MRI,
1665   MachineIRBuilder &B, bool Signed) const {
1666   B.setInstr(MI);
1667 
1668   Register Dst = MI.getOperand(0).getReg();
1669   Register Src = MI.getOperand(1).getReg();
1670 
1671   const LLT S64 = LLT::scalar(64);
1672   const LLT S32 = LLT::scalar(32);
1673 
1674   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1675 
1676   unsigned Flags = MI.getFlags();
1677 
1678   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1679   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1680   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1681 
1682   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1683   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1684   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1685 
1686   auto Hi = Signed ?
1687     B.buildFPTOSI(S32, FloorMul) :
1688     B.buildFPTOUI(S32, FloorMul);
1689   auto Lo = B.buildFPTOUI(S32, Fma);
1690 
1691   B.buildMerge(Dst, { Lo, Hi });
1692   MI.eraseFromParent();
1693 
1694   return true;
1695 }
1696 
1697 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1698   MachineInstr &MI, MachineRegisterInfo &MRI,
1699   MachineIRBuilder &B) const {
1700   MachineFunction &MF = B.getMF();
1701   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1702 
1703   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1704                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1705 
1706   // With ieee_mode disabled, the instructions have the correct behavior
1707   // already for G_FMINNUM/G_FMAXNUM
1708   if (!MFI->getMode().IEEE)
1709     return !IsIEEEOp;
1710 
1711   if (IsIEEEOp)
1712     return true;
1713 
1714   MachineIRBuilder HelperBuilder(MI);
1715   GISelObserverWrapper DummyObserver;
1716   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1717   HelperBuilder.setInstr(MI);
1718   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1719 }
1720 
1721 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1722   MachineInstr &MI, MachineRegisterInfo &MRI,
1723   MachineIRBuilder &B) const {
1724   // TODO: Should move some of this into LegalizerHelper.
1725 
1726   // TODO: Promote dynamic indexing of s16 to s32
1727 
1728   // FIXME: Artifact combiner probably should have replaced the truncated
1729   // constant before this, so we shouldn't need
1730   // getConstantVRegValWithLookThrough.
1731   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1732     MI.getOperand(2).getReg(), MRI);
1733   if (!IdxVal) // Dynamic case will be selected to register indexing.
1734     return true;
1735 
1736   Register Dst = MI.getOperand(0).getReg();
1737   Register Vec = MI.getOperand(1).getReg();
1738 
1739   LLT VecTy = MRI.getType(Vec);
1740   LLT EltTy = VecTy.getElementType();
1741   assert(EltTy == MRI.getType(Dst));
1742 
1743   B.setInstr(MI);
1744 
1745   if (IdxVal->Value < VecTy.getNumElements())
1746     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1747   else
1748     B.buildUndef(Dst);
1749 
1750   MI.eraseFromParent();
1751   return true;
1752 }
1753 
1754 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1755   MachineInstr &MI, MachineRegisterInfo &MRI,
1756   MachineIRBuilder &B) const {
1757   // TODO: Should move some of this into LegalizerHelper.
1758 
1759   // TODO: Promote dynamic indexing of s16 to s32
1760 
1761   // FIXME: Artifact combiner probably should have replaced the truncated
1762   // constant before this, so we shouldn't need
1763   // getConstantVRegValWithLookThrough.
1764   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1765     MI.getOperand(3).getReg(), MRI);
1766   if (!IdxVal) // Dynamic case will be selected to register indexing.
1767     return true;
1768 
1769   Register Dst = MI.getOperand(0).getReg();
1770   Register Vec = MI.getOperand(1).getReg();
1771   Register Ins = MI.getOperand(2).getReg();
1772 
1773   LLT VecTy = MRI.getType(Vec);
1774   LLT EltTy = VecTy.getElementType();
1775   assert(EltTy == MRI.getType(Ins));
1776 
1777   B.setInstr(MI);
1778 
1779   if (IdxVal->Value < VecTy.getNumElements())
1780     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1781   else
1782     B.buildUndef(Dst);
1783 
1784   MI.eraseFromParent();
1785   return true;
1786 }
1787 
1788 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1789   assert(Mask.size() == 2);
1790 
1791   // If one half is undef, the other is trivially in the same reg.
1792   if (Mask[0] == -1 || Mask[1] == -1)
1793     return true;
1794   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1795          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1796 }
1797 
1798 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1799   MachineInstr &MI, MachineRegisterInfo &MRI,
1800   MachineIRBuilder &B) const {
1801   const LLT V2S16 = LLT::vector(2, 16);
1802 
1803   Register Dst = MI.getOperand(0).getReg();
1804   Register Src0 = MI.getOperand(1).getReg();
1805   LLT DstTy = MRI.getType(Dst);
1806   LLT SrcTy = MRI.getType(Src0);
1807 
1808   if (SrcTy == V2S16 && DstTy == V2S16 &&
1809       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1810     return true;
1811 
1812   MachineIRBuilder HelperBuilder(MI);
1813   GISelObserverWrapper DummyObserver;
1814   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1815   HelperBuilder.setInstr(MI);
1816   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1817 }
1818 
1819 bool AMDGPULegalizerInfo::legalizeSinCos(
1820   MachineInstr &MI, MachineRegisterInfo &MRI,
1821   MachineIRBuilder &B) const {
1822   B.setInstr(MI);
1823 
1824   Register DstReg = MI.getOperand(0).getReg();
1825   Register SrcReg = MI.getOperand(1).getReg();
1826   LLT Ty = MRI.getType(DstReg);
1827   unsigned Flags = MI.getFlags();
1828 
1829   Register TrigVal;
1830   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1831   if (ST.hasTrigReducedRange()) {
1832     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1833     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1834       .addUse(MulVal.getReg(0))
1835       .setMIFlags(Flags).getReg(0);
1836   } else
1837     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1838 
1839   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1840     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1841   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1842     .addUse(TrigVal)
1843     .setMIFlags(Flags);
1844   MI.eraseFromParent();
1845   return true;
1846 }
1847 
1848 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1849   Register DstReg, LLT PtrTy,
1850   MachineIRBuilder &B, const GlobalValue *GV,
1851   unsigned Offset, unsigned GAFlags) const {
1852   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1853   // to the following code sequence:
1854   //
1855   // For constant address space:
1856   //   s_getpc_b64 s[0:1]
1857   //   s_add_u32 s0, s0, $symbol
1858   //   s_addc_u32 s1, s1, 0
1859   //
1860   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1861   //   a fixup or relocation is emitted to replace $symbol with a literal
1862   //   constant, which is a pc-relative offset from the encoding of the $symbol
1863   //   operand to the global variable.
1864   //
1865   // For global address space:
1866   //   s_getpc_b64 s[0:1]
1867   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1868   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1869   //
1870   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1871   //   fixups or relocations are emitted to replace $symbol@*@lo and
1872   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1873   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1874   //   operand to the global variable.
1875   //
1876   // What we want here is an offset from the value returned by s_getpc
1877   // (which is the address of the s_add_u32 instruction) to the global
1878   // variable, but since the encoding of $symbol starts 4 bytes after the start
1879   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1880   // small. This requires us to add 4 to the global variable offset in order to
1881   // compute the correct address.
1882 
1883   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1884 
1885   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1886     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1887 
1888   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1889     .addDef(PCReg);
1890 
1891   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1892   if (GAFlags == SIInstrInfo::MO_NONE)
1893     MIB.addImm(0);
1894   else
1895     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1896 
1897   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1898 
1899   if (PtrTy.getSizeInBits() == 32)
1900     B.buildExtract(DstReg, PCReg, 0);
1901   return true;
1902  }
1903 
1904 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B) const {
1907   Register DstReg = MI.getOperand(0).getReg();
1908   LLT Ty = MRI.getType(DstReg);
1909   unsigned AS = Ty.getAddressSpace();
1910 
1911   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1912   MachineFunction &MF = B.getMF();
1913   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1914   B.setInstr(MI);
1915 
1916   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1917     if (!MFI->isEntryFunction()) {
1918       const Function &Fn = MF.getFunction();
1919       DiagnosticInfoUnsupported BadLDSDecl(
1920         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1921       Fn.getContext().diagnose(BadLDSDecl);
1922     }
1923 
1924     // TODO: We could emit code to handle the initialization somewhere.
1925     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1926       const SITargetLowering *TLI = ST.getTargetLowering();
1927       if (!TLI->shouldUseLDSConstAddress(GV)) {
1928         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1929         return true; // Leave in place;
1930       }
1931 
1932       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1933       MI.eraseFromParent();
1934       return true;
1935     }
1936 
1937     const Function &Fn = MF.getFunction();
1938     DiagnosticInfoUnsupported BadInit(
1939       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1940     Fn.getContext().diagnose(BadInit);
1941     return true;
1942   }
1943 
1944   const SITargetLowering *TLI = ST.getTargetLowering();
1945 
1946   if (TLI->shouldEmitFixup(GV)) {
1947     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1948     MI.eraseFromParent();
1949     return true;
1950   }
1951 
1952   if (TLI->shouldEmitPCReloc(GV)) {
1953     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1954     MI.eraseFromParent();
1955     return true;
1956   }
1957 
1958   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1959   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1960 
1961   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1962     MachinePointerInfo::getGOT(MF),
1963     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1964     MachineMemOperand::MOInvariant,
1965     8 /*Size*/, 8 /*Align*/);
1966 
1967   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1968 
1969   if (Ty.getSizeInBits() == 32) {
1970     // Truncate if this is a 32-bit constant adrdess.
1971     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1972     B.buildExtract(DstReg, Load, 0);
1973   } else
1974     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1975 
1976   MI.eraseFromParent();
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeLoad(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1983   B.setInstr(MI);
1984   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1985   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1986   Observer.changingInstr(MI);
1987   MI.getOperand(1).setReg(Cast.getReg(0));
1988   Observer.changedInstr(MI);
1989   return true;
1990 }
1991 
1992 bool AMDGPULegalizerInfo::legalizeFMad(
1993   MachineInstr &MI, MachineRegisterInfo &MRI,
1994   MachineIRBuilder &B) const {
1995   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1996   assert(Ty.isScalar());
1997 
1998   MachineFunction &MF = B.getMF();
1999   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2000 
2001   // TODO: Always legal with future ftz flag.
2002   // FIXME: Do we need just output?
2003   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2004     return true;
2005   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2006     return true;
2007 
2008   MachineIRBuilder HelperBuilder(MI);
2009   GISelObserverWrapper DummyObserver;
2010   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2011   HelperBuilder.setMBB(*MI.getParent());
2012   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2013 }
2014 
2015 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2016   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2017   Register DstReg = MI.getOperand(0).getReg();
2018   Register PtrReg = MI.getOperand(1).getReg();
2019   Register CmpVal = MI.getOperand(2).getReg();
2020   Register NewVal = MI.getOperand(3).getReg();
2021 
2022   assert(SITargetLowering::isFlatGlobalAddrSpace(
2023            MRI.getType(PtrReg).getAddressSpace()) &&
2024          "this should not have been custom lowered");
2025 
2026   LLT ValTy = MRI.getType(CmpVal);
2027   LLT VecTy = LLT::vector(2, ValTy);
2028 
2029   B.setInstr(MI);
2030   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2031 
2032   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2033     .addDef(DstReg)
2034     .addUse(PtrReg)
2035     .addUse(PackedVal)
2036     .setMemRefs(MI.memoperands());
2037 
2038   MI.eraseFromParent();
2039   return true;
2040 }
2041 
2042 bool AMDGPULegalizerInfo::legalizeFlog(
2043   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2044   Register Dst = MI.getOperand(0).getReg();
2045   Register Src = MI.getOperand(1).getReg();
2046   LLT Ty = B.getMRI()->getType(Dst);
2047   unsigned Flags = MI.getFlags();
2048   B.setInstr(MI);
2049 
2050   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2051   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2052 
2053   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2054   MI.eraseFromParent();
2055   return true;
2056 }
2057 
2058 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2059                                        MachineIRBuilder &B) const {
2060   Register Dst = MI.getOperand(0).getReg();
2061   Register Src = MI.getOperand(1).getReg();
2062   unsigned Flags = MI.getFlags();
2063   LLT Ty = B.getMRI()->getType(Dst);
2064   B.setInstr(MI);
2065 
2066   auto K = B.buildFConstant(Ty, numbers::log2e);
2067   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2068   B.buildFExp2(Dst, Mul, Flags);
2069   MI.eraseFromParent();
2070   return true;
2071 }
2072 
2073 // Find a source register, ignoring any possible source modifiers.
2074 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2075   Register ModSrc = OrigSrc;
2076   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2077     ModSrc = SrcFNeg->getOperand(1).getReg();
2078     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2079       ModSrc = SrcFAbs->getOperand(1).getReg();
2080   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2081     ModSrc = SrcFAbs->getOperand(1).getReg();
2082   return ModSrc;
2083 }
2084 
2085 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2086                                          MachineRegisterInfo &MRI,
2087                                          MachineIRBuilder &B) const {
2088   B.setInstr(MI);
2089 
2090   const LLT S1 = LLT::scalar(1);
2091   const LLT S64 = LLT::scalar(64);
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register OrigSrc = MI.getOperand(1).getReg();
2094   unsigned Flags = MI.getFlags();
2095   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2096          "this should not have been custom lowered");
2097 
2098   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2099   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2100   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2101   // V_FRACT bug is:
2102   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2103   //
2104   // Convert floor(x) to (x - fract(x))
2105 
2106   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2107     .addUse(OrigSrc)
2108     .setMIFlags(Flags);
2109 
2110   // Give source modifier matching some assistance before obscuring a foldable
2111   // pattern.
2112 
2113   // TODO: We can avoid the neg on the fract? The input sign to fract
2114   // shouldn't matter?
2115   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2116 
2117   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2118 
2119   Register Min = MRI.createGenericVirtualRegister(S64);
2120 
2121   // We don't need to concern ourselves with the snan handling difference, so
2122   // use the one which will directly select.
2123   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2124   if (MFI->getMode().IEEE)
2125     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2126   else
2127     B.buildFMinNum(Min, Fract, Const, Flags);
2128 
2129   Register CorrectedFract = Min;
2130   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2131     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2132     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2133   }
2134 
2135   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2136   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2137 
2138   MI.eraseFromParent();
2139   return true;
2140 }
2141 
2142 // Turn an illegal packed v2s16 build vector into bit operations.
2143 // TODO: This should probably be a bitcast action in LegalizerHelper.
2144 bool AMDGPULegalizerInfo::legalizeBuildVector(
2145   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2146   Register Dst = MI.getOperand(0).getReg();
2147   LLT DstTy = MRI.getType(Dst);
2148   const LLT S32 = LLT::scalar(32);
2149   const LLT V2S16 = LLT::vector(2, 16);
2150   (void)DstTy;
2151   (void)V2S16;
2152   assert(DstTy == V2S16);
2153 
2154   Register Src0 = MI.getOperand(1).getReg();
2155   Register Src1 = MI.getOperand(2).getReg();
2156   assert(MRI.getType(Src0) == LLT::scalar(16));
2157 
2158   B.setInstr(MI);
2159   auto Merge = B.buildMerge(S32, {Src0, Src1});
2160   B.buildBitcast(Dst, Merge);
2161 
2162   MI.eraseFromParent();
2163   return true;
2164 }
2165 
2166 // Return the use branch instruction, otherwise null if the usage is invalid.
2167 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2168                                        MachineRegisterInfo &MRI,
2169                                        MachineInstr *&Br) {
2170   Register CondDef = MI.getOperand(0).getReg();
2171   if (!MRI.hasOneNonDBGUse(CondDef))
2172     return nullptr;
2173 
2174   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2175   if (UseMI.getParent() != MI.getParent() ||
2176       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2177     return nullptr;
2178 
2179   // Make sure the cond br is followed by a G_BR
2180   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2181   if (Next != MI.getParent()->end()) {
2182     if (Next->getOpcode() != AMDGPU::G_BR)
2183       return nullptr;
2184     Br = &*Next;
2185   }
2186 
2187   return &UseMI;
2188 }
2189 
2190 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2191                                                 Register Reg, LLT Ty) const {
2192   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2193   if (LiveIn)
2194     return LiveIn;
2195 
2196   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2197   MRI.addLiveIn(Reg, NewReg);
2198   return NewReg;
2199 }
2200 
2201 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2202                                          const ArgDescriptor *Arg) const {
2203   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2204     return false; // TODO: Handle these
2205 
2206   assert(Arg->getRegister().isPhysical());
2207 
2208   MachineRegisterInfo &MRI = *B.getMRI();
2209 
2210   LLT Ty = MRI.getType(DstReg);
2211   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2212 
2213   if (Arg->isMasked()) {
2214     // TODO: Should we try to emit this once in the entry block?
2215     const LLT S32 = LLT::scalar(32);
2216     const unsigned Mask = Arg->getMask();
2217     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2218 
2219     Register AndMaskSrc = LiveIn;
2220 
2221     if (Shift != 0) {
2222       auto ShiftAmt = B.buildConstant(S32, Shift);
2223       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2224     }
2225 
2226     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2227   } else
2228     B.buildCopy(DstReg, LiveIn);
2229 
2230   // Insert the argument copy if it doens't already exist.
2231   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2232   if (!MRI.getVRegDef(LiveIn)) {
2233     // FIXME: Should have scoped insert pt
2234     MachineBasicBlock &OrigInsBB = B.getMBB();
2235     auto OrigInsPt = B.getInsertPt();
2236 
2237     MachineBasicBlock &EntryMBB = B.getMF().front();
2238     EntryMBB.addLiveIn(Arg->getRegister());
2239     B.setInsertPt(EntryMBB, EntryMBB.begin());
2240     B.buildCopy(LiveIn, Arg->getRegister());
2241 
2242     B.setInsertPt(OrigInsBB, OrigInsPt);
2243   }
2244 
2245   return true;
2246 }
2247 
2248 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2249   MachineInstr &MI,
2250   MachineRegisterInfo &MRI,
2251   MachineIRBuilder &B,
2252   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2253   B.setInstr(MI);
2254 
2255   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2256 
2257   const ArgDescriptor *Arg;
2258   const TargetRegisterClass *RC;
2259   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2260   if (!Arg) {
2261     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2262     return false;
2263   }
2264 
2265   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2266     MI.eraseFromParent();
2267     return true;
2268   }
2269 
2270   return false;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2274                                        MachineRegisterInfo &MRI,
2275                                        MachineIRBuilder &B) const {
2276   B.setInstr(MI);
2277   Register Dst = MI.getOperand(0).getReg();
2278   LLT DstTy = MRI.getType(Dst);
2279   LLT S16 = LLT::scalar(16);
2280   LLT S32 = LLT::scalar(32);
2281   LLT S64 = LLT::scalar(64);
2282 
2283   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2284     return true;
2285 
2286   if (DstTy == S16)
2287     return legalizeFDIV16(MI, MRI, B);
2288   if (DstTy == S32)
2289     return legalizeFDIV32(MI, MRI, B);
2290   if (DstTy == S64)
2291     return legalizeFDIV64(MI, MRI, B);
2292 
2293   return false;
2294 }
2295 
2296 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2297                                                  MachineRegisterInfo &MRI,
2298                                                  MachineIRBuilder &B) const {
2299   Register Res = MI.getOperand(0).getReg();
2300   Register LHS = MI.getOperand(1).getReg();
2301   Register RHS = MI.getOperand(2).getReg();
2302 
2303   uint16_t Flags = MI.getFlags();
2304 
2305   LLT ResTy = MRI.getType(Res);
2306   LLT S32 = LLT::scalar(32);
2307   LLT S64 = LLT::scalar(64);
2308 
2309   const MachineFunction &MF = B.getMF();
2310   bool Unsafe =
2311     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2312 
2313   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2314     return false;
2315 
2316   if (!Unsafe && ResTy == S32 &&
2317       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2318     return false;
2319 
2320   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2321     // 1 / x -> RCP(x)
2322     if (CLHS->isExactlyValue(1.0)) {
2323       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2324         .addUse(RHS)
2325         .setMIFlags(Flags);
2326 
2327       MI.eraseFromParent();
2328       return true;
2329     }
2330 
2331     // -1 / x -> RCP( FNEG(x) )
2332     if (CLHS->isExactlyValue(-1.0)) {
2333       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2334       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2335         .addUse(FNeg.getReg(0))
2336         .setMIFlags(Flags);
2337 
2338       MI.eraseFromParent();
2339       return true;
2340     }
2341   }
2342 
2343   // x / y -> x * (1.0 / y)
2344   if (Unsafe) {
2345     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2346       .addUse(RHS)
2347       .setMIFlags(Flags);
2348     B.buildFMul(Res, LHS, RCP, Flags);
2349 
2350     MI.eraseFromParent();
2351     return true;
2352   }
2353 
2354   return false;
2355 }
2356 
2357 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2358                                          MachineRegisterInfo &MRI,
2359                                          MachineIRBuilder &B) const {
2360   B.setInstr(MI);
2361   Register Res = MI.getOperand(0).getReg();
2362   Register LHS = MI.getOperand(1).getReg();
2363   Register RHS = MI.getOperand(2).getReg();
2364 
2365   uint16_t Flags = MI.getFlags();
2366 
2367   LLT S16 = LLT::scalar(16);
2368   LLT S32 = LLT::scalar(32);
2369 
2370   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2371   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2372 
2373   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2374     .addUse(RHSExt.getReg(0))
2375     .setMIFlags(Flags);
2376 
2377   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2378   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2379 
2380   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2381     .addUse(RDst.getReg(0))
2382     .addUse(RHS)
2383     .addUse(LHS)
2384     .setMIFlags(Flags);
2385 
2386   MI.eraseFromParent();
2387   return true;
2388 }
2389 
2390 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2391 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2392 static void toggleSPDenormMode(bool Enable,
2393                                MachineIRBuilder &B,
2394                                const GCNSubtarget &ST,
2395                                AMDGPU::SIModeRegisterDefaults Mode) {
2396   // Set SP denorm mode to this value.
2397   unsigned SPDenormMode =
2398     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2399 
2400   if (ST.hasDenormModeInst()) {
2401     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2402     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2403 
2404     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2405     B.buildInstr(AMDGPU::S_DENORM_MODE)
2406       .addImm(NewDenormModeValue);
2407 
2408   } else {
2409     // Select FP32 bit field in mode register.
2410     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2411                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2412                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2413 
2414     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2415       .addImm(SPDenormMode)
2416       .addImm(SPDenormModeBitField);
2417   }
2418 }
2419 
2420 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2421                                          MachineRegisterInfo &MRI,
2422                                          MachineIRBuilder &B) const {
2423   B.setInstr(MI);
2424   Register Res = MI.getOperand(0).getReg();
2425   Register LHS = MI.getOperand(1).getReg();
2426   Register RHS = MI.getOperand(2).getReg();
2427   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2428   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2429 
2430   uint16_t Flags = MI.getFlags();
2431 
2432   LLT S32 = LLT::scalar(32);
2433   LLT S1 = LLT::scalar(1);
2434 
2435   auto One = B.buildFConstant(S32, 1.0f);
2436 
2437   auto DenominatorScaled =
2438     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2439       .addUse(RHS)
2440       .addUse(LHS)
2441       .addImm(1)
2442       .setMIFlags(Flags);
2443   auto NumeratorScaled =
2444     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2445       .addUse(LHS)
2446       .addUse(RHS)
2447       .addImm(0)
2448       .setMIFlags(Flags);
2449 
2450   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2451     .addUse(DenominatorScaled.getReg(0))
2452     .setMIFlags(Flags);
2453   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2454 
2455   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2456   // aren't modeled as reading it.
2457   if (!Mode.allFP32Denormals())
2458     toggleSPDenormMode(true, B, ST, Mode);
2459 
2460   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2461   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2462   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2463   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2464   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2465   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2466 
2467   if (!Mode.allFP32Denormals())
2468     toggleSPDenormMode(false, B, ST, Mode);
2469 
2470   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2471     .addUse(Fma4.getReg(0))
2472     .addUse(Fma1.getReg(0))
2473     .addUse(Fma3.getReg(0))
2474     .addUse(NumeratorScaled.getReg(1))
2475     .setMIFlags(Flags);
2476 
2477   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2478     .addUse(Fmas.getReg(0))
2479     .addUse(RHS)
2480     .addUse(LHS)
2481     .setMIFlags(Flags);
2482 
2483   MI.eraseFromParent();
2484   return true;
2485 }
2486 
2487 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2488                                          MachineRegisterInfo &MRI,
2489                                          MachineIRBuilder &B) const {
2490   B.setInstr(MI);
2491   Register Res = MI.getOperand(0).getReg();
2492   Register LHS = MI.getOperand(1).getReg();
2493   Register RHS = MI.getOperand(2).getReg();
2494 
2495   uint16_t Flags = MI.getFlags();
2496 
2497   LLT S64 = LLT::scalar(64);
2498   LLT S1 = LLT::scalar(1);
2499 
2500   auto One = B.buildFConstant(S64, 1.0);
2501 
2502   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2503     .addUse(LHS)
2504     .addUse(RHS)
2505     .addImm(1)
2506     .setMIFlags(Flags);
2507 
2508   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2509 
2510   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2511     .addUse(DivScale0.getReg(0))
2512     .setMIFlags(Flags);
2513 
2514   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2515   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2516   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2517 
2518   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2519     .addUse(LHS)
2520     .addUse(RHS)
2521     .addImm(0)
2522     .setMIFlags(Flags);
2523 
2524   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2525   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2526   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2527 
2528   Register Scale;
2529   if (!ST.hasUsableDivScaleConditionOutput()) {
2530     // Workaround a hardware bug on SI where the condition output from div_scale
2531     // is not usable.
2532 
2533     LLT S32 = LLT::scalar(32);
2534 
2535     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2536     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2537     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2538     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2539 
2540     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2541                               Scale1Unmerge.getReg(1));
2542     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2543                               Scale0Unmerge.getReg(1));
2544     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2545   } else {
2546     Scale = DivScale1.getReg(1);
2547   }
2548 
2549   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2550     .addUse(Fma4.getReg(0))
2551     .addUse(Fma3.getReg(0))
2552     .addUse(Mul.getReg(0))
2553     .addUse(Scale)
2554     .setMIFlags(Flags);
2555 
2556   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2557     .addUse(Fmas.getReg(0))
2558     .addUse(RHS)
2559     .addUse(LHS)
2560     .setMIFlags(Flags);
2561 
2562   MI.eraseFromParent();
2563   return true;
2564 }
2565 
2566 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2567                                                  MachineRegisterInfo &MRI,
2568                                                  MachineIRBuilder &B) const {
2569   B.setInstr(MI);
2570   Register Res = MI.getOperand(0).getReg();
2571   Register LHS = MI.getOperand(2).getReg();
2572   Register RHS = MI.getOperand(3).getReg();
2573   uint16_t Flags = MI.getFlags();
2574 
2575   LLT S32 = LLT::scalar(32);
2576   LLT S1 = LLT::scalar(1);
2577 
2578   auto Abs = B.buildFAbs(S32, RHS, Flags);
2579   const APFloat C0Val(1.0f);
2580 
2581   auto C0 = B.buildConstant(S32, 0x6f800000);
2582   auto C1 = B.buildConstant(S32, 0x2f800000);
2583   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2584 
2585   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2586   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2587 
2588   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2589 
2590   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2591     .addUse(Mul0.getReg(0))
2592     .setMIFlags(Flags);
2593 
2594   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2595 
2596   B.buildFMul(Res, Sel, Mul1, Flags);
2597 
2598   MI.eraseFromParent();
2599   return true;
2600 }
2601 
2602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2603                                                  MachineRegisterInfo &MRI,
2604                                                  MachineIRBuilder &B) const {
2605   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2606   if (!MFI->isEntryFunction()) {
2607     return legalizePreloadedArgIntrin(MI, MRI, B,
2608                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2609   }
2610 
2611   B.setInstr(MI);
2612 
2613   uint64_t Offset =
2614     ST.getTargetLowering()->getImplicitParameterOffset(
2615       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2616   Register DstReg = MI.getOperand(0).getReg();
2617   LLT DstTy = MRI.getType(DstReg);
2618   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2619 
2620   const ArgDescriptor *Arg;
2621   const TargetRegisterClass *RC;
2622   std::tie(Arg, RC)
2623     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2624   if (!Arg)
2625     return false;
2626 
2627   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2628   if (!loadInputValue(KernargPtrReg, B, Arg))
2629     return false;
2630 
2631   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2632   MI.eraseFromParent();
2633   return true;
2634 }
2635 
2636 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2637                                               MachineRegisterInfo &MRI,
2638                                               MachineIRBuilder &B,
2639                                               unsigned AddrSpace) const {
2640   B.setInstr(MI);
2641   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2642   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2643   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2644   MI.eraseFromParent();
2645   return true;
2646 }
2647 
2648 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2649 // offset (the offset that is included in bounds checking and swizzling, to be
2650 // split between the instruction's voffset and immoffset fields) and soffset
2651 // (the offset that is excluded from bounds checking and swizzling, to go in
2652 // the instruction's soffset field).  This function takes the first kind of
2653 // offset and figures out how to split it between voffset and immoffset.
2654 std::tuple<Register, unsigned, unsigned>
2655 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2656                                         Register OrigOffset) const {
2657   const unsigned MaxImm = 4095;
2658   Register BaseReg;
2659   unsigned TotalConstOffset;
2660   MachineInstr *OffsetDef;
2661   const LLT S32 = LLT::scalar(32);
2662 
2663   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2664     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2665 
2666   unsigned ImmOffset = TotalConstOffset;
2667 
2668   // If the immediate value is too big for the immoffset field, put the value
2669   // and -4096 into the immoffset field so that the value that is copied/added
2670   // for the voffset field is a multiple of 4096, and it stands more chance
2671   // of being CSEd with the copy/add for another similar load/store.
2672   // However, do not do that rounding down to a multiple of 4096 if that is a
2673   // negative number, as it appears to be illegal to have a negative offset
2674   // in the vgpr, even if adding the immediate offset makes it positive.
2675   unsigned Overflow = ImmOffset & ~MaxImm;
2676   ImmOffset -= Overflow;
2677   if ((int32_t)Overflow < 0) {
2678     Overflow += ImmOffset;
2679     ImmOffset = 0;
2680   }
2681 
2682   if (Overflow != 0) {
2683     if (!BaseReg) {
2684       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2685     } else {
2686       auto OverflowVal = B.buildConstant(S32, Overflow);
2687       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2688     }
2689   }
2690 
2691   if (!BaseReg)
2692     BaseReg = B.buildConstant(S32, 0).getReg(0);
2693 
2694   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2695 }
2696 
2697 /// Handle register layout difference for f16 images for some subtargets.
2698 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2699                                              MachineRegisterInfo &MRI,
2700                                              Register Reg) const {
2701   if (!ST.hasUnpackedD16VMem())
2702     return Reg;
2703 
2704   const LLT S16 = LLT::scalar(16);
2705   const LLT S32 = LLT::scalar(32);
2706   LLT StoreVT = MRI.getType(Reg);
2707   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2708 
2709   auto Unmerge = B.buildUnmerge(S16, Reg);
2710 
2711   SmallVector<Register, 4> WideRegs;
2712   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2713     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2714 
2715   int NumElts = StoreVT.getNumElements();
2716 
2717   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2718 }
2719 
2720 Register AMDGPULegalizerInfo::fixStoreSourceType(
2721   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2722   MachineRegisterInfo *MRI = B.getMRI();
2723   LLT Ty = MRI->getType(VData);
2724 
2725   const LLT S16 = LLT::scalar(16);
2726 
2727   // Fixup illegal register types for i8 stores.
2728   if (Ty == LLT::scalar(8) || Ty == S16) {
2729     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2730     return AnyExt;
2731   }
2732 
2733   if (Ty.isVector()) {
2734     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2735       if (IsFormat)
2736         return handleD16VData(B, *MRI, VData);
2737     }
2738   }
2739 
2740   return VData;
2741 }
2742 
2743 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2744                                               MachineRegisterInfo &MRI,
2745                                               MachineIRBuilder &B,
2746                                               bool IsTyped,
2747                                               bool IsFormat) const {
2748   B.setInstr(MI);
2749 
2750   Register VData = MI.getOperand(1).getReg();
2751   LLT Ty = MRI.getType(VData);
2752   LLT EltTy = Ty.getScalarType();
2753   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2754   const LLT S32 = LLT::scalar(32);
2755 
2756   VData = fixStoreSourceType(B, VData, IsFormat);
2757   Register RSrc = MI.getOperand(2).getReg();
2758 
2759   MachineMemOperand *MMO = *MI.memoperands_begin();
2760   const int MemSize = MMO->getSize();
2761 
2762   unsigned ImmOffset;
2763   unsigned TotalOffset;
2764 
2765   // The typed intrinsics add an immediate after the registers.
2766   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2767 
2768   // The struct intrinsic variants add one additional operand over raw.
2769   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2770   Register VIndex;
2771   int OpOffset = 0;
2772   if (HasVIndex) {
2773     VIndex = MI.getOperand(3).getReg();
2774     OpOffset = 1;
2775   }
2776 
2777   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2778   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2779 
2780   unsigned Format = 0;
2781   if (IsTyped) {
2782     Format = MI.getOperand(5 + OpOffset).getImm();
2783     ++OpOffset;
2784   }
2785 
2786   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2787 
2788   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2789   if (TotalOffset != 0)
2790     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2791 
2792   unsigned Opc;
2793   if (IsTyped) {
2794     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2795                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2796   } else if (IsFormat) {
2797     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2798                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2799   } else {
2800     switch (MemSize) {
2801     case 1:
2802       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2803       break;
2804     case 2:
2805       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2806       break;
2807     default:
2808       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2809       break;
2810     }
2811   }
2812 
2813   if (!VIndex)
2814     VIndex = B.buildConstant(S32, 0).getReg(0);
2815 
2816   auto MIB = B.buildInstr(Opc)
2817     .addUse(VData)              // vdata
2818     .addUse(RSrc)               // rsrc
2819     .addUse(VIndex)             // vindex
2820     .addUse(VOffset)            // voffset
2821     .addUse(SOffset)            // soffset
2822     .addImm(ImmOffset);         // offset(imm)
2823 
2824   if (IsTyped)
2825     MIB.addImm(Format);
2826 
2827   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2828      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2829      .addMemOperand(MMO);
2830 
2831   MI.eraseFromParent();
2832   return true;
2833 }
2834 
2835 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2836                                              MachineRegisterInfo &MRI,
2837                                              MachineIRBuilder &B,
2838                                              bool IsFormat,
2839                                              bool IsTyped) const {
2840   B.setInstr(MI);
2841 
2842   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2843   MachineMemOperand *MMO = *MI.memoperands_begin();
2844   const int MemSize = MMO->getSize();
2845   const LLT S32 = LLT::scalar(32);
2846 
2847   Register Dst = MI.getOperand(0).getReg();
2848   Register RSrc = MI.getOperand(2).getReg();
2849 
2850   // The typed intrinsics add an immediate after the registers.
2851   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2852 
2853   // The struct intrinsic variants add one additional operand over raw.
2854   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2855   Register VIndex;
2856   int OpOffset = 0;
2857   if (HasVIndex) {
2858     VIndex = MI.getOperand(3).getReg();
2859     OpOffset = 1;
2860   }
2861 
2862   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2863   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2864 
2865   unsigned Format = 0;
2866   if (IsTyped) {
2867     Format = MI.getOperand(5 + OpOffset).getImm();
2868     ++OpOffset;
2869   }
2870 
2871   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2872   unsigned ImmOffset;
2873   unsigned TotalOffset;
2874 
2875   LLT Ty = MRI.getType(Dst);
2876   LLT EltTy = Ty.getScalarType();
2877   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2878   const bool Unpacked = ST.hasUnpackedD16VMem();
2879 
2880   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2881   if (TotalOffset != 0)
2882     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2883 
2884   unsigned Opc;
2885 
2886   if (IsTyped) {
2887     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2888                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2889   } else if (IsFormat) {
2890     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2891                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2892   } else {
2893     switch (MemSize) {
2894     case 1:
2895       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2896       break;
2897     case 2:
2898       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2899       break;
2900     default:
2901       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2902       break;
2903     }
2904   }
2905 
2906   Register LoadDstReg;
2907 
2908   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2909   LLT UnpackedTy = Ty.changeElementSize(32);
2910 
2911   if (IsExtLoad)
2912     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2913   else if (Unpacked && IsD16 && Ty.isVector())
2914     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2915   else
2916     LoadDstReg = Dst;
2917 
2918   if (!VIndex)
2919     VIndex = B.buildConstant(S32, 0).getReg(0);
2920 
2921   auto MIB = B.buildInstr(Opc)
2922     .addDef(LoadDstReg)         // vdata
2923     .addUse(RSrc)               // rsrc
2924     .addUse(VIndex)             // vindex
2925     .addUse(VOffset)            // voffset
2926     .addUse(SOffset)            // soffset
2927     .addImm(ImmOffset);         // offset(imm)
2928 
2929   if (IsTyped)
2930     MIB.addImm(Format);
2931 
2932   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2933      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2934      .addMemOperand(MMO);
2935 
2936   if (LoadDstReg != Dst) {
2937     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2938 
2939     // Widen result for extending loads was widened.
2940     if (IsExtLoad)
2941       B.buildTrunc(Dst, LoadDstReg);
2942     else {
2943       // Repack to original 16-bit vector result
2944       // FIXME: G_TRUNC should work, but legalization currently fails
2945       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2946       SmallVector<Register, 4> Repack;
2947       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2948         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2949       B.buildMerge(Dst, Repack);
2950     }
2951   }
2952 
2953   MI.eraseFromParent();
2954   return true;
2955 }
2956 
2957 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2958                                                MachineIRBuilder &B,
2959                                                bool IsInc) const {
2960   B.setInstr(MI);
2961   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2962                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2963   B.buildInstr(Opc)
2964     .addDef(MI.getOperand(0).getReg())
2965     .addUse(MI.getOperand(2).getReg())
2966     .addUse(MI.getOperand(3).getReg())
2967     .cloneMemRefs(MI);
2968   MI.eraseFromParent();
2969   return true;
2970 }
2971 
2972 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2973   switch (IntrID) {
2974   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2975   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2976     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2977   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2978   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2979     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2980   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2981   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2982     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2983   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2984   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2985     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2986   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2987   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2988     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2989   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2990   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2991     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2992   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2993   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2994     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2995   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2996   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2997     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2998   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2999   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3000     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3001   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3002   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3003     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3004   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3005   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3006     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3007   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3008   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3009     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3010   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3011   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3012     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3013   default:
3014     llvm_unreachable("unhandled atomic opcode");
3015   }
3016 }
3017 
3018 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3019                                                MachineIRBuilder &B,
3020                                                Intrinsic::ID IID) const {
3021   B.setInstr(MI);
3022 
3023   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3024                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3025 
3026   Register Dst = MI.getOperand(0).getReg();
3027   Register VData = MI.getOperand(2).getReg();
3028 
3029   Register CmpVal;
3030   int OpOffset = 0;
3031 
3032   if (IsCmpSwap) {
3033     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3034     ++OpOffset;
3035   }
3036 
3037   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3038   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3039 
3040   // The struct intrinsic variants add one additional operand over raw.
3041   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3042   Register VIndex;
3043   if (HasVIndex) {
3044     VIndex = MI.getOperand(4 + OpOffset).getReg();
3045     ++OpOffset;
3046   }
3047 
3048   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3049   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3050   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3051 
3052   MachineMemOperand *MMO = *MI.memoperands_begin();
3053 
3054   unsigned ImmOffset;
3055   unsigned TotalOffset;
3056   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3057   if (TotalOffset != 0)
3058     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3059 
3060   if (!VIndex)
3061     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3062 
3063   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3064     .addDef(Dst)
3065     .addUse(VData); // vdata
3066 
3067   if (IsCmpSwap)
3068     MIB.addReg(CmpVal);
3069 
3070   MIB.addUse(RSrc)               // rsrc
3071      .addUse(VIndex)             // vindex
3072      .addUse(VOffset)            // voffset
3073      .addUse(SOffset)            // soffset
3074      .addImm(ImmOffset)          // offset(imm)
3075      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3076      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3077      .addMemOperand(MMO);
3078 
3079   MI.eraseFromParent();
3080   return true;
3081 }
3082 
3083 // Produce a vector of s16 elements from s32 pieces.
3084 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3085                              ArrayRef<Register> UnmergeParts) {
3086   const LLT S16 = LLT::scalar(16);
3087 
3088   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3089   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3090     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3091 
3092   B.buildBuildVector(DstReg, RemergeParts);
3093 }
3094 
3095 /// Convert a set of s32 registers to a result vector with s16 elements.
3096 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3097                                ArrayRef<Register> UnmergeParts) {
3098   MachineRegisterInfo &MRI = *B.getMRI();
3099   const LLT V2S16 = LLT::vector(2, 16);
3100   LLT TargetTy = MRI.getType(DstReg);
3101   int NumElts = UnmergeParts.size();
3102 
3103   if (NumElts == 1) {
3104     assert(TargetTy == V2S16);
3105     B.buildBitcast(DstReg, UnmergeParts[0]);
3106     return;
3107   }
3108 
3109   SmallVector<Register, 4> RemergeParts(NumElts);
3110   for (int I = 0; I != NumElts; ++I)
3111     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3112 
3113   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3114     B.buildConcatVectors(DstReg, RemergeParts);
3115     return;
3116   }
3117 
3118   const LLT V3S16 = LLT::vector(3, 16);
3119   const LLT V6S16 = LLT::vector(6, 16);
3120 
3121   // Widen to v6s16 and unpack v3 parts.
3122   assert(TargetTy == V3S16);
3123 
3124   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3125   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3126   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3127 }
3128 
3129 // FIXME: Just vector trunc should be sufficent, but legalization currently
3130 // broken.
3131 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3132                                   Register WideDstReg) {
3133   const LLT S32 = LLT::scalar(32);
3134   const LLT S16 = LLT::scalar(16);
3135 
3136   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3137 
3138   int NumOps = Unmerge->getNumOperands() - 1;
3139   SmallVector<Register, 4> RemergeParts(NumOps);
3140   for (int I = 0; I != NumOps; ++I)
3141     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3142 
3143   B.buildBuildVector(DstReg, RemergeParts);
3144 }
3145 
3146 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3147     MachineInstr &MI, MachineIRBuilder &B,
3148     GISelChangeObserver &Observer,
3149     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3150   bool IsTFE = MI.getNumExplicitDefs() == 2;
3151 
3152   // We are only processing the operands of d16 image operations on subtargets
3153   // that use the unpacked register layout, or need to repack the TFE result.
3154 
3155   // TODO: Need to handle a16 images too
3156   // TODO: Do we need to guard against already legalized intrinsics?
3157   if (!IsTFE && !ST.hasUnpackedD16VMem())
3158     return true;
3159 
3160   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3161     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3162 
3163   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3164     return true;
3165 
3166   B.setInstr(MI);
3167 
3168   MachineRegisterInfo *MRI = B.getMRI();
3169   const LLT S32 = LLT::scalar(32);
3170   const LLT S16 = LLT::scalar(16);
3171 
3172   if (BaseOpcode->Store) { // No TFE for stores?
3173     Register VData = MI.getOperand(1).getReg();
3174     LLT Ty = MRI->getType(VData);
3175     if (!Ty.isVector() || Ty.getElementType() != S16)
3176       return true;
3177 
3178     B.setInstr(MI);
3179 
3180     Observer.changingInstr(MI);
3181     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3182     Observer.changedInstr(MI);
3183     return true;
3184   }
3185 
3186   Register DstReg = MI.getOperand(0).getReg();
3187   LLT Ty = MRI->getType(DstReg);
3188   const LLT EltTy = Ty.getScalarType();
3189   const bool IsD16 = Ty.getScalarType() == S16;
3190   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3191 
3192   if (IsTFE) {
3193     // In the IR, TFE is supposed to be used with a 2 element struct return
3194     // type. The intruction really returns these two values in one contiguous
3195     // register, with one additional dword beyond the loaded data. Rewrite the
3196     // return type to use a single register result.
3197     Register Dst1Reg = MI.getOperand(1).getReg();
3198     if (MRI->getType(Dst1Reg) != S32)
3199       return false;
3200 
3201     // TODO: Make sure the TFE operand bit is set.
3202 
3203     // The raw dword aligned data component of the load. The only legal cases
3204     // where this matters should be when using the packed D16 format, for
3205     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3206     LLT RoundedTy;
3207     LLT TFETy;
3208 
3209     if (IsD16 && ST.hasUnpackedD16VMem()) {
3210       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3211       TFETy = LLT::vector(NumElts + 1, 32);
3212     } else {
3213       unsigned EltSize = Ty.getScalarSizeInBits();
3214       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3215       unsigned RoundedSize = 32 * RoundedElts;
3216       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3217       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3218     }
3219 
3220     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3221     Observer.changingInstr(MI);
3222 
3223     MI.getOperand(0).setReg(TFEReg);
3224     MI.RemoveOperand(1);
3225 
3226     Observer.changedInstr(MI);
3227 
3228     // Insert after the instruction.
3229     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3230 
3231     // Now figure out how to copy the new result register back into the old
3232     // result.
3233 
3234     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3235     int NumDataElts = TFETy.getNumElements() - 1;
3236 
3237     if (!Ty.isVector()) {
3238       // Simplest case is a trivial unmerge (plus a truncate for d16).
3239       UnmergeResults[0] = Ty == S32 ?
3240         DstReg : MRI->createGenericVirtualRegister(S32);
3241 
3242       B.buildUnmerge(UnmergeResults, TFEReg);
3243       if (Ty != S32)
3244         B.buildTrunc(DstReg, UnmergeResults[0]);
3245       return true;
3246     }
3247 
3248     // We have to repack into a new vector of some kind.
3249     for (int I = 0; I != NumDataElts; ++I)
3250       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3251     B.buildUnmerge(UnmergeResults, TFEReg);
3252 
3253     // Drop the final TFE element.
3254     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3255 
3256     if (EltTy == S32)
3257       B.buildBuildVector(DstReg, DataPart);
3258     else if (ST.hasUnpackedD16VMem())
3259       truncToS16Vector(B, DstReg, DataPart);
3260     else
3261       bitcastToS16Vector(B, DstReg, DataPart);
3262 
3263     return true;
3264   }
3265 
3266   // Must be an image load.
3267   if (!Ty.isVector() || Ty.getElementType() != S16)
3268     return true;
3269 
3270   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3271 
3272   LLT WidenedTy = Ty.changeElementType(S32);
3273   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3274 
3275   Observer.changingInstr(MI);
3276   MI.getOperand(0).setReg(WideDstReg);
3277   Observer.changedInstr(MI);
3278 
3279   repackUnpackedD16Load(B, DstReg, WideDstReg);
3280   return true;
3281 }
3282 
3283 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3284   MachineInstr &MI, MachineIRBuilder &B,
3285   GISelChangeObserver &Observer) const {
3286   Register Dst = MI.getOperand(0).getReg();
3287   LLT Ty = B.getMRI()->getType(Dst);
3288   unsigned Size = Ty.getSizeInBits();
3289   MachineFunction &MF = B.getMF();
3290 
3291   Observer.changingInstr(MI);
3292 
3293   // FIXME: We don't really need this intermediate instruction. The intrinsic
3294   // should be fixed to have a memory operand. Since it's readnone, we're not
3295   // allowed to add one.
3296   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3297   MI.RemoveOperand(1); // Remove intrinsic ID
3298 
3299   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3300   // TODO: Should this use datalayout alignment?
3301   const unsigned MemSize = (Size + 7) / 8;
3302   const unsigned MemAlign = 4;
3303   MachineMemOperand *MMO = MF.getMachineMemOperand(
3304     MachinePointerInfo(),
3305     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3306     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3307   MI.addMemOperand(MF, MMO);
3308 
3309   // There are no 96-bit result scalar loads, but widening to 128-bit should
3310   // always be legal. We may need to restore this to a 96-bit result if it turns
3311   // out this needs to be converted to a vector load during RegBankSelect.
3312   if (!isPowerOf2_32(Size)) {
3313     LegalizerHelper Helper(MF, *this, Observer, B);
3314     B.setInstr(MI);
3315 
3316     if (Ty.isVector())
3317       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3318     else
3319       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3320   }
3321 
3322   Observer.changedInstr(MI);
3323   return true;
3324 }
3325 
3326 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3327                                             MachineIRBuilder &B,
3328                                             GISelChangeObserver &Observer) const {
3329   MachineRegisterInfo &MRI = *B.getMRI();
3330 
3331   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3332   auto IntrID = MI.getIntrinsicID();
3333   switch (IntrID) {
3334   case Intrinsic::amdgcn_if:
3335   case Intrinsic::amdgcn_else: {
3336     MachineInstr *Br = nullptr;
3337     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3338       const SIRegisterInfo *TRI
3339         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3340 
3341       B.setInstr(*BrCond);
3342       Register Def = MI.getOperand(1).getReg();
3343       Register Use = MI.getOperand(3).getReg();
3344 
3345       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3346       if (Br)
3347         BrTarget = Br->getOperand(0).getMBB();
3348 
3349       if (IntrID == Intrinsic::amdgcn_if) {
3350         B.buildInstr(AMDGPU::SI_IF)
3351           .addDef(Def)
3352           .addUse(Use)
3353           .addMBB(BrTarget);
3354       } else {
3355         B.buildInstr(AMDGPU::SI_ELSE)
3356           .addDef(Def)
3357           .addUse(Use)
3358           .addMBB(BrTarget)
3359           .addImm(0);
3360       }
3361 
3362       if (Br)
3363         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3364 
3365       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3366       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3367       MI.eraseFromParent();
3368       BrCond->eraseFromParent();
3369       return true;
3370     }
3371 
3372     return false;
3373   }
3374   case Intrinsic::amdgcn_loop: {
3375     MachineInstr *Br = nullptr;
3376     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3377       const SIRegisterInfo *TRI
3378         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3379 
3380       B.setInstr(*BrCond);
3381 
3382       // FIXME: Need to adjust branch targets based on unconditional branch.
3383       Register Reg = MI.getOperand(2).getReg();
3384       B.buildInstr(AMDGPU::SI_LOOP)
3385         .addUse(Reg)
3386         .addMBB(BrCond->getOperand(1).getMBB());
3387       MI.eraseFromParent();
3388       BrCond->eraseFromParent();
3389       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3390       return true;
3391     }
3392 
3393     return false;
3394   }
3395   case Intrinsic::amdgcn_kernarg_segment_ptr:
3396     return legalizePreloadedArgIntrin(
3397       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3398   case Intrinsic::amdgcn_implicitarg_ptr:
3399     return legalizeImplicitArgPtr(MI, MRI, B);
3400   case Intrinsic::amdgcn_workitem_id_x:
3401     return legalizePreloadedArgIntrin(MI, MRI, B,
3402                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3403   case Intrinsic::amdgcn_workitem_id_y:
3404     return legalizePreloadedArgIntrin(MI, MRI, B,
3405                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3406   case Intrinsic::amdgcn_workitem_id_z:
3407     return legalizePreloadedArgIntrin(MI, MRI, B,
3408                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3409   case Intrinsic::amdgcn_workgroup_id_x:
3410     return legalizePreloadedArgIntrin(MI, MRI, B,
3411                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3412   case Intrinsic::amdgcn_workgroup_id_y:
3413     return legalizePreloadedArgIntrin(MI, MRI, B,
3414                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3415   case Intrinsic::amdgcn_workgroup_id_z:
3416     return legalizePreloadedArgIntrin(MI, MRI, B,
3417                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3418   case Intrinsic::amdgcn_dispatch_ptr:
3419     return legalizePreloadedArgIntrin(MI, MRI, B,
3420                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3421   case Intrinsic::amdgcn_queue_ptr:
3422     return legalizePreloadedArgIntrin(MI, MRI, B,
3423                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3424   case Intrinsic::amdgcn_implicit_buffer_ptr:
3425     return legalizePreloadedArgIntrin(
3426       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3427   case Intrinsic::amdgcn_dispatch_id:
3428     return legalizePreloadedArgIntrin(MI, MRI, B,
3429                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3430   case Intrinsic::amdgcn_fdiv_fast:
3431     return legalizeFDIVFastIntrin(MI, MRI, B);
3432   case Intrinsic::amdgcn_is_shared:
3433     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3434   case Intrinsic::amdgcn_is_private:
3435     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3436   case Intrinsic::amdgcn_wavefrontsize: {
3437     B.setInstr(MI);
3438     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3439     MI.eraseFromParent();
3440     return true;
3441   }
3442   case Intrinsic::amdgcn_s_buffer_load:
3443     return legalizeSBufferLoad(MI, B, Observer);
3444   case Intrinsic::amdgcn_raw_buffer_store:
3445   case Intrinsic::amdgcn_struct_buffer_store:
3446     return legalizeBufferStore(MI, MRI, B, false, false);
3447   case Intrinsic::amdgcn_raw_buffer_store_format:
3448   case Intrinsic::amdgcn_struct_buffer_store_format:
3449     return legalizeBufferStore(MI, MRI, B, false, true);
3450   case Intrinsic::amdgcn_raw_tbuffer_store:
3451   case Intrinsic::amdgcn_struct_tbuffer_store:
3452     return legalizeBufferStore(MI, MRI, B, true, true);
3453   case Intrinsic::amdgcn_raw_buffer_load:
3454   case Intrinsic::amdgcn_struct_buffer_load:
3455     return legalizeBufferLoad(MI, MRI, B, false, false);
3456   case Intrinsic::amdgcn_raw_buffer_load_format:
3457   case Intrinsic::amdgcn_struct_buffer_load_format:
3458     return legalizeBufferLoad(MI, MRI, B, true, false);
3459   case Intrinsic::amdgcn_raw_tbuffer_load:
3460   case Intrinsic::amdgcn_struct_tbuffer_load:
3461     return legalizeBufferLoad(MI, MRI, B, true, true);
3462   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3463   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3464   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3466   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3467   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3468   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3469   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3470   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3471   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3472   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3473   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3474   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3475   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3476   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3477   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3478   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3479   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3480   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3482   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3483   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3484   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3486   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3488     return legalizeBufferAtomic(MI, B, IntrID);
3489   case Intrinsic::amdgcn_atomic_inc:
3490     return legalizeAtomicIncDec(MI, B, true);
3491   case Intrinsic::amdgcn_atomic_dec:
3492     return legalizeAtomicIncDec(MI, B, false);
3493   default: {
3494     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3495             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3496       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3497     return true;
3498   }
3499   }
3500 
3501   return true;
3502 }
3503