1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 static LegalityPredicate isMultiple32(unsigned TypeIdx,
46                                       unsigned MaxSize = 1024) {
47   return [=](const LegalityQuery &Query) {
48     const LLT Ty = Query.Types[TypeIdx];
49     const LLT EltTy = Ty.getScalarType();
50     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51   };
52 }
53 
54 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
55   return [=](const LegalityQuery &Query) {
56     return Query.Types[TypeIdx].getSizeInBits() == Size;
57   };
58 }
59 
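// Matches vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits (e.g. v3s16 or v5s8); such types are typically
// rounded up with oneMoreElement.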
60 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     return Ty.isVector() &&
64            Ty.getNumElements() % 2 != 0 &&
65            Ty.getElementType().getSizeInBits() < 32 &&
66            Ty.getSizeInBits() % 32 != 0;
67   };
68 }
69 
70 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
71   return [=](const LegalityQuery &Query) {
72     const LLT Ty = Query.Types[TypeIdx];
73     const LLT EltTy = Ty.getScalarType();
74     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75   };
76 }
77 
78 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT Ty = Query.Types[TypeIdx];
81     const LLT EltTy = Ty.getElementType();
82     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
83   };
84 }
85 
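// Reduce the element count so that each resulting piece is at most 64 bits
// wide, e.g. a 128-bit v4s32 is split into v2s32 pieces.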
86 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getElementType();
90     unsigned Size = Ty.getSizeInBits();
91     unsigned Pieces = (Size + 63) / 64;
92     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
93     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94   };
95 }
96 
// Increase the number of vector elements so that the total size is the next
// multiple of 32 bits.
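// For example, v3s8 (24 bits) is widened to v4s8 (32 bits).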
99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
100   return [=](const LegalityQuery &Query) {
101     const LLT Ty = Query.Types[TypeIdx];
102 
103     const LLT EltTy = Ty.getElementType();
104     const int Size = Ty.getSizeInBits();
105     const int EltSize = EltTy.getSizeInBits();
106     const int NextMul32 = (Size + 31) / 32;
107 
108     assert(EltSize < 32);
109 
110     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
111     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112   };
113 }
114 
115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
116   return [=](const LegalityQuery &Query) {
117     const LLT QueryTy = Query.Types[TypeIdx];
118     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119   };
120 }
121 
122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
123   return [=](const LegalityQuery &Query) {
124     const LLT QueryTy = Query.Types[TypeIdx];
125     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126   };
127 }
128 
129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133   };
134 }
135 
136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
137 // v2s16.
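// For example s64, v2s32, and v4s16 qualify, while v3s8 does not.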
138 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     if (Ty.isVector()) {
142       const int EltSize = Ty.getElementType().getSizeInBits();
143       return EltSize == 32 || EltSize == 64 ||
144             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
145              EltSize == 128 || EltSize == 256;
146     }
147 
148     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149   };
150 }
151 
152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
153   return [=](const LegalityQuery &Query) {
154     return Query.Types[TypeIdx].getElementType() == Type;
155   };
156 }
157 
158 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
159   return [=](const LegalityQuery &Query) {
160     const LLT Ty = Query.Types[TypeIdx];
161     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
162            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
163   };
164 }
165 
166 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
167                                          const GCNTargetMachine &TM)
168   :  ST(ST_) {
169   using namespace TargetOpcode;
170 
171   auto GetAddrSpacePtr = [&TM](unsigned AS) {
172     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
173   };
174 
175   const LLT S1 = LLT::scalar(1);
176   const LLT S8 = LLT::scalar(8);
177   const LLT S16 = LLT::scalar(16);
178   const LLT S32 = LLT::scalar(32);
179   const LLT S64 = LLT::scalar(64);
180   const LLT S96 = LLT::scalar(96);
181   const LLT S128 = LLT::scalar(128);
182   const LLT S256 = LLT::scalar(256);
183   const LLT S1024 = LLT::scalar(1024);
184 
185   const LLT V2S16 = LLT::vector(2, 16);
186   const LLT V4S16 = LLT::vector(4, 16);
187 
188   const LLT V2S32 = LLT::vector(2, 32);
189   const LLT V3S32 = LLT::vector(3, 32);
190   const LLT V4S32 = LLT::vector(4, 32);
191   const LLT V5S32 = LLT::vector(5, 32);
192   const LLT V6S32 = LLT::vector(6, 32);
193   const LLT V7S32 = LLT::vector(7, 32);
194   const LLT V8S32 = LLT::vector(8, 32);
195   const LLT V9S32 = LLT::vector(9, 32);
196   const LLT V10S32 = LLT::vector(10, 32);
197   const LLT V11S32 = LLT::vector(11, 32);
198   const LLT V12S32 = LLT::vector(12, 32);
199   const LLT V13S32 = LLT::vector(13, 32);
200   const LLT V14S32 = LLT::vector(14, 32);
201   const LLT V15S32 = LLT::vector(15, 32);
202   const LLT V16S32 = LLT::vector(16, 32);
203   const LLT V32S32 = LLT::vector(32, 32);
204 
205   const LLT V2S64 = LLT::vector(2, 64);
206   const LLT V3S64 = LLT::vector(3, 64);
207   const LLT V4S64 = LLT::vector(4, 64);
208   const LLT V5S64 = LLT::vector(5, 64);
209   const LLT V6S64 = LLT::vector(6, 64);
210   const LLT V7S64 = LLT::vector(7, 64);
211   const LLT V8S64 = LLT::vector(8, 64);
212   const LLT V16S64 = LLT::vector(16, 64);
213 
214   std::initializer_list<LLT> AllS32Vectors =
215     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
216      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
217   std::initializer_list<LLT> AllS64Vectors =
218     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
219 
220   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
221   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
222   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
223   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
224   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
225   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
226   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
227 
228   const LLT CodePtr = FlatPtr;
229 
230   const std::initializer_list<LLT> AddrSpaces64 = {
231     GlobalPtr, ConstantPtr, FlatPtr
232   };
233 
234   const std::initializer_list<LLT> AddrSpaces32 = {
235     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
236   };
237 
238   const std::initializer_list<LLT> FPTypesBase = {
239     S32, S64
240   };
241 
242   const std::initializer_list<LLT> FPTypes16 = {
243     S32, S64, S16
244   };
245 
246   const std::initializer_list<LLT> FPTypesPK16 = {
247     S32, S64, S16, V2S16
248   };
249 
250   const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;
251 
252   setAction({G_BRCOND, S1}, Legal); // VCC branches
253   setAction({G_BRCOND, S32}, Legal); // SCC branches
254 
255   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
256   // elements for v3s16
257   getActionDefinitionsBuilder(G_PHI)
258     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
259     .legalFor(AllS32Vectors)
260     .legalFor(AllS64Vectors)
261     .legalFor(AddrSpaces64)
262     .legalFor(AddrSpaces32)
263     .clampScalar(0, S32, S256)
264     .widenScalarToNextPow2(0, 32)
265     .clampMaxNumElements(0, S32, 16)
266     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
267     .legalIf(isPointer(0));
268 
269   if (ST.has16BitInsts()) {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32, S16})
272       .clampScalar(0, S16, S32)
273       .scalarize(0);
274   } else {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32})
277       .clampScalar(0, S32, S32)
278       .scalarize(0);
279   }
280 
281   // FIXME: Not really legal. Placeholder for custom lowering.
282   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
283     .legalFor({S32, S64})
284     .clampScalar(0, S32, S64)
285     .widenScalarToNextPow2(0, 32)
286     .scalarize(0);
287 
288   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
289     .legalFor({S32})
290     .clampScalar(0, S32, S32)
291     .scalarize(0);
292 
293   // Report legal for any types we can handle anywhere. For the cases only legal
294   // on the SALU, RegBankSelect will be able to re-legalize.
295   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
296     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
297     .clampScalar(0, S32, S64)
298     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
299     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
300     .widenScalarToNextPow2(0)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
304                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
305     .legalFor({{S32, S1}, {S32, S32}})
306     .clampScalar(0, S32, S32)
307     .scalarize(0); // TODO: Implement.
308 
309   getActionDefinitionsBuilder(G_BITCAST)
310     // Don't worry about the size constraint.
311     .legalIf(all(isRegisterType(0), isRegisterType(1)))
312     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)})
314     .lower();
315 
316 
317   getActionDefinitionsBuilder(G_CONSTANT)
318     .legalFor({S1, S32, S64, S16, GlobalPtr,
319                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
320     .clampScalar(0, S32, S64)
321     .widenScalarToNextPow2(0)
322     .legalIf(isPointer(0));
323 
324   getActionDefinitionsBuilder(G_FCONSTANT)
325     .legalFor({S32, S64, S16})
326     .clampScalar(0, S16, S64);
327 
328   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
329     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
330                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
331     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
332     .clampScalarOrElt(0, S32, S1024)
333     .legalIf(isMultiple32(0))
334     .widenScalarToNextPow2(0, 32)
335     .clampMaxNumElements(0, S32, 16);
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   if (ST.has16BitInsts()) {
401     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
402       .legalFor({S32, S64, S16})
403       .scalarize(0)
404       .clampScalar(0, S16, S64);
405   } else {
406     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
407       .legalFor({S32, S64})
408       .scalarize(0)
409       .clampScalar(0, S32, S64);
410   }
411 
412   getActionDefinitionsBuilder(G_FPTRUNC)
413     .legalFor({{S32, S64}, {S16, S32}})
414     .scalarize(0);
415 
416   getActionDefinitionsBuilder(G_FPEXT)
417     .legalFor({{S64, S32}, {S32, S16}})
418     .lowerFor({{S64, S16}}) // FIXME: Implement
419     .scalarize(0);
420 
421   getActionDefinitionsBuilder(G_FSUB)
422       // Use actual fsub instruction
423       .legalFor({S32})
424       // Must use fadd + fneg
425       .lowerFor({S64, S16, V2S16})
426       .scalarize(0)
427       .clampScalar(0, S32, S64);
428 
429   // Whether this is legal depends on the floating point mode for the function.
430   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
431   if (ST.hasMadF16())
432     FMad.customFor({S32, S16});
433   else
434     FMad.customFor({S32});
435   FMad.scalarize(0)
436       .lower();
437 
438   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
439     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
440                {S32, S1}, {S64, S1}, {S16, S1},
441                {S96, S32},
442                // FIXME: Hack
443                {S64, LLT::scalar(33)},
444                {S32, S8}, {S32, LLT::scalar(24)}})
445     .scalarize(0)
446     .clampScalar(0, S32, S64);
447 
448   // TODO: Split s1->s64 during regbankselect for VALU.
449   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
450     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
451     .lowerFor({{S32, S64}})
452     .lowerIf(typeIs(1, S1))
453     .customFor({{S64, S64}});
454   if (ST.has16BitInsts())
455     IToFP.legalFor({{S16, S16}});
456   IToFP.clampScalar(1, S32, S64)
457        .scalarize(0);
458 
459   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
460     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
461   if (ST.has16BitInsts())
462     FPToI.legalFor({{S16, S16}});
463   else
464     FPToI.minScalar(1, S32);
465 
466   FPToI.minScalar(0, S32)
467        .scalarize(0);
468 
469   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
470     .scalarize(0)
471     .lower();
472 
473   if (ST.has16BitInsts()) {
474     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475       .legalFor({S16, S32, S64})
476       .clampScalar(0, S16, S64)
477       .scalarize(0);
478   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
479     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
480       .legalFor({S32, S64})
481       .clampScalar(0, S32, S64)
482       .scalarize(0);
483   } else {
484     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
485       .legalFor({S32})
486       .customFor({S64})
487       .clampScalar(0, S32, S64)
488       .scalarize(0);
489   }
490 
491   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
492     .scalarize(0)
493     .alwaysLegal();
494 
495   auto &CmpBuilder =
496     getActionDefinitionsBuilder(G_ICMP)
497     // The compare output type differs based on the register bank of the output,
498     // so make both s1 and s32 legal.
499     //
500     // Scalar compares producing output in scc will be promoted to s32, as that
501     // is the allocatable register type that will be needed for the copy from
502     // scc. This will be promoted during RegBankSelect, and we assume something
503     // before that won't try to use s32 result types.
504     //
505     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
506     // bank.
507     .legalForCartesianProduct(
508       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
509     .legalForCartesianProduct(
510       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
511   if (ST.has16BitInsts()) {
512     CmpBuilder.legalFor({{S1, S16}});
513   }
514 
515   CmpBuilder
516     .widenScalarToNextPow2(1)
517     .clampScalar(1, S32, S64)
518     .scalarize(0)
519     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
520 
521   getActionDefinitionsBuilder(G_FCMP)
522     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
523     .widenScalarToNextPow2(1)
524     .clampScalar(1, S32, S64)
525     .scalarize(0);
526 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
528   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
529                                G_FLOG, G_FLOG2, G_FLOG10})
530     .legalFor({S32})
531     .scalarize(0);
532 
533   // The 64-bit versions produce 32-bit results, but only on the SALU.
534   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
535                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
536                                G_CTPOP})
537     .legalFor({{S32, S32}, {S32, S64}})
538     .clampScalar(0, S32, S32)
539     .clampScalar(1, S32, S64)
540     .scalarize(0)
541     .widenScalarToNextPow2(0, 32)
542     .widenScalarToNextPow2(1, 32);
543 
544   // TODO: Expand for > s32
545   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
546     .legalFor({S32})
547     .clampScalar(0, S32, S32)
548     .scalarize(0);
549 
550   if (ST.has16BitInsts()) {
551     if (ST.hasVOP3PInsts()) {
552       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
553         .legalFor({S32, S16, V2S16})
554         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
555         .clampMaxNumElements(0, S16, 2)
556         .clampScalar(0, S16, S32)
557         .widenScalarToNextPow2(0)
558         .scalarize(0);
559     } else {
560       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
561         .legalFor({S32, S16})
562         .widenScalarToNextPow2(0)
563         .clampScalar(0, S16, S32)
564         .scalarize(0);
565     }
566   } else {
567     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
568       .legalFor({S32})
569       .clampScalar(0, S32, S32)
570       .widenScalarToNextPow2(0)
571       .scalarize(0);
572   }
573 
574   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
575     return [=](const LegalityQuery &Query) {
576       return Query.Types[TypeIdx0].getSizeInBits() <
577              Query.Types[TypeIdx1].getSizeInBits();
578     };
579   };
580 
581   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
582     return [=](const LegalityQuery &Query) {
583       return Query.Types[TypeIdx0].getSizeInBits() >
584              Query.Types[TypeIdx1].getSizeInBits();
585     };
586   };
587 
588   getActionDefinitionsBuilder(G_INTTOPTR)
589     // List the common cases
590     .legalForCartesianProduct(AddrSpaces64, {S64})
591     .legalForCartesianProduct(AddrSpaces32, {S32})
592     .scalarize(0)
593     // Accept any address space as long as the size matches
594     .legalIf(sameSize(0, 1))
595     .widenScalarIf(smallerThan(1, 0),
596       [](const LegalityQuery &Query) {
597         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
598       })
599     .narrowScalarIf(greaterThan(1, 0),
600       [](const LegalityQuery &Query) {
601         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
602       });
603 
604   getActionDefinitionsBuilder(G_PTRTOINT)
605     // List the common cases
606     .legalForCartesianProduct(AddrSpaces64, {S64})
607     .legalForCartesianProduct(AddrSpaces32, {S32})
608     .scalarize(0)
609     // Accept any address space as long as the size matches
610     .legalIf(sameSize(0, 1))
611     .widenScalarIf(smallerThan(0, 1),
612       [](const LegalityQuery &Query) {
613         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
614       })
615     .narrowScalarIf(
616       greaterThan(0, 1),
617       [](const LegalityQuery &Query) {
618         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
619       });
620 
621   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
622     .scalarize(0)
623     .custom();
624 
625   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
626   // handle some operations by just promoting the register during
627   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
628   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
629     switch (AS) {
630     // FIXME: Private element size.
631     case AMDGPUAS::PRIVATE_ADDRESS:
632       return 32;
633     // FIXME: Check subtarget
634     case AMDGPUAS::LOCAL_ADDRESS:
635       return ST.useDS128() ? 128 : 64;
636 
637     // Treat constant and global as identical. SMRD loads are sometimes usable
638     // for global loads (ideally constant address space should be eliminated)
639     // depending on the context. Legality cannot be context dependent, but
640     // RegBankSelect can split the load as necessary depending on the pointer
641     // register bank/uniformity and if the memory is invariant or not written in
642     // a kernel.
643     case AMDGPUAS::CONSTANT_ADDRESS:
644     case AMDGPUAS::GLOBAL_ADDRESS:
645       return 512;
646     default:
647       return 128;
648     }
649   };
650 
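  // Returns true if the access must be split: vector loads wider than the
  // memory size, accesses wider than the limit for their address space,
  // dwordx3 accesses on subtargets without them, and under-aligned accesses
  // the target cannot perform.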
651   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
652     const LLT DstTy = Query.Types[0];
653 
654     // Split vector extloads.
655     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
656     unsigned Align = Query.MMODescrs[0].AlignInBits;
657 
658     if (MemSize < DstTy.getSizeInBits())
659       MemSize = std::max(MemSize, Align);
660 
661     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
662       return true;
663 
664     const LLT PtrTy = Query.Types[1];
665     unsigned AS = PtrTy.getAddressSpace();
666     if (MemSize > maxSizeForAddrSpace(AS))
667       return true;
668 
669     // Catch weird sized loads that don't evenly divide into the access sizes
670     // TODO: May be able to widen depending on alignment etc.
671     unsigned NumRegs = MemSize / 32;
672     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
673       return true;
674 
675     if (Align < MemSize) {
676       const SITargetLowering *TLI = ST.getTargetLowering();
677       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
678     }
679 
680     return false;
681   };
682 
683   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
684   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
685   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
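  // A required alignment of 0 in the memory descriptors below means any
  // alignment is accepted.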
686 
687   // TODO: Refine based on subtargets which support unaligned access or 128-bit
688   // LDS
689   // TODO: Unsupported flat for SI.
690 
691   for (unsigned Op : {G_LOAD, G_STORE}) {
692     const bool IsStore = Op == G_STORE;
693 
694     auto &Actions = getActionDefinitionsBuilder(Op);
695     // Whitelist the common cases.
696     // TODO: Pointer loads
697     // TODO: Wide constant loads
698     // TODO: Only CI+ has 3x loads
699     // TODO: Loads to s16 on gfx9
700     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
701                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
702                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
703                                       {S96, GlobalPtr, 96, GlobalAlign32},
704                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
705                                       {S128, GlobalPtr, 128, GlobalAlign32},
706                                       {S64, GlobalPtr, 64, GlobalAlign32},
707                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
708                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
709                                       {S32, GlobalPtr, 8, GlobalAlign8},
710                                       {S32, GlobalPtr, 16, GlobalAlign16},
711 
712                                       {S32, LocalPtr, 32, 32},
713                                       {S64, LocalPtr, 64, 32},
714                                       {V2S32, LocalPtr, 64, 32},
715                                       {S32, LocalPtr, 8, 8},
716                                       {S32, LocalPtr, 16, 16},
717                                       {V2S16, LocalPtr, 32, 32},
718 
719                                       {S32, PrivatePtr, 32, 32},
720                                       {S32, PrivatePtr, 8, 8},
721                                       {S32, PrivatePtr, 16, 16},
722                                       {V2S16, PrivatePtr, 32, 32},
723 
724                                       {S32, FlatPtr, 32, GlobalAlign32},
725                                       {S32, FlatPtr, 16, GlobalAlign16},
726                                       {S32, FlatPtr, 8, GlobalAlign8},
727                                       {V2S16, FlatPtr, 32, GlobalAlign32},
728 
729                                       {S32, ConstantPtr, 32, GlobalAlign32},
730                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
731                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
732                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
733                                       {S64, ConstantPtr, 64, GlobalAlign32},
734                                       {S128, ConstantPtr, 128, GlobalAlign32},
735                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
736     Actions
737         .customIf(typeIs(1, Constant32Ptr))
738         .narrowScalarIf(
739             [=](const LegalityQuery &Query) -> bool {
740               return !Query.Types[0].isVector() && needToSplitLoad(Query);
741             },
742             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743               const LLT DstTy = Query.Types[0];
744               const LLT PtrTy = Query.Types[1];
745 
746               const unsigned DstSize = DstTy.getSizeInBits();
747               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
748 
749               // Split extloads.
750               if (DstSize > MemSize)
751                 return std::make_pair(0, LLT::scalar(MemSize));
752 
753               if (DstSize > 32 && (DstSize % 32 != 0)) {
754                 // FIXME: Need a way to specify non-extload of larger size if
755                 // suitably aligned.
756                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
757               }
758 
759               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
760               if (MemSize > MaxSize)
761                 return std::make_pair(0, LLT::scalar(MaxSize));
762 
763               unsigned Align = Query.MMODescrs[0].AlignInBits;
764               return std::make_pair(0, LLT::scalar(Align));
765             })
766         .fewerElementsIf(
767             [=](const LegalityQuery &Query) -> bool {
768               return Query.Types[0].isVector() && needToSplitLoad(Query);
769             },
770             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
771               const LLT DstTy = Query.Types[0];
772               const LLT PtrTy = Query.Types[1];
773 
774               LLT EltTy = DstTy.getElementType();
775               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
776 
777               // Split if it's too large for the address space.
778               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
779                 unsigned NumElts = DstTy.getNumElements();
780                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
781 
782                 // FIXME: Refine when odd breakdowns handled
783                 // The scalars will need to be re-legalized.
784                 if (NumPieces == 1 || NumPieces >= NumElts ||
785                     NumElts % NumPieces != 0)
786                   return std::make_pair(0, EltTy);
787 
788                 return std::make_pair(0,
789                                       LLT::vector(NumElts / NumPieces, EltTy));
790               }
791 
792               // Need to split because of alignment.
793               unsigned Align = Query.MMODescrs[0].AlignInBits;
794               unsigned EltSize = EltTy.getSizeInBits();
795               if (EltSize > Align &&
796                   (EltSize / Align < DstTy.getNumElements())) {
797                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
798               }
799 
800               // May need relegalization for the scalars.
801               return std::make_pair(0, EltTy);
802             })
803         .minScalar(0, S32);
804 
805     if (IsStore)
806       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
807 
808     // TODO: Need a bitcast lower option?
809     Actions
810         .legalIf([=](const LegalityQuery &Query) {
811           const LLT Ty0 = Query.Types[0];
812           unsigned Size = Ty0.getSizeInBits();
813           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
814           unsigned Align = Query.MMODescrs[0].AlignInBits;
815 
816           // FIXME: Widening store from alignment not valid.
817           if (MemSize < Size)
818             MemSize = std::max(MemSize, Align);
819 
820           // No extending vector loads.
821           if (Size > MemSize && Ty0.isVector())
822             return false;
823 
824           switch (MemSize) {
825           case 8:
826           case 16:
827             return Size == 32;
828           case 32:
829           case 64:
830           case 128:
831             return true;
832           case 96:
833             return ST.hasDwordx3LoadStores();
834           case 256:
835           case 512:
836             return true;
837           default:
838             return false;
839           }
840         })
841         .widenScalarToNextPow2(0)
842         // TODO: v3s32->v4s32 with alignment
843         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
844   }
845 
846   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
847                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
848                                                   {S32, GlobalPtr, 16, 2 * 8},
849                                                   {S32, LocalPtr, 8, 8},
850                                                   {S32, LocalPtr, 16, 16},
851                                                   {S32, PrivatePtr, 8, 8},
852                                                   {S32, PrivatePtr, 16, 16},
853                                                   {S32, ConstantPtr, 8, 8},
854                                                   {S32, ConstantPtr, 16, 2 * 8}});
855   if (ST.hasFlatAddressSpace()) {
856     ExtLoads.legalForTypesWithMemDesc(
857         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
858   }
859 
860   ExtLoads.clampScalar(0, S32, S32)
861           .widenScalarToNextPow2(0)
862           .unsupportedIfMemSizeNotPow2()
863           .lower();
864 
865   auto &Atomics = getActionDefinitionsBuilder(
866     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
867      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
868      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
869      G_ATOMICRMW_UMIN})
870     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
871                {S64, GlobalPtr}, {S64, LocalPtr}});
872   if (ST.hasFlatAddressSpace()) {
873     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
874   }
875 
876   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
877     .legalFor({{S32, LocalPtr}});
878 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
881   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
882     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
883                 {S32, FlatPtr}, {S64, FlatPtr}})
884     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
885                {S32, RegionPtr}, {S64, RegionPtr}});
886   // TODO: Pointer types, any 32-bit or 64-bit vector
887 
888   // Condition should be s32 for scalar, s1 for vector.
889   getActionDefinitionsBuilder(G_SELECT)
890     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
891           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
892           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
893     .clampScalar(0, S16, S64)
894     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
895     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
896     .scalarize(1)
897     .clampMaxNumElements(0, S32, 2)
898     .clampMaxNumElements(0, LocalPtr, 2)
899     .clampMaxNumElements(0, PrivatePtr, 2)
900     .scalarize(0)
901     .widenScalarToNextPow2(0)
902     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
903 
904   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
905   // be more flexible with the shift amount type.
906   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
907     .legalFor({{S32, S32}, {S64, S32}});
908   if (ST.has16BitInsts()) {
909     if (ST.hasVOP3PInsts()) {
910       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
911             .clampMaxNumElements(0, S16, 2);
912     } else
913       Shifts.legalFor({{S16, S32}, {S16, S16}});
914 
915     // TODO: Support 16-bit shift amounts
916     Shifts.clampScalar(1, S32, S32);
917     Shifts.clampScalar(0, S16, S64);
918     Shifts.widenScalarToNextPow2(0, 16);
919   } else {
920     // Make sure we legalize the shift amount type first, as the general
921     // expansion for the shifted type will produce much worse code if it hasn't
922     // been truncated already.
923     Shifts.clampScalar(1, S32, S32);
924     Shifts.clampScalar(0, S32, S64);
925     Shifts.widenScalarToNextPow2(0, 32);
926   }
927   Shifts.scalarize(0);
928 
929   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
930     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
931     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
932     unsigned IdxTypeIdx = 2;
933 
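    // Custom lower the cases handled by legalizeExtractVectorElt /
    // legalizeInsertVectorElt: 16-bit elements or element sizes that are a
    // multiple of 32, on vectors that are a multiple of 32 bits and at most
    // 1024 bits, with a 32-bit index.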
934     getActionDefinitionsBuilder(Op)
935       .customIf([=](const LegalityQuery &Query) {
936           const LLT EltTy = Query.Types[EltTypeIdx];
937           const LLT VecTy = Query.Types[VecTypeIdx];
938           const LLT IdxTy = Query.Types[IdxTypeIdx];
939           return (EltTy.getSizeInBits() == 16 ||
940                   EltTy.getSizeInBits() % 32 == 0) &&
941                  VecTy.getSizeInBits() % 32 == 0 &&
942                  VecTy.getSizeInBits() <= 1024 &&
943                  IdxTy.getSizeInBits() == 32;
944         })
945       .clampScalar(EltTypeIdx, S32, S64)
946       .clampScalar(VecTypeIdx, S32, S64)
947       .clampScalar(IdxTypeIdx, S32, S32);
948   }
949 
950   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
951     .unsupportedIf([=](const LegalityQuery &Query) {
952         const LLT &EltTy = Query.Types[1].getElementType();
953         return Query.Types[0] != EltTy;
954       });
955 
956   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
957     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
958     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
959 
960     // FIXME: Doesn't handle extract of illegal sizes.
961     getActionDefinitionsBuilder(Op)
962       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
963       // FIXME: Multiples of 16 should not be legal.
964       .legalIf([=](const LegalityQuery &Query) {
965           const LLT BigTy = Query.Types[BigTyIdx];
966           const LLT LitTy = Query.Types[LitTyIdx];
967           return (BigTy.getSizeInBits() % 32 == 0) &&
968                  (LitTy.getSizeInBits() % 16 == 0);
969         })
970       .widenScalarIf(
971         [=](const LegalityQuery &Query) {
972           const LLT BigTy = Query.Types[BigTyIdx];
973           return (BigTy.getScalarSizeInBits() < 16);
974         },
975         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
976       .widenScalarIf(
977         [=](const LegalityQuery &Query) {
978           const LLT LitTy = Query.Types[LitTyIdx];
979           return (LitTy.getScalarSizeInBits() < 16);
980         },
981         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
982       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
983       .widenScalarToNextPow2(BigTyIdx, 32);
984 
985   }
986 
987   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
988     .legalForCartesianProduct(AllS32Vectors, {S32})
989     .legalForCartesianProduct(AllS64Vectors, {S64})
990     .clampNumElements(0, V16S32, V32S32)
991     .clampNumElements(0, V2S64, V16S64)
992     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
993 
994   if (ST.hasScalarPackInsts())
995     BuildVector.legalFor({V2S16, S32});
996 
997   BuildVector
998     .minScalarSameAs(1, 0)
999     .legalIf(isRegisterType(0))
1000     .minScalarOrElt(0, S32);
1001 
1002   if (ST.hasScalarPackInsts()) {
1003     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1004       .legalFor({V2S16, S32})
1005       .lower();
1006   } else {
1007     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1008       .lower();
1009   }
1010 
1011   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1012     .legalIf(isRegisterType(0));
1013 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1015   // pre-legalize.
1016   if (ST.hasVOP3PInsts()) {
1017     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1018       .customFor({V2S16, V2S16})
1019       .lower();
1020   } else
1021     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1022 
1023   // Merge/Unmerge
1024   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1025     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1026     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1027 
1028     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1029       const LLT &Ty = Query.Types[TypeIdx];
1030       if (Ty.isVector()) {
1031         const LLT &EltTy = Ty.getElementType();
1032         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1033           return true;
1034         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1035           return true;
1036       }
1037       return false;
1038     };
1039 
1040     auto &Builder = getActionDefinitionsBuilder(Op)
1041       // Try to widen to s16 first for small types.
1042       // TODO: Only do this on targets with legal s16 shifts
1043       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1044 
1045       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1046       .lowerFor({{S16, V2S16}})
1047       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1048       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1049                            elementTypeIs(1, S16)),
1050                        changeTo(1, V2S16))
1051       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1052       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1053       // valid.
1054       .clampScalar(LitTyIdx, S32, S256)
1055       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1056       // Break up vectors with weird elements into scalars
1057       .fewerElementsIf(
1058         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1059         scalarize(0))
1060       .fewerElementsIf(
1061         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1062         scalarize(1))
1063       .clampScalar(BigTyIdx, S32, S1024);
1064 
1065     if (Op == G_MERGE_VALUES) {
1066       Builder.widenScalarIf(
1067         // TODO: Use 16-bit shifts if legal for 8-bit values?
1068         [=](const LegalityQuery &Query) {
1069           const LLT Ty = Query.Types[LitTyIdx];
1070           return Ty.getSizeInBits() < 32;
1071         },
1072         changeTo(LitTyIdx, S32));
1073     }
1074 
1075     Builder.widenScalarIf(
1076       [=](const LegalityQuery &Query) {
1077         const LLT Ty = Query.Types[BigTyIdx];
1078         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1079           Ty.getSizeInBits() % 16 != 0;
1080       },
1081       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
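        // For example s65 widens to s128, while s300 widens to s320 rather
        // than s512.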
1084         const LLT &Ty = Query.Types[BigTyIdx];
1085         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1086         if (NewSizeInBits >= 256) {
1087           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1088           if (RoundedTo < NewSizeInBits)
1089             NewSizeInBits = RoundedTo;
1090         }
1091         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1092       })
1093       .legalIf([=](const LegalityQuery &Query) {
1094           const LLT &BigTy = Query.Types[BigTyIdx];
1095           const LLT &LitTy = Query.Types[LitTyIdx];
1096 
1097           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1098             return false;
1099           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1100             return false;
1101 
1102           return BigTy.getSizeInBits() % 16 == 0 &&
1103                  LitTy.getSizeInBits() % 16 == 0 &&
1104                  BigTy.getSizeInBits() <= 1024;
1105         })
1106       // Any vectors left are the wrong size. Scalarize them.
1107       .scalarize(0)
1108       .scalarize(1);
1109   }
1110 
1111   // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
1112   getActionDefinitionsBuilder(G_SEXT_INREG)
1113     .clampScalar(0, MinLegalScalarShiftTy, S64)
1114     .lower();
1115 
1116   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1117     .legalFor({S64});
1118 
1119   getActionDefinitionsBuilder({
1120       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1121       G_FCOPYSIGN,
1122 
1123       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1124       G_READ_REGISTER,
1125       G_WRITE_REGISTER,
1126 
1127       G_SADDO, G_SSUBO,
1128 
1129        // TODO: Implement
1130       G_FMINIMUM, G_FMAXIMUM
1131     }).lower();
1132 
1133   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1134         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1135         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1136     .unsupported();
1137 
1138   computeTables();
1139   verify(*ST.getInstrInfo());
1140 }
1141 
1142 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1143                                          MachineRegisterInfo &MRI,
1144                                          MachineIRBuilder &B,
1145                                          GISelChangeObserver &Observer) const {
1146   switch (MI.getOpcode()) {
1147   case TargetOpcode::G_ADDRSPACE_CAST:
1148     return legalizeAddrSpaceCast(MI, MRI, B);
1149   case TargetOpcode::G_FRINT:
1150     return legalizeFrint(MI, MRI, B);
1151   case TargetOpcode::G_FCEIL:
1152     return legalizeFceil(MI, MRI, B);
1153   case TargetOpcode::G_INTRINSIC_TRUNC:
1154     return legalizeIntrinsicTrunc(MI, MRI, B);
1155   case TargetOpcode::G_SITOFP:
1156     return legalizeITOFP(MI, MRI, B, true);
1157   case TargetOpcode::G_UITOFP:
1158     return legalizeITOFP(MI, MRI, B, false);
1159   case TargetOpcode::G_FMINNUM:
1160   case TargetOpcode::G_FMAXNUM:
1161   case TargetOpcode::G_FMINNUM_IEEE:
1162   case TargetOpcode::G_FMAXNUM_IEEE:
1163     return legalizeMinNumMaxNum(MI, MRI, B);
1164   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1165     return legalizeExtractVectorElt(MI, MRI, B);
1166   case TargetOpcode::G_INSERT_VECTOR_ELT:
1167     return legalizeInsertVectorElt(MI, MRI, B);
1168   case TargetOpcode::G_SHUFFLE_VECTOR:
1169     return legalizeShuffleVector(MI, MRI, B);
1170   case TargetOpcode::G_FSIN:
1171   case TargetOpcode::G_FCOS:
1172     return legalizeSinCos(MI, MRI, B);
1173   case TargetOpcode::G_GLOBAL_VALUE:
1174     return legalizeGlobalValue(MI, MRI, B);
1175   case TargetOpcode::G_LOAD:
1176     return legalizeLoad(MI, MRI, B, Observer);
1177   case TargetOpcode::G_FMAD:
1178     return legalizeFMad(MI, MRI, B);
1179   case TargetOpcode::G_FDIV:
1180     return legalizeFDIV(MI, MRI, B);
1181   case TargetOpcode::G_ATOMIC_CMPXCHG:
1182     return legalizeAtomicCmpXChg(MI, MRI, B);
1183   default:
1184     return false;
1185   }
1186 
1187   llvm_unreachable("expected switch to return");
1188 }
1189 
1190 Register AMDGPULegalizerInfo::getSegmentAperture(
1191   unsigned AS,
1192   MachineRegisterInfo &MRI,
1193   MachineIRBuilder &B) const {
1194   MachineFunction &MF = B.getMF();
1195   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1196   const LLT S32 = LLT::scalar(32);
1197 
1198   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1199 
1200   if (ST.hasApertureRegs()) {
1201     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1202     // getreg.
1203     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1204         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1205         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1206     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1207         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1208         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
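    // Pack the register ID, bit offset, and width - 1 into the immediate for
    // s_getreg_b32.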
1209     unsigned Encoding =
1210         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1211         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1212         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1213 
1214     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1215     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1216 
1217     B.buildInstr(AMDGPU::S_GETREG_B32)
1218       .addDef(GetReg)
1219       .addImm(Encoding);
1220     MRI.setType(GetReg, S32);
1221 
1222     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1223     B.buildInstr(TargetOpcode::G_SHL)
1224       .addDef(ApertureReg)
1225       .addUse(GetReg)
1226       .addUse(ShiftAmt.getReg(0));
1227 
1228     return ApertureReg;
1229   }
1230 
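  // Without aperture registers, the aperture base must be loaded from the
  // queue pointer provided as a function input.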
1231   Register QueuePtr = MRI.createGenericVirtualRegister(
1232     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1233 
1234   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1235   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1236     return Register();
1237 
1238   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1239   // private_segment_aperture_base_hi.
1240   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1241 
1242   // TODO: can we be smarter about machine pointer info?
1243   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1244   MachineMemOperand *MMO = MF.getMachineMemOperand(
1245     PtrInfo,
1246     MachineMemOperand::MOLoad |
1247     MachineMemOperand::MODereferenceable |
1248     MachineMemOperand::MOInvariant,
1249     4,
1250     MinAlign(64, StructOffset));
1251 
1252   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1253   Register LoadAddr;
1254 
1255   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1256   B.buildLoad(LoadResult, LoadAddr, *MMO);
1257   return LoadResult;
1258 }
1259 
1260 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1261   MachineInstr &MI, MachineRegisterInfo &MRI,
1262   MachineIRBuilder &B) const {
1263   MachineFunction &MF = B.getMF();
1264 
1265   B.setInstr(MI);
1266 
1267   const LLT S32 = LLT::scalar(32);
1268   Register Dst = MI.getOperand(0).getReg();
1269   Register Src = MI.getOperand(1).getReg();
1270 
1271   LLT DstTy = MRI.getType(Dst);
1272   LLT SrcTy = MRI.getType(Src);
1273   unsigned DestAS = DstTy.getAddressSpace();
1274   unsigned SrcAS = SrcTy.getAddressSpace();
1275 
1276   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1277   // vector element.
1278   assert(!DstTy.isVector());
1279 
1280   const AMDGPUTargetMachine &TM
1281     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1282 
1283   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1284   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1285     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1286     return true;
1287   }
1288 
1289   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1290     // Truncate.
1291     B.buildExtract(Dst, Src, 0);
1292     MI.eraseFromParent();
1293     return true;
1294   }
1295 
1296   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1297     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1298     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1299 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1303     auto HighAddr = B.buildConstant(
1304       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1305     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1306     MI.eraseFromParent();
1307     return true;
1308   }
1309 
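  // Casting from flat to local/private: keep the low 32 bits of the pointer,
  // and map the flat null pointer to the segment null value.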
1310   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1311     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1312            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1313     unsigned NullVal = TM.getNullPointerValue(DestAS);
1314 
1315     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1316     auto FlatNull = B.buildConstant(SrcTy, 0);
1317 
1318     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1319 
1320     // Extract low 32-bits of the pointer.
1321     B.buildExtract(PtrLo32, Src, 0);
1322 
1323     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1324     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1325     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1326 
1327     MI.eraseFromParent();
1328     return true;
1329   }
1330 
1331   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1332     return false;
1333 
1334   if (!ST.hasFlatAddressSpace())
1335     return false;
1336 
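  // Casting from local/private to flat: combine the 32-bit offset with the
  // aperture base, and map the segment null pointer to the flat null value.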
1337   auto SegmentNull =
1338       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1339   auto FlatNull =
1340       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1341 
1342   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1343   if (!ApertureReg.isValid())
1344     return false;
1345 
1346   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1347   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1348 
1349   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1350 
1351   // Coerce the type of the low half of the result so we can use merge_values.
1352   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1353   B.buildInstr(TargetOpcode::G_PTRTOINT)
1354     .addDef(SrcAsInt)
1355     .addUse(Src);
1356 
1357   // TODO: Should we allow mismatched types but matching sizes in merges to
1358   // avoid the ptrtoint?
1359   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1360   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1361 
1362   MI.eraseFromParent();
1363   return true;
1364 }
1365 
1366 bool AMDGPULegalizerInfo::legalizeFrint(
1367   MachineInstr &MI, MachineRegisterInfo &MRI,
1368   MachineIRBuilder &B) const {
1369   B.setInstr(MI);
1370 
1371   Register Src = MI.getOperand(1).getReg();
1372   LLT Ty = MRI.getType(Src);
1373   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1374 
1375   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1376   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1377 
1378   auto C1 = B.buildFConstant(Ty, C1Val);
1379   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1380 
1381   // TODO: Should this propagate fast-math-flags?
1382   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1383   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1384 
1385   auto C2 = B.buildFConstant(Ty, C2Val);
1386   auto Fabs = B.buildFAbs(Ty, Src);
1387 
1388   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1389   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1391 }
1392 
1393 bool AMDGPULegalizerInfo::legalizeFceil(
1394   MachineInstr &MI, MachineRegisterInfo &MRI,
1395   MachineIRBuilder &B) const {
1396   B.setInstr(MI);
1397 
1398   const LLT S1 = LLT::scalar(1);
1399   const LLT S64 = LLT::scalar(64);
1400 
1401   Register Src = MI.getOperand(1).getReg();
1402   assert(MRI.getType(Src) == S64);
1403 
1404   // result = trunc(src)
1405   // if (src > 0.0 && src != result)
1406   //   result += 1.0
1407 
1408   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1409 
1410   const auto Zero = B.buildFConstant(S64, 0.0);
1411   const auto One = B.buildFConstant(S64, 1.0);
1412   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1413   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1414   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1415   auto Add = B.buildSelect(S64, And, One, Zero);
1416 
1417   // TODO: Should this propagate fast-math-flags?
1418   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1420 }
1421 
1422 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1423                                               MachineIRBuilder &B) {
1424   const unsigned FractBits = 52;
1425   const unsigned ExpBits = 11;
1426   LLT S32 = LLT::scalar(32);
1427 
1428   auto Const0 = B.buildConstant(S32, FractBits - 32);
1429   auto Const1 = B.buildConstant(S32, ExpBits);
1430 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1434 
1435   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1436 }
1437 
1438 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1439   MachineInstr &MI, MachineRegisterInfo &MRI,
1440   MachineIRBuilder &B) const {
1441   B.setInstr(MI);
1442 
1443   const LLT S1 = LLT::scalar(1);
1444   const LLT S32 = LLT::scalar(32);
1445   const LLT S64 = LLT::scalar(64);
1446 
1447   Register Src = MI.getOperand(1).getReg();
1448   assert(MRI.getType(Src) == S64);
1449 
1450   // TODO: Should this use extract since the low half is unused?
1451   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1452   Register Hi = Unmerge.getReg(1);
1453 
1454   // Extract the upper half, since this is where we will find the sign and
1455   // exponent.
1456   auto Exp = extractF64Exponent(Hi, B);
1457 
1458   const unsigned FractBits = 52;
1459 
1460   // Extract the sign bit.
1461   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1462   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1463 
1464   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1465 
1466   const auto Zero32 = B.buildConstant(S32, 0);
1467 
1468   // Extend back to 64-bits.
1469   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1470 
1471   auto Shr = B.buildAShr(S64, FractMask, Exp);
1472   auto Not = B.buildNot(S64, Shr);
1473   auto Tmp0 = B.buildAnd(S64, Src, Not);
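  // Shifting FractMask right by the exponent leaves a mask of the fraction
  // bits that must be discarded; Tmp0 is Src with those bits cleared.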
1474   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1475 
1476   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1477   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
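  // An exponent below zero means there is no integer part, so only the sign
  // survives (+/-0); an exponent above 51 means there are no fraction bits and
  // the source is already an integer.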
1478 
1479   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1480   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1482 }
1483 
1484 bool AMDGPULegalizerInfo::legalizeITOFP(
1485   MachineInstr &MI, MachineRegisterInfo &MRI,
1486   MachineIRBuilder &B, bool Signed) const {
1487   B.setInstr(MI);
1488 
1489   Register Dst = MI.getOperand(0).getReg();
1490   Register Src = MI.getOperand(1).getReg();
1491 
1492   const LLT S64 = LLT::scalar(64);
1493   const LLT S32 = LLT::scalar(32);
1494 
1495   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1496 
1497   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1498 
1499   auto CvtHi = Signed ?
1500     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1501     B.buildUITOFP(S64, Unmerge.getReg(1));
1502 
1503   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
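  // Recombine as hi * 2^32 + lo: ldexp scales the converted high half by 2^32
  // before the low half is added.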
1504 
1505   auto ThirtyTwo = B.buildConstant(S32, 32);
1506   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1507     .addUse(CvtHi.getReg(0))
1508     .addUse(ThirtyTwo.getReg(0));
1509 
1510   // TODO: Should this propagate fast-math-flags?
1511   B.buildFAdd(Dst, LdExp, CvtLo);
1512   MI.eraseFromParent();
1513   return true;
1514 }
1515 
1516 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1517   MachineInstr &MI, MachineRegisterInfo &MRI,
1518   MachineIRBuilder &B) const {
1519   MachineFunction &MF = B.getMF();
1520   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1521 
1522   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1523                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1524 
1525   // With ieee_mode disabled, the instructions have the correct behavior
1526   // already for G_FMINNUM/G_FMAXNUM
1527   if (!MFI->getMode().IEEE)
1528     return !IsIEEEOp;
1529 
1530   if (IsIEEEOp)
1531     return true;
1532 
1533   MachineIRBuilder HelperBuilder(MI);
1534   GISelObserverWrapper DummyObserver;
1535   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1536   HelperBuilder.setInstr(MI);
1537   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1538 }
1539 
1540 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1541   MachineInstr &MI, MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   // TODO: Should move some of this into LegalizerHelper.
1544 
1545   // TODO: Promote dynamic indexing of s16 to s32
1546   // TODO: Dynamic s64 indexing is only legal for SGPR.
1547   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1548   if (!IdxVal) // Dynamic case will be selected to register indexing.
1549     return true;
1550 
1551   Register Dst = MI.getOperand(0).getReg();
1552   Register Vec = MI.getOperand(1).getReg();
1553 
1554   LLT VecTy = MRI.getType(Vec);
1555   LLT EltTy = VecTy.getElementType();
1556   assert(EltTy == MRI.getType(Dst));
1557 
1558   B.setInstr(MI);
1559 
1560   if (IdxVal.getValue() < VecTy.getNumElements())
1561     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1562   else
1563     B.buildUndef(Dst);
1564 
1565   MI.eraseFromParent();
1566   return true;
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   // TODO: Should move some of this into LegalizerHelper.
1573 
1574   // TODO: Promote dynamic indexing of s16 to s32
1575   // TODO: Dynamic s64 indexing is only legal for SGPR.
1576   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1577   if (!IdxVal) // Dynamic case will be selected to register indexing.
1578     return true;
1579 
1580   Register Dst = MI.getOperand(0).getReg();
1581   Register Vec = MI.getOperand(1).getReg();
1582   Register Ins = MI.getOperand(2).getReg();
1583 
1584   LLT VecTy = MRI.getType(Vec);
1585   LLT EltTy = VecTy.getElementType();
1586   assert(EltTy == MRI.getType(Ins));
1587 
1588   B.setInstr(MI);
1589 
1590   if (IdxVal.getValue() < VecTy.getNumElements())
1591     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1592   else
1593     B.buildUndef(Dst);
1594 
1595   MI.eraseFromParent();
1596   return true;
1597 }
1598 
1599 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1600   assert(Mask.size() == 2);
1601 
1602   // If one half is undef, the other is trivially in the same reg.
1603   if (Mask[0] == -1 || Mask[1] == -1)
1604     return true;
1605   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1606          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1607 }
1608 
1609 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1610   MachineInstr &MI, MachineRegisterInfo &MRI,
1611   MachineIRBuilder &B) const {
1612   const LLT V2S16 = LLT::vector(2, 16);
1613 
1614   Register Dst = MI.getOperand(0).getReg();
1615   Register Src0 = MI.getOperand(1).getReg();
1616   LLT DstTy = MRI.getType(Dst);
1617   LLT SrcTy = MRI.getType(Src0);
1618 
1619   if (SrcTy == V2S16 && DstTy == V2S16 &&
1620       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1621     return true;
1622 
1623   MachineIRBuilder HelperBuilder(MI);
1624   GISelObserverWrapper DummyObserver;
1625   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1626   HelperBuilder.setInstr(MI);
1627   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeSinCos(
1631   MachineInstr &MI, MachineRegisterInfo &MRI,
1632   MachineIRBuilder &B) const {
1633   B.setInstr(MI);
1634 
1635   Register DstReg = MI.getOperand(0).getReg();
1636   Register SrcReg = MI.getOperand(1).getReg();
1637   LLT Ty = MRI.getType(DstReg);
1638   unsigned Flags = MI.getFlags();
1639 
1640   Register TrigVal;
1641   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1642   if (ST.hasTrigReducedRange()) {
1643     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1644     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1645       .addUse(MulVal.getReg(0))
1646       .setMIFlags(Flags).getReg(0);
1647   } else
1648     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1649 
1650   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1651     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1652   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1653     .addUse(TrigVal)
1654     .setMIFlags(Flags);
1655   MI.eraseFromParent();
1656   return true;
1657 }
1658 
1659 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1660   Register DstReg, LLT PtrTy,
1661   MachineIRBuilder &B, const GlobalValue *GV,
1662   unsigned Offset, unsigned GAFlags) const {
1663   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1664   // to the following code sequence:
1665   //
1666   // For constant address space:
1667   //   s_getpc_b64 s[0:1]
1668   //   s_add_u32 s0, s0, $symbol
1669   //   s_addc_u32 s1, s1, 0
1670   //
1671   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1672   //   a fixup or relocation is emitted to replace $symbol with a literal
1673   //   constant, which is a pc-relative offset from the encoding of the $symbol
1674   //   operand to the global variable.
1675   //
1676   // For global address space:
1677   //   s_getpc_b64 s[0:1]
1678   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1679   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1680   //
1681   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1682   //   fixups or relocations are emitted to replace $symbol@*@lo and
1683   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1684   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1685   //   operand to the global variable.
1686   //
1687   // What we want here is an offset from the value returned by s_getpc
1688   // (which is the address of the s_add_u32 instruction) to the global
1689   // variable, but since the encoding of $symbol starts 4 bytes after the start
1690   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1691   // small. This requires us to add 4 to the global variable offset in order to
1692   // compute the correct address.
1693 
1694   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1695 
1696   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1697     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
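  // For a 32-bit destination the full 64-bit address is computed in a scratch
  // register and the low 32 bits are extracted at the end.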
1698 
1699   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1700     .addDef(PCReg);
1701 
1702   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1703   if (GAFlags == SIInstrInfo::MO_NONE)
1704     MIB.addImm(0);
1705   else
1706     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1707 
1708   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1709 
1710   if (PtrTy.getSizeInBits() == 32)
1711     B.buildExtract(DstReg, PCReg, 0);
1712   return true;
1713  }
1714 
1715 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1716   MachineInstr &MI, MachineRegisterInfo &MRI,
1717   MachineIRBuilder &B) const {
1718   Register DstReg = MI.getOperand(0).getReg();
1719   LLT Ty = MRI.getType(DstReg);
1720   unsigned AS = Ty.getAddressSpace();
1721 
1722   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1723   MachineFunction &MF = B.getMF();
1724   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1725   B.setInstr(MI);
1726 
1727   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1728     if (!MFI->isEntryFunction()) {
1729       const Function &Fn = MF.getFunction();
1730       DiagnosticInfoUnsupported BadLDSDecl(
1731         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1732       Fn.getContext().diagnose(BadLDSDecl);
1733     }
1734 
1735     // TODO: We could emit code to handle the initialization somewhere.
1736     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1737       const SITargetLowering *TLI = ST.getTargetLowering();
1738       if (!TLI->shouldUseLDSConstAddress(GV)) {
1739         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1741       }
1742 
1743       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1744       MI.eraseFromParent();
1745       return true;
1746     }
1747 
1748     const Function &Fn = MF.getFunction();
1749     DiagnosticInfoUnsupported BadInit(
1750       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1751     Fn.getContext().diagnose(BadInit);
1752     return true;
1753   }
1754 
1755   const SITargetLowering *TLI = ST.getTargetLowering();
1756 
1757   if (TLI->shouldEmitFixup(GV)) {
1758     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1759     MI.eraseFromParent();
1760     return true;
1761   }
1762 
1763   if (TLI->shouldEmitPCReloc(GV)) {
1764     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1765     MI.eraseFromParent();
1766     return true;
1767   }
1768 
1769   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1770   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1771 
1772   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1773     MachinePointerInfo::getGOT(MF),
1774     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1775     MachineMemOperand::MOInvariant,
1776     8 /*Size*/, 8 /*Align*/);
1777 
1778   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1779 
1780   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1782     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1783     B.buildExtract(DstReg, Load, 0);
1784   } else
1785     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1786 
1787   MI.eraseFromParent();
1788   return true;
1789 }
1790 
1791 bool AMDGPULegalizerInfo::legalizeLoad(
1792   MachineInstr &MI, MachineRegisterInfo &MRI,
1793   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1794   B.setInstr(MI);
1795   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1796   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1797   Observer.changingInstr(MI);
1798   MI.getOperand(1).setReg(Cast.getReg(0));
1799   Observer.changedInstr(MI);
1800   return true;
1801 }
1802 
1803 bool AMDGPULegalizerInfo::legalizeFMad(
1804   MachineInstr &MI, MachineRegisterInfo &MRI,
1805   MachineIRBuilder &B) const {
1806   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1807   assert(Ty.isScalar());
1808 
1809   MachineFunction &MF = B.getMF();
1810   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1811 
1812   // TODO: Always legal with future ftz flag.
1813   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1814     return true;
1815   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

1819   MachineIRBuilder HelperBuilder(MI);
1820   GISelObserverWrapper DummyObserver;
1821   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1822   HelperBuilder.setMBB(*MI.getParent());
1823   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1824 }
1825 
1826 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1827   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1828   Register DstReg = MI.getOperand(0).getReg();
1829   Register PtrReg = MI.getOperand(1).getReg();
1830   Register CmpVal = MI.getOperand(2).getReg();
1831   Register NewVal = MI.getOperand(3).getReg();
1832 
1833   assert(SITargetLowering::isFlatGlobalAddrSpace(
1834            MRI.getType(PtrReg).getAddressSpace()) &&
1835          "this should not have been custom lowered");
1836 
1837   LLT ValTy = MRI.getType(CmpVal);
1838   LLT VecTy = LLT::vector(2, ValTy);
1839 
1840   B.setInstr(MI);
1841   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1842 
1843   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1844     .addDef(DstReg)
1845     .addUse(PtrReg)
1846     .addUse(PackedVal)
1847     .setMemRefs(MI.memoperands());
1848 
1849   MI.eraseFromParent();
1850   return true;
1851 }
1852 
// Return the use branch instruction, or null if the usage is invalid.
1854 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1855                                        MachineRegisterInfo &MRI,
1856                                        MachineInstr *&Br) {
1857   Register CondDef = MI.getOperand(0).getReg();
1858   if (!MRI.hasOneNonDBGUse(CondDef))
1859     return nullptr;
1860 
1861   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1862   if (UseMI.getParent() != MI.getParent() ||
1863       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1864     return nullptr;
1865 
1866   // Make sure the cond br is followed by a G_BR
1867   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1868   if (Next != MI.getParent()->end()) {
1869     if (Next->getOpcode() != AMDGPU::G_BR)
1870       return nullptr;
1871     Br = &*Next;
1872   }
1873 
1874   return &UseMI;
1875 }
1876 
1877 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1878                                                 Register Reg, LLT Ty) const {
1879   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1880   if (LiveIn)
1881     return LiveIn;
1882 
1883   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1884   MRI.addLiveIn(Reg, NewReg);
1885   return NewReg;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1889                                          const ArgDescriptor *Arg) const {
1890   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1891     return false; // TODO: Handle these
1892 
1893   assert(Arg->getRegister().isPhysical());
1894 
1895   MachineRegisterInfo &MRI = *B.getMRI();
1896 
1897   LLT Ty = MRI.getType(DstReg);
1898   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1899 
1900   if (Arg->isMasked()) {
1901     // TODO: Should we try to emit this once in the entry block?
1902     const LLT S32 = LLT::scalar(32);
1903     const unsigned Mask = Arg->getMask();
1904     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
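    // The value is packed with other arguments in this register; shift it down
    // and mask off the other fields.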
1905 
1906     Register AndMaskSrc = LiveIn;
1907 
1908     if (Shift != 0) {
1909       auto ShiftAmt = B.buildConstant(S32, Shift);
1910       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1911     }
1912 
1913     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1914   } else
1915     B.buildCopy(DstReg, LiveIn);
1916 
  // Insert the argument copy if it doesn't already exist.
1918   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1919   if (!MRI.getVRegDef(LiveIn)) {
1920     // FIXME: Should have scoped insert pt
1921     MachineBasicBlock &OrigInsBB = B.getMBB();
1922     auto OrigInsPt = B.getInsertPt();
1923 
1924     MachineBasicBlock &EntryMBB = B.getMF().front();
1925     EntryMBB.addLiveIn(Arg->getRegister());
1926     B.setInsertPt(EntryMBB, EntryMBB.begin());
1927     B.buildCopy(LiveIn, Arg->getRegister());
1928 
1929     B.setInsertPt(OrigInsBB, OrigInsPt);
1930   }
1931 
1932   return true;
1933 }
1934 
1935 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1936   MachineInstr &MI,
1937   MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B,
1939   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1940   B.setInstr(MI);
1941 
1942   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1943 
1944   const ArgDescriptor *Arg;
1945   const TargetRegisterClass *RC;
1946   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1947   if (!Arg) {
1948     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1949     return false;
1950   }
1951 
1952   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1953     MI.eraseFromParent();
1954     return true;
1955   }
1956 
1957   return false;
1958 }
1959 
1960 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1961                                        MachineRegisterInfo &MRI,
1962                                        MachineIRBuilder &B) const {
1963   B.setInstr(MI);
1964   Register Dst = MI.getOperand(0).getReg();
1965   LLT DstTy = MRI.getType(Dst);
1966   LLT S16 = LLT::scalar(16);
1967   LLT S32 = LLT::scalar(32);
1968   LLT S64 = LLT::scalar(64);
1969 
1970   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1971     return true;
1972 
1973   if (DstTy == S16)
1974     return legalizeFDIV16(MI, MRI, B);
1975   if (DstTy == S32)
1976     return legalizeFDIV32(MI, MRI, B);
1977   if (DstTy == S64)
1978     return legalizeFDIV64(MI, MRI, B);
1979 
1980   return false;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1984                                                  MachineRegisterInfo &MRI,
1985                                                  MachineIRBuilder &B) const {
1986   Register Res = MI.getOperand(0).getReg();
1987   Register LHS = MI.getOperand(1).getReg();
1988   Register RHS = MI.getOperand(2).getReg();
1989 
1990   uint16_t Flags = MI.getFlags();
1991 
1992   LLT ResTy = MRI.getType(Res);
1993   LLT S32 = LLT::scalar(32);
1994   LLT S64 = LLT::scalar(64);
1995 
1996   const MachineFunction &MF = B.getMF();
1997   bool Unsafe =
1998     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1999 
2000   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2001     return false;
2002 
2003   if (!Unsafe && ResTy == S32 &&
2004       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
2005     return false;
2006 
2007   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2008     // 1 / x -> RCP(x)
2009     if (CLHS->isExactlyValue(1.0)) {
2010       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2011         .addUse(RHS)
2012         .setMIFlags(Flags);
2013 
2014       MI.eraseFromParent();
2015       return true;
2016     }
2017 
2018     // -1 / x -> RCP( FNEG(x) )
2019     if (CLHS->isExactlyValue(-1.0)) {
2020       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2021       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2022         .addUse(FNeg.getReg(0))
2023         .setMIFlags(Flags);
2024 
2025       MI.eraseFromParent();
2026       return true;
2027     }
2028   }
2029 
2030   // x / y -> x * (1.0 / y)
2031   if (Unsafe) {
2032     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2033       .addUse(RHS)
2034       .setMIFlags(Flags);
2035     B.buildFMul(Res, LHS, RCP, Flags);
2036 
2037     MI.eraseFromParent();
2038     return true;
2039   }
2040 
2041   return false;
2042 }
2043 
2044 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2045                                          MachineRegisterInfo &MRI,
2046                                          MachineIRBuilder &B) const {
2047   B.setInstr(MI);
2048   Register Res = MI.getOperand(0).getReg();
2049   Register LHS = MI.getOperand(1).getReg();
2050   Register RHS = MI.getOperand(2).getReg();
2051 
2052   uint16_t Flags = MI.getFlags();
2053 
2054   LLT S16 = LLT::scalar(16);
2055   LLT S32 = LLT::scalar(32);
2056 
2057   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2058   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2059 
2060   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2061     .addUse(RHSExt.getReg(0))
2062     .setMIFlags(Flags);
2063 
2064   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2065   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2066 
2067   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2068     .addUse(RDst.getReg(0))
2069     .addUse(RHS)
2070     .addUse(LHS)
2071     .setMIFlags(Flags);
2072 
2073   MI.eraseFromParent();
2074   return true;
2075 }
2076 
2077 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2078 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2079 static void toggleSPDenormMode(bool Enable,
2080                                MachineIRBuilder &B,
2081                                const GCNSubtarget &ST,
2082                                AMDGPU::SIModeRegisterDefaults Mode) {
2083   // Set SP denorm mode to this value.
2084   unsigned SPDenormMode =
2085     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2086 
2087   if (ST.hasDenormModeInst()) {
2088     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2089     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2090                                    ? FP_DENORM_FLUSH_NONE
2091                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2092 
2093     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
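    // s_denorm_mode takes a 4-bit immediate: the FP32 controls go in bits
    // [1:0] and the FP64/FP16 controls in bits [3:2].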
2094     B.buildInstr(AMDGPU::S_DENORM_MODE)
2095       .addImm(NewDenormModeValue);
2096 
2097   } else {
2098     // Select FP32 bit field in mode register.
2099     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2100                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2101                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
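    // This encodes hwreg(MODE, offset 4, width 2), i.e. the two FP32 denorm
    // bits of the mode register.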
2102 
2103     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2104       .addImm(SPDenormMode)
2105       .addImm(SPDenormModeBitField);
2106   }
2107 }
2108 
2109 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2110                                          MachineRegisterInfo &MRI,
2111                                          MachineIRBuilder &B) const {
2112   B.setInstr(MI);
2113   Register Res = MI.getOperand(0).getReg();
2114   Register LHS = MI.getOperand(1).getReg();
2115   Register RHS = MI.getOperand(2).getReg();
2116   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2117   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2118 
2119   uint16_t Flags = MI.getFlags();
2120 
2121   LLT S32 = LLT::scalar(32);
2122   LLT S1 = LLT::scalar(1);
2123 
2124   auto One = B.buildFConstant(S32, 1.0f);
2125 
2126   auto DenominatorScaled =
2127     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2128       .addUse(RHS)
2129       .addUse(LHS)
2130       .addImm(1)
2131       .setMIFlags(Flags);
2132   auto NumeratorScaled =
2133     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2134       .addUse(LHS)
2135       .addUse(RHS)
2136       .addImm(0)
2137       .setMIFlags(Flags);
2138 
2139   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2140     .addUse(DenominatorScaled.getReg(0))
2141     .setMIFlags(Flags);
2142   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2143 
2144   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2145   // aren't modeled as reading it.
2146   if (!Mode.FP32Denormals)
2147     toggleSPDenormMode(true, B, ST, Mode);
2148 
2149   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2150   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2151   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2152   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2153   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2154   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
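  // Fma0 is the error of the reciprocal estimate and Fma1 the refined
  // reciprocal (one Newton-Raphson step); Mul is the initial quotient, Fma3
  // the refined quotient, and Fma2/Fma4 the residuals, with Fma4 feeding
  // div_fmas below.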
2155 
2156   if (!Mode.FP32Denormals)
2157     toggleSPDenormMode(false, B, ST, Mode);
2158 
2159   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2160     .addUse(Fma4.getReg(0))
2161     .addUse(Fma1.getReg(0))
2162     .addUse(Fma3.getReg(0))
2163     .addUse(NumeratorScaled.getReg(1))
2164     .setMIFlags(Flags);
2165 
2166   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2167     .addUse(Fmas.getReg(0))
2168     .addUse(RHS)
2169     .addUse(LHS)
2170     .setMIFlags(Flags);
2171 
2172   MI.eraseFromParent();
2173   return true;
2174 }
2175 
2176 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2177                                          MachineRegisterInfo &MRI,
2178                                          MachineIRBuilder &B) const {
2179   B.setInstr(MI);
2180   Register Res = MI.getOperand(0).getReg();
2181   Register LHS = MI.getOperand(1).getReg();
2182   Register RHS = MI.getOperand(2).getReg();
2183 
2184   uint16_t Flags = MI.getFlags();
2185 
2186   LLT S64 = LLT::scalar(64);
2187   LLT S1 = LLT::scalar(1);
2188 
2189   auto One = B.buildFConstant(S64, 1.0);
2190 
2191   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2192     .addUse(LHS)
2193     .addUse(RHS)
2194     .addImm(1)
2195     .setMIFlags(Flags);
2196 
2197   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2198 
2199   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2200     .addUse(DivScale0.getReg(0))
2201     .setMIFlags(Flags);
2202 
2203   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2204   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2205   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2206 
2207   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2208     .addUse(LHS)
2209     .addUse(RHS)
2210     .addImm(0)
2211     .setMIFlags(Flags);
2212 
2213   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2215   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
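  // The same reciprocal refinement and quotient/residual sequence as the f32
  // path, carried out in double precision.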
2216 
2217   Register Scale;
2218   if (!ST.hasUsableDivScaleConditionOutput()) {
2219     // Workaround a hardware bug on SI where the condition output from div_scale
2220     // is not usable.
2221 
2222     Scale = MRI.createGenericVirtualRegister(S1);
2223 
2224     LLT S32 = LLT::scalar(32);
2225 
2226     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2227     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2228     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2229     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2230 
2231     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2232                               Scale1Unmerge.getReg(1));
2233     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2234                               Scale0Unmerge.getReg(1));
2235     B.buildXor(Scale, CmpNum, CmpDen);
2236   } else {
2237     Scale = DivScale1.getReg(1);
2238   }
2239 
2240   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2241     .addUse(Fma4.getReg(0))
2242     .addUse(Fma3.getReg(0))
2243     .addUse(Mul.getReg(0))
2244     .addUse(Scale)
2245     .setMIFlags(Flags);
2246 
2247   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2248     .addUse(Fmas.getReg(0))
2249     .addUse(RHS)
2250     .addUse(LHS)
2251     .setMIFlags(Flags);
2252 
2253   MI.eraseFromParent();
2254   return true;
2255 }
2256 
2257 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2258                                                  MachineRegisterInfo &MRI,
2259                                                  MachineIRBuilder &B) const {
2260   B.setInstr(MI);
2261   Register Res = MI.getOperand(0).getReg();
2262   Register LHS = MI.getOperand(2).getReg();
2263   Register RHS = MI.getOperand(3).getReg();
2264   uint16_t Flags = MI.getFlags();
2265 
2266   LLT S32 = LLT::scalar(32);
2267   LLT S1 = LLT::scalar(1);
2268 
2269   auto Abs = B.buildFAbs(S32, RHS, Flags);
2270   const APFloat C0Val(1.0f);
2271 
2272   auto C0 = B.buildConstant(S32, 0x6f800000);
2273   auto C1 = B.buildConstant(S32, 0x2f800000);
2274   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
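  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as IEEE-754 bit patterns:
  // denominators with a magnitude above 2^96 are pre-scaled by 2^-32 to keep
  // the intermediate reciprocal away from the denormal range, and the same
  // scale factor is applied to the final result so the quotient is unchanged.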
2275 
2276   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2277   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2278 
2279   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2280 
2281   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2282     .addUse(Mul0.getReg(0))
2283     .setMIFlags(Flags);
2284 
2285   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2286 
2287   B.buildFMul(Res, Sel, Mul1, Flags);
2288 
2289   MI.eraseFromParent();
2290   return true;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2294                                                  MachineRegisterInfo &MRI,
2295                                                  MachineIRBuilder &B) const {
2296   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2297   if (!MFI->isEntryFunction()) {
2298     return legalizePreloadedArgIntrin(MI, MRI, B,
2299                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2300   }
2301 
2302   B.setInstr(MI);
2303 
2304   uint64_t Offset =
2305     ST.getTargetLowering()->getImplicitParameterOffset(
2306       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2307   Register DstReg = MI.getOperand(0).getReg();
2308   LLT DstTy = MRI.getType(DstReg);
2309   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2310 
2311   const ArgDescriptor *Arg;
2312   const TargetRegisterClass *RC;
2313   std::tie(Arg, RC)
2314     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2315   if (!Arg)
2316     return false;
2317 
2318   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2319   if (!loadInputValue(KernargPtrReg, B, Arg))
2320     return false;
2321 
2322   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2323   MI.eraseFromParent();
2324   return true;
2325 }
2326 
2327 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2328                                               MachineRegisterInfo &MRI,
2329                                               MachineIRBuilder &B,
2330                                               unsigned AddrSpace) const {
2331   B.setInstr(MI);
2332   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2333   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2334   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2335   MI.eraseFromParent();
2336   return true;
2337 }
2338 
2339 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2340 // offset (the offset that is included in bounds checking and swizzling, to be
2341 // split between the instruction's voffset and immoffset fields) and soffset
2342 // (the offset that is excluded from bounds checking and swizzling, to go in
2343 // the instruction's soffset field).  This function takes the first kind of
2344 // offset and figures out how to split it between voffset and immoffset.
2345 std::tuple<Register, unsigned, unsigned>
2346 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2347                                         Register OrigOffset) const {
2348   const unsigned MaxImm = 4095;
2349   Register BaseReg;
2350   unsigned TotalConstOffset;
2351   MachineInstr *OffsetDef;
2352   const LLT S32 = LLT::scalar(32);
2353 
2354   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2355     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2356 
2357   unsigned ImmOffset = TotalConstOffset;
2358 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
  // of being CSEd with the copy/add for another similar load/store.
2363   // However, do not do that rounding down to a multiple of 4096 if that is a
2364   // negative number, as it appears to be illegal to have a negative offset
2365   // in the vgpr, even if adding the immediate offset makes it positive.
2366   unsigned Overflow = ImmOffset & ~MaxImm;
2367   ImmOffset -= Overflow;
2368   if ((int32_t)Overflow < 0) {
2369     Overflow += ImmOffset;
2370     ImmOffset = 0;
2371   }
2372 
2373   if (Overflow != 0) {
2374     if (!BaseReg) {
2375       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2376     } else {
2377       auto OverflowVal = B.buildConstant(S32, Overflow);
2378       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2379     }
2380   }
2381 
2382   if (!BaseReg)
2383     BaseReg = B.buildConstant(S32, 0).getReg(0);
2384 
2385   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2386 }
2387 
2388 /// Handle register layout difference for f16 images for some subtargets.
2389 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2390                                              MachineRegisterInfo &MRI,
2391                                              Register Reg) const {
2392   if (!ST.hasUnpackedD16VMem())
2393     return Reg;
2394 
2395   const LLT S16 = LLT::scalar(16);
2396   const LLT S32 = LLT::scalar(32);
2397   LLT StoreVT = MRI.getType(Reg);
2398   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
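  // With unpacked D16 each 16-bit element occupies its own 32-bit register, so
  // widen every element to 32 bits before building the store value.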
2399 
2400   auto Unmerge = B.buildUnmerge(S16, Reg);
2401 
2402   SmallVector<Register, 4> WideRegs;
2403   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2404     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2405 
2406   int NumElts = StoreVT.getNumElements();
2407 
2408   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2409 }
2410 
2411 Register AMDGPULegalizerInfo::fixStoreSourceType(
2412   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2413   MachineRegisterInfo *MRI = B.getMRI();
2414   LLT Ty = MRI->getType(VData);
2415 
2416   const LLT S16 = LLT::scalar(16);
2417 
2418   // Fixup illegal register types for i8 stores.
2419   if (Ty == LLT::scalar(8) || Ty == S16) {
2420     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2421     return AnyExt;
2422   }
2423 
2424   if (Ty.isVector()) {
2425     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2426       if (IsFormat)
2427         return handleD16VData(B, *MRI, VData);
2428     }
2429   }
2430 
2431   return VData;
2432 }
2433 
2434 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2435                                               MachineRegisterInfo &MRI,
2436                                               MachineIRBuilder &B,
2437                                               bool IsTyped,
2438                                               bool IsFormat) const {
2439   B.setInstr(MI);
2440 
2441   Register VData = MI.getOperand(1).getReg();
2442   LLT Ty = MRI.getType(VData);
2443   LLT EltTy = Ty.getScalarType();
2444   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2445   const LLT S32 = LLT::scalar(32);
2446 
2447   VData = fixStoreSourceType(B, VData, IsFormat);
2448   Register RSrc = MI.getOperand(2).getReg();
2449 
2450   MachineMemOperand *MMO = *MI.memoperands_begin();
2451   const int MemSize = MMO->getSize();
2452 
2453   unsigned ImmOffset;
2454   unsigned TotalOffset;
2455 
2456   // The typed intrinsics add an immediate after the registers.
2457   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2458 
2459   // The struct intrinsic variants add one additional operand over raw.
2460   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2461   Register VIndex;
2462   int OpOffset = 0;
2463   if (HasVIndex) {
2464     VIndex = MI.getOperand(3).getReg();
2465     OpOffset = 1;
2466   }
2467 
2468   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2469   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2470 
2471   unsigned Format = 0;
2472   if (IsTyped) {
2473     Format = MI.getOperand(5 + OpOffset).getImm();
2474     ++OpOffset;
2475   }
2476 
2477   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2478 
2479   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2480   if (TotalOffset != 0)
2481     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2482 
2483   unsigned Opc;
2484   if (IsTyped) {
2485     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2486                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2487   } else if (IsFormat) {
2488     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2489                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2490   } else {
2491     switch (MemSize) {
2492     case 1:
2493       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2494       break;
2495     case 2:
2496       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2497       break;
2498     default:
2499       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2500       break;
2501     }
2502   }
2503 
2504   if (!VIndex)
2505     VIndex = B.buildConstant(S32, 0).getReg(0);
2506 
2507   auto MIB = B.buildInstr(Opc)
2508     .addUse(VData)              // vdata
2509     .addUse(RSrc)               // rsrc
2510     .addUse(VIndex)             // vindex
2511     .addUse(VOffset)            // voffset
2512     .addUse(SOffset)            // soffset
2513     .addImm(ImmOffset);         // offset(imm)
2514 
2515   if (IsTyped)
2516     MIB.addImm(Format);
2517 
2518   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2519      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2520      .addMemOperand(MMO);
2521 
2522   MI.eraseFromParent();
2523   return true;
2524 }
2525 
2526 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2527                                              MachineRegisterInfo &MRI,
2528                                              MachineIRBuilder &B,
2529                                              bool IsFormat,
2530                                              bool IsTyped) const {
2531   B.setInstr(MI);
2532 
2533   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2534   MachineMemOperand *MMO = *MI.memoperands_begin();
2535   const int MemSize = MMO->getSize();
2536   const LLT S32 = LLT::scalar(32);
2537 
2538   Register Dst = MI.getOperand(0).getReg();
2539   Register RSrc = MI.getOperand(2).getReg();
2540 
2541   // The typed intrinsics add an immediate after the registers.
2542   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2543 
2544   // The struct intrinsic variants add one additional operand over raw.
2545   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2546   Register VIndex;
2547   int OpOffset = 0;
2548   if (HasVIndex) {
2549     VIndex = MI.getOperand(3).getReg();
2550     OpOffset = 1;
2551   }
2552 
2553   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2554   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2555 
2556   unsigned Format = 0;
2557   if (IsTyped) {
2558     Format = MI.getOperand(5 + OpOffset).getImm();
2559     ++OpOffset;
2560   }
2561 
2562   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2563   unsigned ImmOffset;
2564   unsigned TotalOffset;
2565 
2566   LLT Ty = MRI.getType(Dst);
2567   LLT EltTy = Ty.getScalarType();
2568   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2569   const bool Unpacked = ST.hasUnpackedD16VMem();
2570 
2571   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2572   if (TotalOffset != 0)
2573     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2574 
2575   unsigned Opc;
2576 
2577   if (IsTyped) {
2578     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2579                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2580   } else if (IsFormat) {
2581     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2582                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2583   } else {
2584     switch (MemSize) {
2585     case 1:
2586       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2587       break;
2588     case 2:
2589       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2590       break;
2591     default:
2592       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2593       break;
2594     }
2595   }
2596 
2597   Register LoadDstReg;
2598 
2599   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2600   LLT UnpackedTy = Ty.changeElementSize(32);
2601 
2602   if (IsExtLoad)
2603     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2604   else if (Unpacked && IsD16 && Ty.isVector())
2605     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2606   else
2607     LoadDstReg = Dst;
2608 
2609   if (!VIndex)
2610     VIndex = B.buildConstant(S32, 0).getReg(0);
2611 
2612   auto MIB = B.buildInstr(Opc)
2613     .addDef(LoadDstReg)         // vdata
2614     .addUse(RSrc)               // rsrc
2615     .addUse(VIndex)             // vindex
2616     .addUse(VOffset)            // voffset
2617     .addUse(SOffset)            // soffset
2618     .addImm(ImmOffset);         // offset(imm)
2619 
2620   if (IsTyped)
2621     MIB.addImm(Format);
2622 
2623   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2624      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2625      .addMemOperand(MMO);
2626 
2627   if (LoadDstReg != Dst) {
2628     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2629 
    // The result of an extending load was widened to 32 bits; truncate it back
    // to the original type.
2631     if (IsExtLoad)
2632       B.buildTrunc(Dst, LoadDstReg);
2633     else {
2634       // Repack to original 16-bit vector result
2635       // FIXME: G_TRUNC should work, but legalization currently fails
2636       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2637       SmallVector<Register, 4> Repack;
2638       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2639         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2640       B.buildMerge(Dst, Repack);
2641     }
2642   }
2643 
2644   MI.eraseFromParent();
2645   return true;
2646 }
2647 
2648 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2649                                                MachineIRBuilder &B,
2650                                                bool IsInc) const {
2651   B.setInstr(MI);
2652   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2653                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2654   B.buildInstr(Opc)
2655     .addDef(MI.getOperand(0).getReg())
2656     .addUse(MI.getOperand(2).getReg())
2657     .addUse(MI.getOperand(3).getReg())
2658     .cloneMemRefs(MI);
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2664   switch (IntrID) {
2665   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2666   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2667     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2668   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2669   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2670     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2671   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2672   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2673     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2674   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2675   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2676     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2677   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2678   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2679     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2680   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2681   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2682     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2683   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2684   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2685     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2686   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2687   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2688     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2689   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2690   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2691     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2692   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2693   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2694     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2695   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2696   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2697     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2698   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2699   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2700     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2701   default:
2702     llvm_unreachable("unhandled atomic opcode");
2703   }
2704 }
2705 
2706 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2707                                                MachineIRBuilder &B,
2708                                                Intrinsic::ID IID) const {
2709   B.setInstr(MI);
2710 
2711   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2712                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2713 
2714   Register Dst = MI.getOperand(0).getReg();
2715   Register VData = MI.getOperand(2).getReg();
2716 
2717   Register CmpVal;
2718   int OpOffset = 0;
2719 
2720   if (IsCmpSwap) {
2721     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2722     ++OpOffset;
2723   }
2724 
2725   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2726   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2727 
2728   // The struct intrinsic variants add one additional operand over raw.
2729   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2730   Register VIndex;
2731   if (HasVIndex) {
2732     VIndex = MI.getOperand(4).getReg();
2733     ++OpOffset;
2734   }
2735 
2736   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2737   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2738   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2739 
2740   MachineMemOperand *MMO = *MI.memoperands_begin();
2741 
2742   unsigned ImmOffset;
2743   unsigned TotalOffset;
2744   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2745   if (TotalOffset != 0)
2746     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2747 
2748   if (!VIndex)
2749     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2750 
2751   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2752     .addDef(Dst)
2753     .addUse(VData); // vdata
2754 
2755   if (IsCmpSwap)
2756     MIB.addReg(CmpVal);
2757 
2758   MIB.addUse(RSrc)               // rsrc
2759      .addUse(VIndex)             // vindex
2760      .addUse(VOffset)            // voffset
2761      .addUse(SOffset)            // soffset
2762      .addImm(ImmOffset)          // offset(imm)
2763      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2764      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2765      .addMemOperand(MMO);
2766 
2767   MI.eraseFromParent();
2768   return true;
2769 }
2770 
2771 bool AMDGPULegalizerInfo::legalizeIntrinsic(
2772     MachineInstr &MI, MachineIRBuilder &B,
2773     GISelChangeObserver &Observer) const {
2774   MachineRegisterInfo &MRI = *B.getMRI();
2775   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2776   auto IntrID = MI.getIntrinsicID();
2777   switch (IntrID) {
2778   case Intrinsic::amdgcn_if:
2779   case Intrinsic::amdgcn_else: {
2780     MachineInstr *Br = nullptr;
2781     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2782       const SIRegisterInfo *TRI
2783         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2784 
2785       B.setInstr(*BrCond);
2786       Register Def = MI.getOperand(1).getReg();
2787       Register Use = MI.getOperand(3).getReg();
2788 
2789       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2790       if (Br)
2791         BrTarget = Br->getOperand(0).getMBB();
2792 
2793       if (IntrID == Intrinsic::amdgcn_if) {
2794         B.buildInstr(AMDGPU::SI_IF)
2795           .addDef(Def)
2796           .addUse(Use)
2797           .addMBB(BrTarget);
2798       } else {
2799         B.buildInstr(AMDGPU::SI_ELSE)
2800           .addDef(Def)
2801           .addUse(Use)
2802           .addMBB(BrTarget)
2803           .addImm(0);
2804       }
2805 
2806       if (Br)
2807         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2808 
2809       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2810       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2811       MI.eraseFromParent();
2812       BrCond->eraseFromParent();
2813       return true;
2814     }
2815 
2816     return false;
2817   }
2818   case Intrinsic::amdgcn_loop: {
2819     MachineInstr *Br = nullptr;
2820     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2821       const SIRegisterInfo *TRI
2822         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2823 
2824       B.setInstr(*BrCond);
2825 
2826       // FIXME: Need to adjust branch targets based on unconditional branch.
2827       Register Reg = MI.getOperand(2).getReg();
2828       B.buildInstr(AMDGPU::SI_LOOP)
2829         .addUse(Reg)
2830         .addMBB(BrCond->getOperand(1).getMBB());
2831       MI.eraseFromParent();
2832       BrCond->eraseFromParent();
2833       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2834       return true;
2835     }
2836 
2837     return false;
2838   }
2839   case Intrinsic::amdgcn_kernarg_segment_ptr:
2840     return legalizePreloadedArgIntrin(
2841       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2842   case Intrinsic::amdgcn_implicitarg_ptr:
2843     return legalizeImplicitArgPtr(MI, MRI, B);
2844   case Intrinsic::amdgcn_workitem_id_x:
2845     return legalizePreloadedArgIntrin(MI, MRI, B,
2846                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2847   case Intrinsic::amdgcn_workitem_id_y:
2848     return legalizePreloadedArgIntrin(MI, MRI, B,
2849                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2850   case Intrinsic::amdgcn_workitem_id_z:
2851     return legalizePreloadedArgIntrin(MI, MRI, B,
2852                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2853   case Intrinsic::amdgcn_workgroup_id_x:
2854     return legalizePreloadedArgIntrin(MI, MRI, B,
2855                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2856   case Intrinsic::amdgcn_workgroup_id_y:
2857     return legalizePreloadedArgIntrin(MI, MRI, B,
2858                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2859   case Intrinsic::amdgcn_workgroup_id_z:
2860     return legalizePreloadedArgIntrin(MI, MRI, B,
2861                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2862   case Intrinsic::amdgcn_dispatch_ptr:
2863     return legalizePreloadedArgIntrin(MI, MRI, B,
2864                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2865   case Intrinsic::amdgcn_queue_ptr:
2866     return legalizePreloadedArgIntrin(MI, MRI, B,
2867                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2868   case Intrinsic::amdgcn_implicit_buffer_ptr:
2869     return legalizePreloadedArgIntrin(
2870       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2871   case Intrinsic::amdgcn_dispatch_id:
2872     return legalizePreloadedArgIntrin(MI, MRI, B,
2873                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2874   case Intrinsic::amdgcn_fdiv_fast:
2875     return legalizeFDIVFastIntrin(MI, MRI, B);
2876   case Intrinsic::amdgcn_is_shared:
2877     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2878   case Intrinsic::amdgcn_is_private:
2879     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2880   case Intrinsic::amdgcn_wavefrontsize: {
2881     B.setInstr(MI);
2882     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2883     MI.eraseFromParent();
2884     return true;
2885   }
2886   case Intrinsic::amdgcn_raw_buffer_store:
2887   case Intrinsic::amdgcn_struct_buffer_store:
2888     return legalizeBufferStore(MI, MRI, B, false, false);
2889   case Intrinsic::amdgcn_raw_buffer_store_format:
2890   case Intrinsic::amdgcn_struct_buffer_store_format:
2891     return legalizeBufferStore(MI, MRI, B, false, true);
2892   case Intrinsic::amdgcn_raw_tbuffer_store:
2893   case Intrinsic::amdgcn_struct_tbuffer_store:
2894     return legalizeBufferStore(MI, MRI, B, true, true);
2895   case Intrinsic::amdgcn_raw_buffer_load:
2896   case Intrinsic::amdgcn_struct_buffer_load:
2897     return legalizeBufferLoad(MI, MRI, B, false, false);
2898   case Intrinsic::amdgcn_raw_buffer_load_format:
2899   case Intrinsic::amdgcn_struct_buffer_load_format:
2900     return legalizeBufferLoad(MI, MRI, B, true, false);
2901   case Intrinsic::amdgcn_raw_tbuffer_load:
2902   case Intrinsic::amdgcn_struct_tbuffer_load:
2903     return legalizeBufferLoad(MI, MRI, B, true, true);
2904   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2905   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2906   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2907   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2908   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2909   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2910   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2911   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2912   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2913   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2914   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2915   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2916   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2917   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2918   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2919   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2920   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2921   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2922   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2923   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2924   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2925   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2926   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2927   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2928   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2929   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2930     return legalizeBufferAtomic(MI, B, IntrID);
2931   case Intrinsic::amdgcn_atomic_inc:
2932     return legalizeAtomicIncDec(MI, B, true);
2933   case Intrinsic::amdgcn_atomic_dec:
2934     return legalizeAtomicIncDec(MI, B, false);
2935   default:
2936     return true;
2937   }
2938 
2939   return true;
2940 }
2941