1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 static LegalityPredicate isMultiple32(unsigned TypeIdx,
46                                       unsigned MaxSize = 1024) {
47   return [=](const LegalityQuery &Query) {
48     const LLT Ty = Query.Types[TypeIdx];
49     const LLT EltTy = Ty.getScalarType();
50     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51   };
52 }
53 
54 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
55   return [=](const LegalityQuery &Query) {
56     return Query.Types[TypeIdx].getSizeInBits() == Size;
57   };
58 }
59 
60 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     return Ty.isVector() &&
64            Ty.getNumElements() % 2 != 0 &&
65            Ty.getElementType().getSizeInBits() < 32 &&
66            Ty.getSizeInBits() % 32 != 0;
67   };
68 }
69 
70 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
71   return [=](const LegalityQuery &Query) {
72     const LLT Ty = Query.Types[TypeIdx];
73     const LLT EltTy = Ty.getScalarType();
74     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75   };
76 }
77 
78 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT Ty = Query.Types[TypeIdx];
81     const LLT EltTy = Ty.getElementType();
82     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
83   };
84 }
85 
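// Reduce a vector to pieces of roughly 64 bits each by dividing the element
// count by the number of 64-bit chunks the type occupies.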
86 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getElementType();
90     unsigned Size = Ty.getSizeInBits();
91     unsigned Pieces = (Size + 63) / 64;
92     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
93     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94   };
95 }
96 
// Increase the number of vector elements so the total size is padded out to
// the next multiple of 32 bits.
99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
100   return [=](const LegalityQuery &Query) {
101     const LLT Ty = Query.Types[TypeIdx];
102 
103     const LLT EltTy = Ty.getElementType();
104     const int Size = Ty.getSizeInBits();
105     const int EltSize = EltTy.getSizeInBits();
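    // Number of 32-bit words needed to cover the current size.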
106     const int NextMul32 = (Size + 31) / 32;
107 
108     assert(EltSize < 32);
109 
110     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
111     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112   };
113 }
114 
115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
116   return [=](const LegalityQuery &Query) {
117     const LLT QueryTy = Query.Types[TypeIdx];
118     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119   };
120 }
121 
122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
123   return [=](const LegalityQuery &Query) {
124     const LLT QueryTy = Query.Types[TypeIdx];
125     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126   };
127 }
128 
129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133   };
134 }
135 
136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
137 // v2s16.
138 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     if (Ty.isVector()) {
142       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
146     }
147 
148     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149   };
150 }
151 
152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
153   return [=](const LegalityQuery &Query) {
154     return Query.Types[TypeIdx].getElementType() == Type;
155   };
156 }
157 
158 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
159   return [=](const LegalityQuery &Query) {
160     const LLT Ty = Query.Types[TypeIdx];
161     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
162            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
163   };
164 }
165 
166 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
167                                          const GCNTargetMachine &TM)
168   :  ST(ST_) {
169   using namespace TargetOpcode;
170 
171   auto GetAddrSpacePtr = [&TM](unsigned AS) {
172     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
173   };
174 
175   const LLT S1 = LLT::scalar(1);
176   const LLT S8 = LLT::scalar(8);
177   const LLT S16 = LLT::scalar(16);
178   const LLT S32 = LLT::scalar(32);
179   const LLT S64 = LLT::scalar(64);
180   const LLT S96 = LLT::scalar(96);
181   const LLT S128 = LLT::scalar(128);
182   const LLT S256 = LLT::scalar(256);
183   const LLT S1024 = LLT::scalar(1024);
184 
185   const LLT V2S16 = LLT::vector(2, 16);
186   const LLT V4S16 = LLT::vector(4, 16);
187 
188   const LLT V2S32 = LLT::vector(2, 32);
189   const LLT V3S32 = LLT::vector(3, 32);
190   const LLT V4S32 = LLT::vector(4, 32);
191   const LLT V5S32 = LLT::vector(5, 32);
192   const LLT V6S32 = LLT::vector(6, 32);
193   const LLT V7S32 = LLT::vector(7, 32);
194   const LLT V8S32 = LLT::vector(8, 32);
195   const LLT V9S32 = LLT::vector(9, 32);
196   const LLT V10S32 = LLT::vector(10, 32);
197   const LLT V11S32 = LLT::vector(11, 32);
198   const LLT V12S32 = LLT::vector(12, 32);
199   const LLT V13S32 = LLT::vector(13, 32);
200   const LLT V14S32 = LLT::vector(14, 32);
201   const LLT V15S32 = LLT::vector(15, 32);
202   const LLT V16S32 = LLT::vector(16, 32);
203   const LLT V32S32 = LLT::vector(32, 32);
204 
205   const LLT V2S64 = LLT::vector(2, 64);
206   const LLT V3S64 = LLT::vector(3, 64);
207   const LLT V4S64 = LLT::vector(4, 64);
208   const LLT V5S64 = LLT::vector(5, 64);
209   const LLT V6S64 = LLT::vector(6, 64);
210   const LLT V7S64 = LLT::vector(7, 64);
211   const LLT V8S64 = LLT::vector(8, 64);
212   const LLT V16S64 = LLT::vector(16, 64);
213 
214   std::initializer_list<LLT> AllS32Vectors =
215     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
216      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
217   std::initializer_list<LLT> AllS64Vectors =
218     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
219 
220   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
221   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
222   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
223   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
224   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
225   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
226   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
227 
228   const LLT CodePtr = FlatPtr;
229 
230   const std::initializer_list<LLT> AddrSpaces64 = {
231     GlobalPtr, ConstantPtr, FlatPtr
232   };
233 
234   const std::initializer_list<LLT> AddrSpaces32 = {
235     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
236   };
237 
238   const std::initializer_list<LLT> FPTypesBase = {
239     S32, S64
240   };
241 
242   const std::initializer_list<LLT> FPTypes16 = {
243     S32, S64, S16
244   };
245 
246   const std::initializer_list<LLT> FPTypesPK16 = {
247     S32, S64, S16, V2S16
248   };
249 
250   const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;
251 
252   setAction({G_BRCOND, S1}, Legal); // VCC branches
253   setAction({G_BRCOND, S32}, Legal); // SCC branches
254 
255   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
256   // elements for v3s16
257   getActionDefinitionsBuilder(G_PHI)
258     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
259     .legalFor(AllS32Vectors)
260     .legalFor(AllS64Vectors)
261     .legalFor(AddrSpaces64)
262     .legalFor(AddrSpaces32)
263     .clampScalar(0, S32, S256)
264     .widenScalarToNextPow2(0, 32)
265     .clampMaxNumElements(0, S32, 16)
266     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
267     .legalIf(isPointer(0));
268 
269   if (ST.has16BitInsts()) {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32, S16})
272       .clampScalar(0, S16, S32)
273       .scalarize(0);
274   } else {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32})
277       .clampScalar(0, S32, S32)
278       .scalarize(0);
279   }
280 
281   // FIXME: Not really legal. Placeholder for custom lowering.
282   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
283     .legalFor({S32, S64})
284     .clampScalar(0, S32, S64)
285     .widenScalarToNextPow2(0, 32)
286     .scalarize(0);
287 
288   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
289     .legalFor({S32})
290     .clampScalar(0, S32, S32)
291     .scalarize(0);
292 
293   // Report legal for any types we can handle anywhere. For the cases only legal
294   // on the SALU, RegBankSelect will be able to re-legalize.
295   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
296     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
297     .clampScalar(0, S32, S64)
298     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
299     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
300     .widenScalarToNextPow2(0)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
304                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
305     .legalFor({{S32, S1}, {S32, S32}})
306     .clampScalar(0, S32, S32)
307     .scalarize(0); // TODO: Implement.
308 
309   getActionDefinitionsBuilder(G_BITCAST)
310     // Don't worry about the size constraint.
311     .legalIf(all(isRegisterType(0), isRegisterType(1)))
312     // FIXME: Testing hack
313     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
314     .lower();
315 
316 
317   getActionDefinitionsBuilder(G_CONSTANT)
318     .legalFor({S1, S32, S64, S16, GlobalPtr,
319                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
320     .clampScalar(0, S32, S64)
321     .widenScalarToNextPow2(0)
322     .legalIf(isPointer(0));
323 
324   getActionDefinitionsBuilder(G_FCONSTANT)
325     .legalFor({S32, S64, S16})
326     .clampScalar(0, S16, S64);
327 
328   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
329     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
330                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
331     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
332     .clampScalarOrElt(0, S32, S1024)
333     .legalIf(isMultiple32(0))
334     .widenScalarToNextPow2(0, 32)
335     .clampMaxNumElements(0, S32, 16);
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   if (ST.has16BitInsts()) {
401     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
402       .legalFor({S32, S64, S16})
403       .scalarize(0)
404       .clampScalar(0, S16, S64);
405   } else {
406     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
407       .legalFor({S32, S64})
408       .scalarize(0)
409       .clampScalar(0, S32, S64);
410   }
411 
412   getActionDefinitionsBuilder(G_FPTRUNC)
413     .legalFor({{S32, S64}, {S16, S32}})
414     .scalarize(0);
415 
416   getActionDefinitionsBuilder(G_FPEXT)
417     .legalFor({{S64, S32}, {S32, S16}})
418     .lowerFor({{S64, S16}}) // FIXME: Implement
419     .scalarize(0);
420 
421   getActionDefinitionsBuilder(G_FSUB)
422       // Use actual fsub instruction
423       .legalFor({S32})
424       // Must use fadd + fneg
425       .lowerFor({S64, S16, V2S16})
426       .scalarize(0)
427       .clampScalar(0, S32, S64);
428 
429   // Whether this is legal depends on the floating point mode for the function.
430   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
431   if (ST.hasMadF16())
432     FMad.customFor({S32, S16});
433   else
434     FMad.customFor({S32});
435   FMad.scalarize(0)
436       .lower();
437 
438   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
439     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
440                {S32, S1}, {S64, S1}, {S16, S1},
441                {S96, S32},
442                // FIXME: Hack
443                {S64, LLT::scalar(33)},
444                {S32, S8}, {S32, LLT::scalar(24)}})
445     .scalarize(0)
446     .clampScalar(0, S32, S64);
447 
448   // TODO: Split s1->s64 during regbankselect for VALU.
449   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
450     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
451     .lowerFor({{S32, S64}})
452     .lowerIf(typeIs(1, S1))
453     .customFor({{S64, S64}});
454   if (ST.has16BitInsts())
455     IToFP.legalFor({{S16, S16}});
456   IToFP.clampScalar(1, S32, S64)
457        .scalarize(0);
458 
459   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
460     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
461   if (ST.has16BitInsts())
462     FPToI.legalFor({{S16, S16}});
463   else
464     FPToI.minScalar(1, S32);
465 
466   FPToI.minScalar(0, S32)
467        .scalarize(0);
468 
469   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
470     .scalarize(0)
471     .lower();
472 
473   if (ST.has16BitInsts()) {
474     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475       .legalFor({S16, S32, S64})
476       .clampScalar(0, S16, S64)
477       .scalarize(0);
478   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
479     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
480       .legalFor({S32, S64})
481       .clampScalar(0, S32, S64)
482       .scalarize(0);
483   } else {
484     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
485       .legalFor({S32})
486       .customFor({S64})
487       .clampScalar(0, S32, S64)
488       .scalarize(0);
489   }
490 
491   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
492     .scalarize(0)
493     .alwaysLegal();
494 
495   auto &CmpBuilder =
496     getActionDefinitionsBuilder(G_ICMP)
497     // The compare output type differs based on the register bank of the output,
498     // so make both s1 and s32 legal.
499     //
500     // Scalar compares producing output in scc will be promoted to s32, as that
501     // is the allocatable register type that will be needed for the copy from
502     // scc. This will be promoted during RegBankSelect, and we assume something
503     // before that won't try to use s32 result types.
504     //
505     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
506     // bank.
507     .legalForCartesianProduct(
508       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
509     .legalForCartesianProduct(
510       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
511   if (ST.has16BitInsts()) {
512     CmpBuilder.legalFor({{S1, S16}});
513   }
514 
515   CmpBuilder
516     .widenScalarToNextPow2(1)
517     .clampScalar(1, S32, S64)
518     .scalarize(0)
519     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
520 
521   getActionDefinitionsBuilder(G_FCMP)
522     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
523     .widenScalarToNextPow2(1)
524     .clampScalar(1, S32, S64)
525     .scalarize(0);
526 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
528   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
529                                G_FLOG, G_FLOG2, G_FLOG10})
530     .legalFor({S32})
531     .scalarize(0);
532 
533   // The 64-bit versions produce 32-bit results, but only on the SALU.
534   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
535                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
536                                G_CTPOP})
537     .legalFor({{S32, S32}, {S32, S64}})
538     .clampScalar(0, S32, S32)
539     .clampScalar(1, S32, S64)
540     .scalarize(0)
541     .widenScalarToNextPow2(0, 32)
542     .widenScalarToNextPow2(1, 32);
543 
544   // TODO: Expand for > s32
545   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
546     .legalFor({S32})
547     .clampScalar(0, S32, S32)
548     .scalarize(0);
549 
550   if (ST.has16BitInsts()) {
551     if (ST.hasVOP3PInsts()) {
552       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
553         .legalFor({S32, S16, V2S16})
554         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
555         .clampMaxNumElements(0, S16, 2)
556         .clampScalar(0, S16, S32)
557         .widenScalarToNextPow2(0)
558         .scalarize(0);
559     } else {
560       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
561         .legalFor({S32, S16})
562         .widenScalarToNextPow2(0)
563         .clampScalar(0, S16, S32)
564         .scalarize(0);
565     }
566   } else {
567     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
568       .legalFor({S32})
569       .clampScalar(0, S32, S32)
570       .widenScalarToNextPow2(0)
571       .scalarize(0);
572   }
573 
574   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
575     return [=](const LegalityQuery &Query) {
576       return Query.Types[TypeIdx0].getSizeInBits() <
577              Query.Types[TypeIdx1].getSizeInBits();
578     };
579   };
580 
581   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
582     return [=](const LegalityQuery &Query) {
583       return Query.Types[TypeIdx0].getSizeInBits() >
584              Query.Types[TypeIdx1].getSizeInBits();
585     };
586   };
587 
588   getActionDefinitionsBuilder(G_INTTOPTR)
589     // List the common cases
590     .legalForCartesianProduct(AddrSpaces64, {S64})
591     .legalForCartesianProduct(AddrSpaces32, {S32})
592     .scalarize(0)
593     // Accept any address space as long as the size matches
594     .legalIf(sameSize(0, 1))
595     .widenScalarIf(smallerThan(1, 0),
596       [](const LegalityQuery &Query) {
597         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
598       })
599     .narrowScalarIf(greaterThan(1, 0),
600       [](const LegalityQuery &Query) {
601         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
602       });
603 
604   getActionDefinitionsBuilder(G_PTRTOINT)
605     // List the common cases
606     .legalForCartesianProduct(AddrSpaces64, {S64})
607     .legalForCartesianProduct(AddrSpaces32, {S32})
608     .scalarize(0)
609     // Accept any address space as long as the size matches
610     .legalIf(sameSize(0, 1))
611     .widenScalarIf(smallerThan(0, 1),
612       [](const LegalityQuery &Query) {
613         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
614       })
615     .narrowScalarIf(
616       greaterThan(0, 1),
617       [](const LegalityQuery &Query) {
618         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
619       });
620 
621   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
622     .scalarize(0)
623     .custom();
624 
625   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
626   // handle some operations by just promoting the register during
627   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
628   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
629     switch (AS) {
630     // FIXME: Private element size.
631     case AMDGPUAS::PRIVATE_ADDRESS:
632       return 32;
633     // FIXME: Check subtarget
634     case AMDGPUAS::LOCAL_ADDRESS:
635       return ST.useDS128() ? 128 : 64;
636 
637     // Treat constant and global as identical. SMRD loads are sometimes usable
638     // for global loads (ideally constant address space should be eliminated)
639     // depending on the context. Legality cannot be context dependent, but
640     // RegBankSelect can split the load as necessary depending on the pointer
641     // register bank/uniformity and if the memory is invariant or not written in
642     // a kernel.
643     case AMDGPUAS::CONSTANT_ADDRESS:
644     case AMDGPUAS::GLOBAL_ADDRESS:
645       return 512;
646     default:
647       return 128;
648     }
649   };
650 
651   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
652     const LLT DstTy = Query.Types[0];
653 
654     // Split vector extloads.
655     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
656     unsigned Align = Query.MMODescrs[0].AlignInBits;
657 
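    // For extending loads, treat the memory size as at least the alignment
    // when checking the limits below.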
658     if (MemSize < DstTy.getSizeInBits())
659       MemSize = std::max(MemSize, Align);
660 
661     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
662       return true;
663 
664     const LLT PtrTy = Query.Types[1];
665     unsigned AS = PtrTy.getAddressSpace();
666     if (MemSize > maxSizeForAddrSpace(AS))
667       return true;
668 
669     // Catch weird sized loads that don't evenly divide into the access sizes
670     // TODO: May be able to widen depending on alignment etc.
671     unsigned NumRegs = MemSize / 32;
672     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
673       return true;
674 
675     if (Align < MemSize) {
676       const SITargetLowering *TLI = ST.getTargetLowering();
677       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
678     }
679 
680     return false;
681   };
682 
683   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
684   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
685   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
686 
687   // TODO: Refine based on subtargets which support unaligned access or 128-bit
688   // LDS
689   // TODO: Unsupported flat for SI.
690 
691   for (unsigned Op : {G_LOAD, G_STORE}) {
692     const bool IsStore = Op == G_STORE;
693 
694     auto &Actions = getActionDefinitionsBuilder(Op);
695     // Whitelist the common cases.
696     // TODO: Pointer loads
697     // TODO: Wide constant loads
698     // TODO: Only CI+ has 3x loads
699     // TODO: Loads to s16 on gfx9
700     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
701                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
702                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
703                                       {S96, GlobalPtr, 96, GlobalAlign32},
704                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
705                                       {S128, GlobalPtr, 128, GlobalAlign32},
706                                       {S64, GlobalPtr, 64, GlobalAlign32},
707                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
708                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
709                                       {S32, GlobalPtr, 8, GlobalAlign8},
710                                       {S32, GlobalPtr, 16, GlobalAlign16},
711 
712                                       {S32, LocalPtr, 32, 32},
713                                       {S64, LocalPtr, 64, 32},
714                                       {V2S32, LocalPtr, 64, 32},
715                                       {S32, LocalPtr, 8, 8},
716                                       {S32, LocalPtr, 16, 16},
717                                       {V2S16, LocalPtr, 32, 32},
718 
719                                       {S32, PrivatePtr, 32, 32},
720                                       {S32, PrivatePtr, 8, 8},
721                                       {S32, PrivatePtr, 16, 16},
722                                       {V2S16, PrivatePtr, 32, 32},
723 
724                                       {S32, FlatPtr, 32, GlobalAlign32},
725                                       {S32, FlatPtr, 16, GlobalAlign16},
726                                       {S32, FlatPtr, 8, GlobalAlign8},
727                                       {V2S16, FlatPtr, 32, GlobalAlign32},
728 
729                                       {S32, ConstantPtr, 32, GlobalAlign32},
730                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
731                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
732                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
733                                       {S64, ConstantPtr, 64, GlobalAlign32},
734                                       {S128, ConstantPtr, 128, GlobalAlign32},
735                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
736     Actions
737         .customIf(typeIs(1, Constant32Ptr))
738         .narrowScalarIf(
739             [=](const LegalityQuery &Query) -> bool {
740               return !Query.Types[0].isVector() && needToSplitLoad(Query);
741             },
742             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743               const LLT DstTy = Query.Types[0];
744               const LLT PtrTy = Query.Types[1];
745 
746               const unsigned DstSize = DstTy.getSizeInBits();
747               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
748 
749               // Split extloads.
750               if (DstSize > MemSize)
751                 return std::make_pair(0, LLT::scalar(MemSize));
752 
753               if (DstSize > 32 && (DstSize % 32 != 0)) {
754                 // FIXME: Need a way to specify non-extload of larger size if
755                 // suitably aligned.
756                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
757               }
758 
759               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
760               if (MemSize > MaxSize)
761                 return std::make_pair(0, LLT::scalar(MaxSize));
762 
763               unsigned Align = Query.MMODescrs[0].AlignInBits;
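              // Reaching here means the load was flagged for the dwordx3
              // restriction or for misalignment; narrow it to the alignment.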
764               return std::make_pair(0, LLT::scalar(Align));
765             })
766         .fewerElementsIf(
767             [=](const LegalityQuery &Query) -> bool {
768               return Query.Types[0].isVector() && needToSplitLoad(Query);
769             },
770             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
771               const LLT DstTy = Query.Types[0];
772               const LLT PtrTy = Query.Types[1];
773 
774               LLT EltTy = DstTy.getElementType();
775               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
776 
777               // Split if it's too large for the address space.
778               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
779                 unsigned NumElts = DstTy.getNumElements();
780                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
781 
782                 // FIXME: Refine when odd breakdowns handled
783                 // The scalars will need to be re-legalized.
784                 if (NumPieces == 1 || NumPieces >= NumElts ||
785                     NumElts % NumPieces != 0)
786                   return std::make_pair(0, EltTy);
787 
788                 return std::make_pair(0,
789                                       LLT::vector(NumElts / NumPieces, EltTy));
790               }
791 
792               // Need to split because of alignment.
793               unsigned Align = Query.MMODescrs[0].AlignInBits;
794               unsigned EltSize = EltTy.getSizeInBits();
795               if (EltSize > Align &&
796                   (EltSize / Align < DstTy.getNumElements())) {
797                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
798               }
799 
800               // May need relegalization for the scalars.
801               return std::make_pair(0, EltTy);
802             })
803         .minScalar(0, S32);
804 
805     if (IsStore)
806       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
807 
808     // TODO: Need a bitcast lower option?
809     Actions
810         .legalIf([=](const LegalityQuery &Query) {
811           const LLT Ty0 = Query.Types[0];
812           unsigned Size = Ty0.getSizeInBits();
813           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
814           unsigned Align = Query.MMODescrs[0].AlignInBits;
815 
816           // FIXME: Widening store from alignment not valid.
817           if (MemSize < Size)
818             MemSize = std::max(MemSize, Align);
819 
820           // No extending vector loads.
821           if (Size > MemSize && Ty0.isVector())
822             return false;
823 
824           switch (MemSize) {
825           case 8:
826           case 16:
827             return Size == 32;
828           case 32:
829           case 64:
830           case 128:
831             return true;
832           case 96:
833             return ST.hasDwordx3LoadStores();
834           case 256:
835           case 512:
836             return true;
837           default:
838             return false;
839           }
840         })
841         .widenScalarToNextPow2(0)
842         // TODO: v3s32->v4s32 with alignment
843         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
844   }
845 
846   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
847                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
848                                                   {S32, GlobalPtr, 16, 2 * 8},
849                                                   {S32, LocalPtr, 8, 8},
850                                                   {S32, LocalPtr, 16, 16},
851                                                   {S32, PrivatePtr, 8, 8},
852                                                   {S32, PrivatePtr, 16, 16},
853                                                   {S32, ConstantPtr, 8, 8},
854                                                   {S32, ConstantPtr, 16, 2 * 8}});
855   if (ST.hasFlatAddressSpace()) {
856     ExtLoads.legalForTypesWithMemDesc(
857         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
858   }
859 
860   ExtLoads.clampScalar(0, S32, S32)
861           .widenScalarToNextPow2(0)
862           .unsupportedIfMemSizeNotPow2()
863           .lower();
864 
865   auto &Atomics = getActionDefinitionsBuilder(
866     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
867      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
868      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
869      G_ATOMICRMW_UMIN})
870     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
871                {S64, GlobalPtr}, {S64, LocalPtr}});
872   if (ST.hasFlatAddressSpace()) {
873     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
874   }
875 
876   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
877     .legalFor({{S32, LocalPtr}});
878 
879   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
880   // demarshalling
881   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
882     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
883                 {S32, FlatPtr}, {S64, FlatPtr}})
884     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
885                {S32, RegionPtr}, {S64, RegionPtr}});
886   // TODO: Pointer types, any 32-bit or 64-bit vector
887 
888   // Condition should be s32 for scalar, s1 for vector.
889   getActionDefinitionsBuilder(G_SELECT)
890     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
891           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
892           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
893     .clampScalar(0, S16, S64)
894     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
895     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
896     .scalarize(1)
897     .clampMaxNumElements(0, S32, 2)
898     .clampMaxNumElements(0, LocalPtr, 2)
899     .clampMaxNumElements(0, PrivatePtr, 2)
900     .scalarize(0)
901     .widenScalarToNextPow2(0)
902     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
903 
904   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
905   // be more flexible with the shift amount type.
906   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
907     .legalFor({{S32, S32}, {S64, S32}});
908   if (ST.has16BitInsts()) {
909     if (ST.hasVOP3PInsts()) {
910       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
911             .clampMaxNumElements(0, S16, 2);
912     } else
913       Shifts.legalFor({{S16, S32}, {S16, S16}});
914 
915     // TODO: Support 16-bit shift amounts
916     Shifts.clampScalar(1, S32, S32);
917     Shifts.clampScalar(0, S16, S64);
918     Shifts.widenScalarToNextPow2(0, 16);
919   } else {
920     // Make sure we legalize the shift amount type first, as the general
921     // expansion for the shifted type will produce much worse code if it hasn't
922     // been truncated already.
923     Shifts.clampScalar(1, S32, S32);
924     Shifts.clampScalar(0, S32, S64);
925     Shifts.widenScalarToNextPow2(0, 32);
926   }
927   Shifts.scalarize(0);
928 
929   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
930     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
931     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
932     unsigned IdxTypeIdx = 2;
933 
934     getActionDefinitionsBuilder(Op)
935       .customIf([=](const LegalityQuery &Query) {
936           const LLT EltTy = Query.Types[EltTypeIdx];
937           const LLT VecTy = Query.Types[VecTypeIdx];
938           const LLT IdxTy = Query.Types[IdxTypeIdx];
939           return (EltTy.getSizeInBits() == 16 ||
940                   EltTy.getSizeInBits() % 32 == 0) &&
941                  VecTy.getSizeInBits() % 32 == 0 &&
942                  VecTy.getSizeInBits() <= 1024 &&
943                  IdxTy.getSizeInBits() == 32;
944         })
945       .clampScalar(EltTypeIdx, S32, S64)
946       .clampScalar(VecTypeIdx, S32, S64)
947       .clampScalar(IdxTypeIdx, S32, S32);
948   }
949 
950   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
951     .unsupportedIf([=](const LegalityQuery &Query) {
952         const LLT &EltTy = Query.Types[1].getElementType();
953         return Query.Types[0] != EltTy;
954       });
955 
956   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
957     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
958     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
959 
960     // FIXME: Doesn't handle extract of illegal sizes.
961     getActionDefinitionsBuilder(Op)
962       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
963       // FIXME: Multiples of 16 should not be legal.
964       .legalIf([=](const LegalityQuery &Query) {
965           const LLT BigTy = Query.Types[BigTyIdx];
966           const LLT LitTy = Query.Types[LitTyIdx];
967           return (BigTy.getSizeInBits() % 32 == 0) &&
968                  (LitTy.getSizeInBits() % 16 == 0);
969         })
970       .widenScalarIf(
971         [=](const LegalityQuery &Query) {
972           const LLT BigTy = Query.Types[BigTyIdx];
973           return (BigTy.getScalarSizeInBits() < 16);
974         },
975         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
976       .widenScalarIf(
977         [=](const LegalityQuery &Query) {
978           const LLT LitTy = Query.Types[LitTyIdx];
979           return (LitTy.getScalarSizeInBits() < 16);
980         },
981         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
982       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
983       .widenScalarToNextPow2(BigTyIdx, 32);
984 
985   }
986 
987   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
988     .legalForCartesianProduct(AllS32Vectors, {S32})
989     .legalForCartesianProduct(AllS64Vectors, {S64})
990     .clampNumElements(0, V16S32, V32S32)
991     .clampNumElements(0, V2S64, V16S64)
992     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
993 
994   if (ST.hasScalarPackInsts())
995     BuildVector.legalFor({V2S16, S32});
996 
997   BuildVector
998     .minScalarSameAs(1, 0)
999     .legalIf(isRegisterType(0))
1000     .minScalarOrElt(0, S32);
1001 
1002   if (ST.hasScalarPackInsts()) {
1003     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1004       .legalFor({V2S16, S32})
1005       .lower();
1006   } else {
1007     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1008       .lower();
1009   }
1010 
1011   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1012     .legalIf(isRegisterType(0));
1013 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1015   // pre-legalize.
1016   if (ST.hasVOP3PInsts()) {
1017     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1018       .customFor({V2S16, V2S16})
1019       .lower();
1020   } else
1021     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1022 
1023   // Merge/Unmerge
1024   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1025     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1026     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1027 
1028     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1029       const LLT &Ty = Query.Types[TypeIdx];
1030       if (Ty.isVector()) {
1031         const LLT &EltTy = Ty.getElementType();
1032         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1033           return true;
1034         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1035           return true;
1036       }
1037       return false;
1038     };
1039 
1040     auto &Builder = getActionDefinitionsBuilder(Op)
1041       // Try to widen to s16 first for small types.
1042       // TODO: Only do this on targets with legal s16 shifts
1043       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1044 
1045       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1046       .lowerFor({{S16, V2S16}})
1047       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1048       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1049                            elementTypeIs(1, S16)),
1050                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
1052       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1053       // valid.
1054       .clampScalar(LitTyIdx, S32, S256)
1055       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1056       // Break up vectors with weird elements into scalars
1057       .fewerElementsIf(
1058         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1059         scalarize(0))
1060       .fewerElementsIf(
1061         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1062         scalarize(1))
1063       .clampScalar(BigTyIdx, S32, S1024);
1064 
1065     if (Op == G_MERGE_VALUES) {
1066       Builder.widenScalarIf(
1067         // TODO: Use 16-bit shifts if legal for 8-bit values?
1068         [=](const LegalityQuery &Query) {
1069           const LLT Ty = Query.Types[LitTyIdx];
1070           return Ty.getSizeInBits() < 32;
1071         },
1072         changeTo(LitTyIdx, S32));
1073     }
1074 
1075     Builder.widenScalarIf(
1076       [=](const LegalityQuery &Query) {
1077         const LLT Ty = Query.Types[BigTyIdx];
1078         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1079           Ty.getSizeInBits() % 16 != 0;
1080       },
1081       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128 bits,
        // whichever is smaller.
1084         const LLT &Ty = Query.Types[BigTyIdx];
1085         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1086         if (NewSizeInBits >= 256) {
1087           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1088           if (RoundedTo < NewSizeInBits)
1089             NewSizeInBits = RoundedTo;
1090         }
1091         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1092       })
1093       .legalIf([=](const LegalityQuery &Query) {
1094           const LLT &BigTy = Query.Types[BigTyIdx];
1095           const LLT &LitTy = Query.Types[LitTyIdx];
1096 
1097           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1098             return false;
1099           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1100             return false;
1101 
1102           return BigTy.getSizeInBits() % 16 == 0 &&
1103                  LitTy.getSizeInBits() % 16 == 0 &&
1104                  BigTy.getSizeInBits() <= 1024;
1105         })
1106       // Any vectors left are the wrong size. Scalarize them.
1107       .scalarize(0)
1108       .scalarize(1);
1109   }
1110 
  // TODO: Make legal for s32, s64. The s64 case needs to be broken down in
  // regbankselect.
1112   getActionDefinitionsBuilder(G_SEXT_INREG)
1113     .clampScalar(0, MinLegalScalarShiftTy, S64)
1114     .lower();
1115 
1116   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1117     .legalFor({S64});
1118 
1119   getActionDefinitionsBuilder({
1120       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1121       G_FCOPYSIGN,
1122 
1123       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1124       G_READ_REGISTER,
1125       G_WRITE_REGISTER,
1126 
1127       G_SADDO, G_SSUBO,
1128 
1129        // TODO: Implement
1130       G_FMINIMUM, G_FMAXIMUM
1131     }).lower();
1132 
1133   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1134         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1135         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1136     .unsupported();
1137 
1138   computeTables();
1139   verify(*ST.getInstrInfo());
1140 }
1141 
1142 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1143                                          MachineRegisterInfo &MRI,
1144                                          MachineIRBuilder &B,
1145                                          GISelChangeObserver &Observer) const {
1146   switch (MI.getOpcode()) {
1147   case TargetOpcode::G_ADDRSPACE_CAST:
1148     return legalizeAddrSpaceCast(MI, MRI, B);
1149   case TargetOpcode::G_FRINT:
1150     return legalizeFrint(MI, MRI, B);
1151   case TargetOpcode::G_FCEIL:
1152     return legalizeFceil(MI, MRI, B);
1153   case TargetOpcode::G_INTRINSIC_TRUNC:
1154     return legalizeIntrinsicTrunc(MI, MRI, B);
1155   case TargetOpcode::G_SITOFP:
1156     return legalizeITOFP(MI, MRI, B, true);
1157   case TargetOpcode::G_UITOFP:
1158     return legalizeITOFP(MI, MRI, B, false);
1159   case TargetOpcode::G_FMINNUM:
1160   case TargetOpcode::G_FMAXNUM:
1161   case TargetOpcode::G_FMINNUM_IEEE:
1162   case TargetOpcode::G_FMAXNUM_IEEE:
1163     return legalizeMinNumMaxNum(MI, MRI, B);
1164   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1165     return legalizeExtractVectorElt(MI, MRI, B);
1166   case TargetOpcode::G_INSERT_VECTOR_ELT:
1167     return legalizeInsertVectorElt(MI, MRI, B);
1168   case TargetOpcode::G_SHUFFLE_VECTOR:
1169     return legalizeShuffleVector(MI, MRI, B);
1170   case TargetOpcode::G_FSIN:
1171   case TargetOpcode::G_FCOS:
1172     return legalizeSinCos(MI, MRI, B);
1173   case TargetOpcode::G_GLOBAL_VALUE:
1174     return legalizeGlobalValue(MI, MRI, B);
1175   case TargetOpcode::G_LOAD:
1176     return legalizeLoad(MI, MRI, B, Observer);
1177   case TargetOpcode::G_FMAD:
1178     return legalizeFMad(MI, MRI, B);
1179   case TargetOpcode::G_FDIV:
1180     return legalizeFDIV(MI, MRI, B);
1181   case TargetOpcode::G_ATOMIC_CMPXCHG:
1182     return legalizeAtomicCmpXChg(MI, MRI, B);
1183   default:
1184     return false;
1185   }
1186 
1187   llvm_unreachable("expected switch to return");
1188 }
1189 
1190 Register AMDGPULegalizerInfo::getSegmentAperture(
1191   unsigned AS,
1192   MachineRegisterInfo &MRI,
1193   MachineIRBuilder &B) const {
1194   MachineFunction &MF = B.getMF();
1195   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1196   const LLT S32 = LLT::scalar(32);
1197 
1198   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1199 
1200   if (ST.hasApertureRegs()) {
1201     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1202     // getreg.
1203     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1204         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1205         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1206     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1207         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1208         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
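    // Encode the s_getreg source operand: the MEM_BASES hwreg id, the bit
    // offset of the aperture field within it, and the field width minus one.
    // The aperture base is stored shifted right by the field width, so the
    // shift below moves it back into the high bits of the 32-bit aperture
    // value.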
1209     unsigned Encoding =
1210         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1211         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1212         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1213 
1214     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1215     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1216 
1217     B.buildInstr(AMDGPU::S_GETREG_B32)
1218       .addDef(GetReg)
1219       .addImm(Encoding);
1220     MRI.setType(GetReg, S32);
1221 
1222     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1223     B.buildInstr(TargetOpcode::G_SHL)
1224       .addDef(ApertureReg)
1225       .addUse(GetReg)
1226       .addUse(ShiftAmt.getReg(0));
1227 
1228     return ApertureReg;
1229   }
1230 
1231   Register QueuePtr = MRI.createGenericVirtualRegister(
1232     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1233 
1234   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1235   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1236     return Register();
1237 
1238   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1239   // private_segment_aperture_base_hi.
1240   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1241 
1242   // TODO: can we be smarter about machine pointer info?
1243   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1244   MachineMemOperand *MMO = MF.getMachineMemOperand(
1245     PtrInfo,
1246     MachineMemOperand::MOLoad |
1247     MachineMemOperand::MODereferenceable |
1248     MachineMemOperand::MOInvariant,
1249     4,
1250     MinAlign(64, StructOffset));
1251 
1252   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1253   Register LoadAddr;
1254 
1255   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1256   B.buildLoad(LoadResult, LoadAddr, *MMO);
1257   return LoadResult;
1258 }
1259 
1260 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1261   MachineInstr &MI, MachineRegisterInfo &MRI,
1262   MachineIRBuilder &B) const {
1263   MachineFunction &MF = B.getMF();
1264 
1265   B.setInstr(MI);
1266 
1267   const LLT S32 = LLT::scalar(32);
1268   Register Dst = MI.getOperand(0).getReg();
1269   Register Src = MI.getOperand(1).getReg();
1270 
1271   LLT DstTy = MRI.getType(Dst);
1272   LLT SrcTy = MRI.getType(Src);
1273   unsigned DestAS = DstTy.getAddressSpace();
1274   unsigned SrcAS = SrcTy.getAddressSpace();
1275 
1276   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1277   // vector element.
1278   assert(!DstTy.isVector());
1279 
1280   const AMDGPUTargetMachine &TM
1281     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1282 
1283   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1284   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1285     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1286     return true;
1287   }
1288 
1289   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1290     // Truncate.
1291     B.buildExtract(Dst, Src, 0);
1292     MI.eraseFromParent();
1293     return true;
1294   }
1295 
1296   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1297     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1298     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1299 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointer values
    // into another pointer type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1303     auto HighAddr = B.buildConstant(
1304       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1305     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1306     MI.eraseFromParent();
1307     return true;
1308   }
1309 
1310   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1311     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1312            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1313     unsigned NullVal = TM.getNullPointerValue(DestAS);
1314 
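    // The segment pointer is the low 32 bits of the flat pointer, except that
    // a flat null must be mapped to the segment null value.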
1315     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1316     auto FlatNull = B.buildConstant(SrcTy, 0);
1317 
1318     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1319 
1320     // Extract low 32-bits of the pointer.
1321     B.buildExtract(PtrLo32, Src, 0);
1322 
1323     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1324     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1325     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1326 
1327     MI.eraseFromParent();
1328     return true;
1329   }
1330 
1331   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1332     return false;
1333 
1334   if (!ST.hasFlatAddressSpace())
1335     return false;
1336 
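  // Cast local/private -> flat: the 32-bit segment offset becomes the low
  // half of the flat pointer and the segment aperture base the high half,
  // with the segment null value mapped to the flat null value.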
1337   auto SegmentNull =
1338       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1339   auto FlatNull =
1340       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1341 
1342   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1343   if (!ApertureReg.isValid())
1344     return false;
1345 
1346   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1347   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1348 
1349   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1350 
1351   // Coerce the type of the low half of the result so we can use merge_values.
1352   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1353   B.buildInstr(TargetOpcode::G_PTRTOINT)
1354     .addDef(SrcAsInt)
1355     .addUse(Src);
1356 
1357   // TODO: Should we allow mismatched types but matching sizes in merges to
1358   // avoid the ptrtoint?
1359   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1360   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1361 
1362   MI.eraseFromParent();
1363   return true;
1364 }
1365 
1366 bool AMDGPULegalizerInfo::legalizeFrint(
1367   MachineInstr &MI, MachineRegisterInfo &MRI,
1368   MachineIRBuilder &B) const {
1369   B.setInstr(MI);
1370 
1371   Register Src = MI.getOperand(1).getReg();
1372   LLT Ty = MRI.getType(Src);
1373   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1374 
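  // Round to integer by adding and subtracting a copysigned 2^52; the add
  // forces any fraction bits out of the double's mantissa. Values with a
  // magnitude greater than 0x1.fffffffffffffp+51 are already integral and are
  // passed through unchanged.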
1375   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1376   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1377 
1378   auto C1 = B.buildFConstant(Ty, C1Val);
1379   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1380 
1381   // TODO: Should this propagate fast-math-flags?
1382   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1383   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1384 
1385   auto C2 = B.buildFConstant(Ty, C2Val);
1386   auto Fabs = B.buildFAbs(Ty, Src);
1387 
1388   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1389   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1390   return true;
1391 }
1392 
1393 bool AMDGPULegalizerInfo::legalizeFceil(
1394   MachineInstr &MI, MachineRegisterInfo &MRI,
1395   MachineIRBuilder &B) const {
1396   B.setInstr(MI);
1397 
1398   const LLT S1 = LLT::scalar(1);
1399   const LLT S64 = LLT::scalar(64);
1400 
1401   Register Src = MI.getOperand(1).getReg();
1402   assert(MRI.getType(Src) == S64);
1403 
1404   // result = trunc(src)
1405   // if (src > 0.0 && src != result)
1406   //   result += 1.0
1407 
1408   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1409 
1410   const auto Zero = B.buildFConstant(S64, 0.0);
1411   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1415   auto Add = B.buildSelect(S64, And, One, Zero);
1416 
1417   // TODO: Should this propagate fast-math-flags?
1418   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1419   return true;
1420 }
1421 
1422 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1423                                               MachineIRBuilder &B) {
1424   const unsigned FractBits = 52;
1425   const unsigned ExpBits = 11;
1426   LLT S32 = LLT::scalar(32);
1427 
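  // Use ubfe to extract the 11 exponent bits starting at bit 20 of the high
  // dword (bit 52 of the double), then subtract the exponent bias of 1023.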
1428   auto Const0 = B.buildConstant(S32, FractBits - 32);
1429   auto Const1 = B.buildConstant(S32, ExpBits);
1430 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1434 
1435   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1436 }
1437 
1438 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1439   MachineInstr &MI, MachineRegisterInfo &MRI,
1440   MachineIRBuilder &B) const {
1441   B.setInstr(MI);
1442 
1443   const LLT S1 = LLT::scalar(1);
1444   const LLT S32 = LLT::scalar(32);
1445   const LLT S64 = LLT::scalar(64);
1446 
1447   Register Src = MI.getOperand(1).getReg();
1448   assert(MRI.getType(Src) == S64);
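  // Lower f64 G_INTRINSIC_TRUNC with integer bit manipulation:
  //   exp = unbiased exponent of src
  //   exp < 0:   result = copysign(0.0, src)  (|src| < 1.0)
  //   exp > 51:  result = src                 (already an integer)
  //   otherwise: clear the low (52 - exp) fraction bits of src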
1449 
1450   // TODO: Should this use extract since the low half is unused?
1451   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1452   Register Hi = Unmerge.getReg(1);
1453 
1454   // Extract the upper half, since this is where we will find the sign and
1455   // exponent.
1456   auto Exp = extractF64Exponent(Hi, B);
1457 
1458   const unsigned FractBits = 52;
1459 
1460   // Extract the sign bit.
1461   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1462   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1463 
1464   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1465 
1466   const auto Zero32 = B.buildConstant(S32, 0);
1467 
1468   // Extend back to 64-bits.
1469   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1470 
1471   auto Shr = B.buildAShr(S64, FractMask, Exp);
1472   auto Not = B.buildNot(S64, Shr);
1473   auto Tmp0 = B.buildAnd(S64, Src, Not);
1474   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1475 
1476   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1477   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1478 
1479   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1482 }
1483 
1484 bool AMDGPULegalizerInfo::legalizeITOFP(
1485   MachineInstr &MI, MachineRegisterInfo &MRI,
1486   MachineIRBuilder &B, bool Signed) const {
1487   B.setInstr(MI);
1488 
1489   Register Dst = MI.getOperand(0).getReg();
1490   Register Src = MI.getOperand(1).getReg();
1491 
1492   const LLT S64 = LLT::scalar(64);
1493   const LLT S32 = LLT::scalar(32);
1494 
1495   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
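  // Convert the two 32-bit halves separately and recombine:
  //   result = ldexp((s|u)itofp(hi), 32) + uitofp(lo)
  // Only the high half carries the sign for G_SITOFP; the low half is always
  // converted as unsigned.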
1496 
1497   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1498 
1499   auto CvtHi = Signed ?
1500     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1501     B.buildUITOFP(S64, Unmerge.getReg(1));
1502 
1503   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1504 
1505   auto ThirtyTwo = B.buildConstant(S32, 32);
1506   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1507     .addUse(CvtHi.getReg(0))
1508     .addUse(ThirtyTwo.getReg(0));
1509 
1510   // TODO: Should this propagate fast-math-flags?
1511   B.buildFAdd(Dst, LdExp, CvtLo);
1512   MI.eraseFromParent();
1513   return true;
1514 }
1515 
1516 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1517   MachineInstr &MI, MachineRegisterInfo &MRI,
1518   MachineIRBuilder &B) const {
1519   MachineFunction &MF = B.getMF();
1520   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1521 
1522   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1523                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1524 
1525   // With ieee_mode disabled, the instructions have the correct behavior
1526   // already for G_FMINNUM/G_FMAXNUM
1527   if (!MFI->getMode().IEEE)
1528     return !IsIEEEOp;
1529 
1530   if (IsIEEEOp)
1531     return true;
1532 
1533   MachineIRBuilder HelperBuilder(MI);
1534   GISelObserverWrapper DummyObserver;
1535   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1536   HelperBuilder.setInstr(MI);
1537   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1538 }
1539 
1540 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1541   MachineInstr &MI, MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   // TODO: Should move some of this into LegalizerHelper.
1544 
1545   // TODO: Promote dynamic indexing of s16 to s32
1546   // TODO: Dynamic s64 indexing is only legal for SGPR.
1547   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1548   if (!IdxVal) // Dynamic case will be selected to register indexing.
1549     return true;
1550 
1551   Register Dst = MI.getOperand(0).getReg();
1552   Register Vec = MI.getOperand(1).getReg();
1553 
1554   LLT VecTy = MRI.getType(Vec);
1555   LLT EltTy = VecTy.getElementType();
1556   assert(EltTy == MRI.getType(Dst));
1557 
1558   B.setInstr(MI);
1559 
1560   if (IdxVal.getValue() < VecTy.getNumElements())
1561     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1562   else
1563     B.buildUndef(Dst);
1564 
1565   MI.eraseFromParent();
1566   return true;
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   // TODO: Should move some of this into LegalizerHelper.
1573 
1574   // TODO: Promote dynamic indexing of s16 to s32
1575   // TODO: Dynamic s64 indexing is only legal for SGPR.
1576   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1577   if (!IdxVal) // Dynamic case will be selected to register indexing.
1578     return true;
1579 
1580   Register Dst = MI.getOperand(0).getReg();
1581   Register Vec = MI.getOperand(1).getReg();
1582   Register Ins = MI.getOperand(2).getReg();
1583 
1584   LLT VecTy = MRI.getType(Vec);
1585   LLT EltTy = VecTy.getElementType();
1586   assert(EltTy == MRI.getType(Ins));
1587 
1588   B.setInstr(MI);
1589 
1590   if (IdxVal.getValue() < VecTy.getNumElements())
1591     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1592   else
1593     B.buildUndef(Dst);
1594 
1595   MI.eraseFromParent();
1596   return true;
1597 }
1598 
1599 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1600   assert(Mask.size() == 2);
1601 
1602   // If one half is undef, the other is trivially in the same reg.
1603   if (Mask[0] == -1 || Mask[1] == -1)
1604     return true;
1605   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1606          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1607 }
1608 
1609 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1610   MachineInstr &MI, MachineRegisterInfo &MRI,
1611   MachineIRBuilder &B) const {
1612   const LLT V2S16 = LLT::vector(2, 16);
1613 
1614   Register Dst = MI.getOperand(0).getReg();
1615   Register Src0 = MI.getOperand(1).getReg();
1616   LLT DstTy = MRI.getType(Dst);
1617   LLT SrcTy = MRI.getType(Src0);
1618 
1619   if (SrcTy == V2S16 && DstTy == V2S16 &&
1620       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1621     return true;
1622 
1623   MachineIRBuilder HelperBuilder(MI);
1624   GISelObserverWrapper DummyObserver;
1625   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1626   HelperBuilder.setInstr(MI);
1627   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeSinCos(
1631   MachineInstr &MI, MachineRegisterInfo &MRI,
1632   MachineIRBuilder &B) const {
1633   B.setInstr(MI);
1634 
1635   Register DstReg = MI.getOperand(0).getReg();
1636   Register SrcReg = MI.getOperand(1).getReg();
1637   LLT Ty = MRI.getType(DstReg);
1638   unsigned Flags = MI.getFlags();
1639 
1640   Register TrigVal;
1641   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1642   if (ST.hasTrigReducedRange()) {
1643     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1644     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1645       .addUse(MulVal.getReg(0))
1646       .setMIFlags(Flags).getReg(0);
1647   } else
1648     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1649 
1650   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1651     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1652   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1653     .addUse(TrigVal)
1654     .setMIFlags(Flags);
1655   MI.eraseFromParent();
1656   return true;
1657 }
1658 
1659 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1660   Register DstReg, LLT PtrTy,
1661   MachineIRBuilder &B, const GlobalValue *GV,
1662   unsigned Offset, unsigned GAFlags) const {
1663   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1664   // to the following code sequence:
1665   //
1666   // For constant address space:
1667   //   s_getpc_b64 s[0:1]
1668   //   s_add_u32 s0, s0, $symbol
1669   //   s_addc_u32 s1, s1, 0
1670   //
1671   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1672   //   a fixup or relocation is emitted to replace $symbol with a literal
1673   //   constant, which is a pc-relative offset from the encoding of the $symbol
1674   //   operand to the global variable.
1675   //
1676   // For global address space:
1677   //   s_getpc_b64 s[0:1]
1678   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1679   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1680   //
1681   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1682   //   fixups or relocations are emitted to replace $symbol@*@lo and
1683   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1684   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1685   //   operand to the global variable.
1686   //
1687   // What we want here is an offset from the value returned by s_getpc
1688   // (which is the address of the s_add_u32 instruction) to the global
1689   // variable, but since the encoding of $symbol starts 4 bytes after the start
1690   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1691   // small. This requires us to add 4 to the global variable offset in order to
1692   // compute the correct address.
1693 
1694   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1695 
1696   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1697     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
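  // SI_PC_ADD_REL_OFFSET always produces a 64-bit address, so for 32-bit
  // pointer types PCReg is a temporary and the low half is extracted at the
  // end.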
1698 
1699   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1700     .addDef(PCReg);
1701 
1702   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1703   if (GAFlags == SIInstrInfo::MO_NONE)
1704     MIB.addImm(0);
1705   else
1706     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1707 
1708   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1709 
1710   if (PtrTy.getSizeInBits() == 32)
1711     B.buildExtract(DstReg, PCReg, 0);
1712   return true;
1713  }
1714 
1715 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1716   MachineInstr &MI, MachineRegisterInfo &MRI,
1717   MachineIRBuilder &B) const {
1718   Register DstReg = MI.getOperand(0).getReg();
1719   LLT Ty = MRI.getType(DstReg);
1720   unsigned AS = Ty.getAddressSpace();
1721 
1722   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1723   MachineFunction &MF = B.getMF();
1724   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1725   B.setInstr(MI);
1726 
1727   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1728     if (!MFI->isEntryFunction()) {
1729       const Function &Fn = MF.getFunction();
1730       DiagnosticInfoUnsupported BadLDSDecl(
1731         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1732       Fn.getContext().diagnose(BadLDSDecl);
1733     }
1734 
1735     // TODO: We could emit code to handle the initialization somewhere.
1736     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1737       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1738       MI.eraseFromParent();
1739       return true;
1740     }
1741 
1742     const Function &Fn = MF.getFunction();
1743     DiagnosticInfoUnsupported BadInit(
1744       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1745     Fn.getContext().diagnose(BadInit);
1746     return true;
1747   }
1748 
1749   const SITargetLowering *TLI = ST.getTargetLowering();
1750 
1751   if (TLI->shouldEmitFixup(GV)) {
1752     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1753     MI.eraseFromParent();
1754     return true;
1755   }
1756 
1757   if (TLI->shouldEmitPCReloc(GV)) {
1758     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1759     MI.eraseFromParent();
1760     return true;
1761   }
1762 
1763   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1764   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1765 
1766   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1767     MachinePointerInfo::getGOT(MF),
1768     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1769     MachineMemOperand::MOInvariant,
1770     8 /*Size*/, 8 /*Align*/);
1771 
1772   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1773 
1774   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1776     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1777     B.buildExtract(DstReg, Load, 0);
1778   } else
1779     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1780 
1781   MI.eraseFromParent();
1782   return true;
1783 }
1784 
1785 bool AMDGPULegalizerInfo::legalizeLoad(
1786   MachineInstr &MI, MachineRegisterInfo &MRI,
1787   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1788   B.setInstr(MI);
1789   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1790   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1791   Observer.changingInstr(MI);
1792   MI.getOperand(1).setReg(Cast.getReg(0));
1793   Observer.changedInstr(MI);
1794   return true;
1795 }
1796 
1797 bool AMDGPULegalizerInfo::legalizeFMad(
1798   MachineInstr &MI, MachineRegisterInfo &MRI,
1799   MachineIRBuilder &B) const {
1800   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1801   assert(Ty.isScalar());
1802 
1803   MachineFunction &MF = B.getMF();
1804   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1805 
1806   // TODO: Always legal with future ftz flag.
1807   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1808     return true;
1809   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
1814   GISelObserverWrapper DummyObserver;
1815   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1816   HelperBuilder.setMBB(*MI.getParent());
1817   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1818 }
1819 
1820 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1821   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1822   Register DstReg = MI.getOperand(0).getReg();
1823   Register PtrReg = MI.getOperand(1).getReg();
1824   Register CmpVal = MI.getOperand(2).getReg();
1825   Register NewVal = MI.getOperand(3).getReg();
1826 
1827   assert(SITargetLowering::isFlatGlobalAddrSpace(
1828            MRI.getType(PtrReg).getAddressSpace()) &&
1829          "this should not have been custom lowered");
1830 
1831   LLT ValTy = MRI.getType(CmpVal);
1832   LLT VecTy = LLT::vector(2, ValTy);
1833 
1834   B.setInstr(MI);
1835   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1836 
1837   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1838     .addDef(DstReg)
1839     .addUse(PtrReg)
1840     .addUse(PackedVal)
1841     .setMemRefs(MI.memoperands());
1842 
1843   MI.eraseFromParent();
1844   return true;
1845 }
1846 
// Return the branch instruction that uses the condition output, or null if
// the usage is invalid.
1848 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1849                                        MachineRegisterInfo &MRI,
1850                                        MachineInstr *&Br) {
1851   Register CondDef = MI.getOperand(0).getReg();
1852   if (!MRI.hasOneNonDBGUse(CondDef))
1853     return nullptr;
1854 
1855   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1856   if (UseMI.getParent() != MI.getParent() ||
1857       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1858     return nullptr;
1859 
1860   // Make sure the cond br is followed by a G_BR
1861   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1862   if (Next != MI.getParent()->end()) {
1863     if (Next->getOpcode() != AMDGPU::G_BR)
1864       return nullptr;
1865     Br = &*Next;
1866   }
1867 
1868   return &UseMI;
1869 }
1870 
1871 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1872                                                 Register Reg, LLT Ty) const {
1873   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1874   if (LiveIn)
1875     return LiveIn;
1876 
1877   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1878   MRI.addLiveIn(Reg, NewReg);
1879   return NewReg;
1880 }
1881 
1882 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1883                                          const ArgDescriptor *Arg) const {
1884   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1885     return false; // TODO: Handle these
1886 
1887   assert(Arg->getRegister().isPhysical());
1888 
1889   MachineRegisterInfo &MRI = *B.getMRI();
1890 
1891   LLT Ty = MRI.getType(DstReg);
1892   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1893 
1894   if (Arg->isMasked()) {
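    // A masked argument is a bitfield packed into a wider preloaded register;
    // shift it down and mask it off to recover the value.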
1895     // TODO: Should we try to emit this once in the entry block?
1896     const LLT S32 = LLT::scalar(32);
1897     const unsigned Mask = Arg->getMask();
1898     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1899 
1900     Register AndMaskSrc = LiveIn;
1901 
1902     if (Shift != 0) {
1903       auto ShiftAmt = B.buildConstant(S32, Shift);
1904       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1905     }
1906 
1907     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1908   } else
1909     B.buildCopy(DstReg, LiveIn);
1910 
  // Insert the argument copy if it doesn't already exist.
1912   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1913   if (!MRI.getVRegDef(LiveIn)) {
1914     // FIXME: Should have scoped insert pt
1915     MachineBasicBlock &OrigInsBB = B.getMBB();
1916     auto OrigInsPt = B.getInsertPt();
1917 
1918     MachineBasicBlock &EntryMBB = B.getMF().front();
1919     EntryMBB.addLiveIn(Arg->getRegister());
1920     B.setInsertPt(EntryMBB, EntryMBB.begin());
1921     B.buildCopy(LiveIn, Arg->getRegister());
1922 
1923     B.setInsertPt(OrigInsBB, OrigInsPt);
1924   }
1925 
1926   return true;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1930   MachineInstr &MI,
1931   MachineRegisterInfo &MRI,
1932   MachineIRBuilder &B,
1933   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1934   B.setInstr(MI);
1935 
1936   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1937 
1938   const ArgDescriptor *Arg;
1939   const TargetRegisterClass *RC;
1940   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1941   if (!Arg) {
1942     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1943     return false;
1944   }
1945 
1946   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1947     MI.eraseFromParent();
1948     return true;
1949   }
1950 
1951   return false;
1952 }
1953 
1954 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1955                                        MachineRegisterInfo &MRI,
1956                                        MachineIRBuilder &B) const {
1957   B.setInstr(MI);
1958   Register Dst = MI.getOperand(0).getReg();
1959   LLT DstTy = MRI.getType(Dst);
1960   LLT S16 = LLT::scalar(16);
1961   LLT S32 = LLT::scalar(32);
1962   LLT S64 = LLT::scalar(64);
1963 
1964   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1965     return true;
1966 
1967   if (DstTy == S16)
1968     return legalizeFDIV16(MI, MRI, B);
1969   if (DstTy == S32)
1970     return legalizeFDIV32(MI, MRI, B);
1971   if (DstTy == S64)
1972     return legalizeFDIV64(MI, MRI, B);
1973 
1974   return false;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1978                                                  MachineRegisterInfo &MRI,
1979                                                  MachineIRBuilder &B) const {
1980   Register Res = MI.getOperand(0).getReg();
1981   Register LHS = MI.getOperand(1).getReg();
1982   Register RHS = MI.getOperand(2).getReg();
1983 
1984   uint16_t Flags = MI.getFlags();
1985 
1986   LLT ResTy = MRI.getType(Res);
1987   LLT S32 = LLT::scalar(32);
1988   LLT S64 = LLT::scalar(64);
1989 
1990   const MachineFunction &MF = B.getMF();
1991   bool Unsafe =
1992     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1993 
1994   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1995     return false;
1996 
1997   if (!Unsafe && ResTy == S32 &&
1998       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1999     return false;
2000 
2001   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2002     // 1 / x -> RCP(x)
2003     if (CLHS->isExactlyValue(1.0)) {
2004       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2005         .addUse(RHS)
2006         .setMIFlags(Flags);
2007 
2008       MI.eraseFromParent();
2009       return true;
2010     }
2011 
2012     // -1 / x -> RCP( FNEG(x) )
2013     if (CLHS->isExactlyValue(-1.0)) {
2014       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2015       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2016         .addUse(FNeg.getReg(0))
2017         .setMIFlags(Flags);
2018 
2019       MI.eraseFromParent();
2020       return true;
2021     }
2022   }
2023 
2024   // x / y -> x * (1.0 / y)
2025   if (Unsafe) {
2026     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2027       .addUse(RHS)
2028       .setMIFlags(Flags);
2029     B.buildFMul(Res, LHS, RCP, Flags);
2030 
2031     MI.eraseFromParent();
2032     return true;
2033   }
2034 
2035   return false;
2036 }
2037 
2038 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2039                                          MachineRegisterInfo &MRI,
2040                                          MachineIRBuilder &B) const {
2041   B.setInstr(MI);
2042   Register Res = MI.getOperand(0).getReg();
2043   Register LHS = MI.getOperand(1).getReg();
2044   Register RHS = MI.getOperand(2).getReg();
2045 
2046   uint16_t Flags = MI.getFlags();
2047 
2048   LLT S16 = LLT::scalar(16);
2049   LLT S32 = LLT::scalar(32);
2050 
2051   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2052   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2053 
2054   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2055     .addUse(RHSExt.getReg(0))
2056     .setMIFlags(Flags);
2057 
2058   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2059   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2060 
2061   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2062     .addUse(RDst.getReg(0))
2063     .addUse(RHS)
2064     .addUse(LHS)
2065     .setMIFlags(Flags);
2066 
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, emit instructions to disable it.
2073 static void toggleSPDenormMode(bool Enable,
2074                                MachineIRBuilder &B,
2075                                const GCNSubtarget &ST,
2076                                AMDGPU::SIModeRegisterDefaults Mode) {
2077   // Set SP denorm mode to this value.
2078   unsigned SPDenormMode =
2079     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2080 
2081   if (ST.hasDenormModeInst()) {
2082     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2083     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2084                                    ? FP_DENORM_FLUSH_NONE
2085                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
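    // The s_denorm_mode immediate packs the FP32 mode in bits [1:0] and the
    // FP64/FP16 mode in bits [3:2], hence the shift by 2 below.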
2086 
2087     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2088     B.buildInstr(AMDGPU::S_DENORM_MODE)
2089       .addImm(NewDenormModeValue);
2090 
2091   } else {
2092     // Select FP32 bit field in mode register.
2093     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2094                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2095                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2096 
2097     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2098       .addImm(SPDenormMode)
2099       .addImm(SPDenormModeBitField);
2100   }
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2104                                          MachineRegisterInfo &MRI,
2105                                          MachineIRBuilder &B) const {
2106   B.setInstr(MI);
2107   Register Res = MI.getOperand(0).getReg();
2108   Register LHS = MI.getOperand(1).getReg();
2109   Register RHS = MI.getOperand(2).getReg();
2110   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2111   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2112 
2113   uint16_t Flags = MI.getFlags();
2114 
2115   LLT S32 = LLT::scalar(32);
2116   LLT S1 = LLT::scalar(1);
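  // This follows the usual AMDGPU f32 division expansion: scale the operands
  // with div_scale, refine an initial rcp of the scaled denominator with
  // fma-based Newton-Raphson iterations, then let div_fmas and div_fixup apply
  // the final scaling and special-case handling.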
2117 
2118   auto One = B.buildFConstant(S32, 1.0f);
2119 
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);
2132 
2133   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2134     .addUse(DenominatorScaled.getReg(0))
2135     .setMIFlags(Flags);
2136   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2137 
2138   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2139   // aren't modeled as reading it.
2140   if (!Mode.FP32Denormals)
2141     toggleSPDenormMode(true, B, ST, Mode);
2142 
2143   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2144   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2145   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2146   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2147   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2148   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2149 
2150   if (!Mode.FP32Denormals)
2151     toggleSPDenormMode(false, B, ST, Mode);
2152 
2153   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2154     .addUse(Fma4.getReg(0))
2155     .addUse(Fma1.getReg(0))
2156     .addUse(Fma3.getReg(0))
2157     .addUse(NumeratorScaled.getReg(1))
2158     .setMIFlags(Flags);
2159 
2160   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2161     .addUse(Fmas.getReg(0))
2162     .addUse(RHS)
2163     .addUse(LHS)
2164     .setMIFlags(Flags);
2165 
2166   MI.eraseFromParent();
2167   return true;
2168 }
2169 
2170 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2171                                          MachineRegisterInfo &MRI,
2172                                          MachineIRBuilder &B) const {
2173   B.setInstr(MI);
2174   Register Res = MI.getOperand(0).getReg();
2175   Register LHS = MI.getOperand(1).getReg();
2176   Register RHS = MI.getOperand(2).getReg();
2177 
2178   uint16_t Flags = MI.getFlags();
2179 
2180   LLT S64 = LLT::scalar(64);
2181   LLT S1 = LLT::scalar(1);
2182 
2183   auto One = B.buildFConstant(S64, 1.0);
2184 
2185   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2186     .addUse(LHS)
2187     .addUse(RHS)
    .addImm(0)
2189     .setMIFlags(Flags);
2190 
2191   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2192 
2193   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2194     .addUse(DivScale0.getReg(0))
2195     .setMIFlags(Flags);
2196 
2197   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2198   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2199   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2200 
2201   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2202     .addUse(LHS)
2203     .addUse(RHS)
    .addImm(1)
2205     .setMIFlags(Flags);
2206 
2207   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2209   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2210 
2211   Register Scale;
2212   if (!ST.hasUsableDivScaleConditionOutput()) {
2213     // Workaround a hardware bug on SI where the condition output from div_scale
2214     // is not usable.
2215 
2216     Scale = MRI.createGenericVirtualRegister(S1);
2217 
2218     LLT S32 = LLT::scalar(32);
2219 
2220     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2221     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2222     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2223     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2224 
2225     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2226                               Scale1Unmerge.getReg(1));
2227     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2228                               Scale0Unmerge.getReg(1));
2229     B.buildXor(Scale, CmpNum, CmpDen);
2230   } else {
2231     Scale = DivScale1.getReg(1);
2232   }
2233 
2234   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2235     .addUse(Fma4.getReg(0))
2236     .addUse(Fma3.getReg(0))
2237     .addUse(Mul.getReg(0))
2238     .addUse(Scale)
2239     .setMIFlags(Flags);
2240 
2241   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2242     .addUse(Fmas.getReg(0))
2243     .addUse(RHS)
2244     .addUse(LHS)
2245     .setMIFlags(Flags);
2246 
2247   MI.eraseFromParent();
2248   return true;
2249 }
2250 
2251 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2252                                                  MachineRegisterInfo &MRI,
2253                                                  MachineIRBuilder &B) const {
2254   B.setInstr(MI);
2255   Register Res = MI.getOperand(0).getReg();
2256   Register LHS = MI.getOperand(2).getReg();
2257   Register RHS = MI.getOperand(3).getReg();
2258   uint16_t Flags = MI.getFlags();
2259 
2260   LLT S32 = LLT::scalar(32);
2261   LLT S1 = LLT::scalar(1);
2262 
2263   auto Abs = B.buildFAbs(S32, RHS, Flags);
2264   const APFloat C0Val(1.0f);
2265 
2266   auto C0 = B.buildConstant(S32, 0x6f800000);
2267   auto C1 = B.buildConstant(S32, 0x2f800000);
2268   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
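  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: if |RHS| exceeds 2^96 the
  // denominator is pre-scaled by 2^-32 so its reciprocal stays representable,
  // and the final multiply by the same scale factor restores the quotient.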
2269 
2270   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2271   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2272 
2273   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2274 
2275   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2276     .addUse(Mul0.getReg(0))
2277     .setMIFlags(Flags);
2278 
2279   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2280 
2281   B.buildFMul(Res, Sel, Mul1, Flags);
2282 
2283   MI.eraseFromParent();
2284   return true;
2285 }
2286 
2287 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2288                                                  MachineRegisterInfo &MRI,
2289                                                  MachineIRBuilder &B) const {
2290   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2291   if (!MFI->isEntryFunction()) {
2292     return legalizePreloadedArgIntrin(MI, MRI, B,
2293                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2294   }
2295 
2296   B.setInstr(MI);
2297 
2298   uint64_t Offset =
2299     ST.getTargetLowering()->getImplicitParameterOffset(
2300       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2301   Register DstReg = MI.getOperand(0).getReg();
2302   LLT DstTy = MRI.getType(DstReg);
2303   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2304 
2305   const ArgDescriptor *Arg;
2306   const TargetRegisterClass *RC;
2307   std::tie(Arg, RC)
2308     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2309   if (!Arg)
2310     return false;
2311 
2312   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2313   if (!loadInputValue(KernargPtrReg, B, Arg))
2314     return false;
2315 
2316   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2317   MI.eraseFromParent();
2318   return true;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2322                                               MachineRegisterInfo &MRI,
2323                                               MachineIRBuilder &B,
2324                                               unsigned AddrSpace) const {
2325   B.setInstr(MI);
2326   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
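  // A flat pointer lies in the queried segment iff the high 32 bits of the
  // address equal that segment's aperture base.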
2327   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2328   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2329   MI.eraseFromParent();
2330   return true;
2331 }
2332 
2333 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2334 // offset (the offset that is included in bounds checking and swizzling, to be
2335 // split between the instruction's voffset and immoffset fields) and soffset
2336 // (the offset that is excluded from bounds checking and swizzling, to go in
2337 // the instruction's soffset field).  This function takes the first kind of
2338 // offset and figures out how to split it between voffset and immoffset.
2339 std::tuple<Register, unsigned, unsigned>
2340 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2341                                         Register OrigOffset) const {
2342   const unsigned MaxImm = 4095;
2343   Register BaseReg;
2344   unsigned TotalConstOffset;
2345   MachineInstr *OffsetDef;
2346   const LLT S32 = LLT::scalar(32);
2347 
2348   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2349     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2350 
2351   unsigned ImmOffset = TotalConstOffset;
2352 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits (value & 4095) in the immoffset field, so that the value that
  // is copied/added for the voffset field is a multiple of 4096. That value
  // then stands a better chance of being CSEd with the copy/add for another
  // similar load/store.
  // However, do not round down to a multiple of 4096 if that would leave a
  // negative value, as it appears to be illegal to have a negative offset in
  // the vgpr, even if adding the immediate offset makes it positive.
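  // For example, a constant offset of 8200 is split into ImmOffset = 8 plus a
  // voffset contribution of 8192, while an offset of 4095 or less stays
  // entirely in the immoffset field.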
2360   unsigned Overflow = ImmOffset & ~MaxImm;
2361   ImmOffset -= Overflow;
2362   if ((int32_t)Overflow < 0) {
2363     Overflow += ImmOffset;
2364     ImmOffset = 0;
2365   }
2366 
2367   if (Overflow != 0) {
2368     if (!BaseReg) {
2369       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2370     } else {
2371       auto OverflowVal = B.buildConstant(S32, Overflow);
2372       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2373     }
2374   }
2375 
2376   if (!BaseReg)
2377     BaseReg = B.buildConstant(S32, 0).getReg(0);
2378 
2379   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2380 }
2381 
2382 /// Handle register layout difference for f16 images for some subtargets.
2383 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2384                                              MachineRegisterInfo &MRI,
2385                                              Register Reg) const {
2386   if (!ST.hasUnpackedD16VMem())
2387     return Reg;
2388 
2389   const LLT S16 = LLT::scalar(16);
2390   const LLT S32 = LLT::scalar(32);
2391   LLT StoreVT = MRI.getType(Reg);
2392   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
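  // On subtargets with unpacked D16 memory instructions, each 16-bit element
  // occupies its own 32-bit register, so e.g. a <2 x s16> source is rewritten
  // as a <2 x s32> of any-extended halves.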
2393 
2394   auto Unmerge = B.buildUnmerge(S16, Reg);
2395 
2396   SmallVector<Register, 4> WideRegs;
2397   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2398     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2399 
2400   int NumElts = StoreVT.getNumElements();
2401 
2402   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2403 }
2404 
2405 Register AMDGPULegalizerInfo::fixStoreSourceType(
2406   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2407   MachineRegisterInfo *MRI = B.getMRI();
2408   LLT Ty = MRI->getType(VData);
2409 
2410   const LLT S16 = LLT::scalar(16);
2411 
2412   // Fixup illegal register types for i8 stores.
2413   if (Ty == LLT::scalar(8) || Ty == S16) {
2414     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2415     return AnyExt;
2416   }
2417 
2418   if (Ty.isVector()) {
2419     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2420       if (IsFormat)
2421         return handleD16VData(B, *MRI, VData);
2422     }
2423   }
2424 
2425   return VData;
2426 }
2427 
2428 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2429                                               MachineRegisterInfo &MRI,
2430                                               MachineIRBuilder &B,
2431                                               bool IsTyped,
2432                                               bool IsFormat) const {
2433   B.setInstr(MI);
2434 
2435   Register VData = MI.getOperand(1).getReg();
2436   LLT Ty = MRI.getType(VData);
2437   LLT EltTy = Ty.getScalarType();
2438   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2439   const LLT S32 = LLT::scalar(32);
2440 
2441   VData = fixStoreSourceType(B, VData, IsFormat);
2442   Register RSrc = MI.getOperand(2).getReg();
2443 
2444   MachineMemOperand *MMO = *MI.memoperands_begin();
2445   const int MemSize = MMO->getSize();
2446 
2447   unsigned ImmOffset;
2448   unsigned TotalOffset;
2449 
2450   // The typed intrinsics add an immediate after the registers.
2451   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2452 
2453   // The struct intrinsic variants add one additional operand over raw.
2454   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2455   Register VIndex;
2456   int OpOffset = 0;
2457   if (HasVIndex) {
2458     VIndex = MI.getOperand(3).getReg();
2459     OpOffset = 1;
2460   }
2461 
2462   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2463   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2464 
2465   unsigned Format = 0;
2466   if (IsTyped) {
2467     Format = MI.getOperand(5 + OpOffset).getImm();
2468     ++OpOffset;
2469   }
2470 
2471   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2472 
2473   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2474   if (TotalOffset != 0)
2475     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2476 
2477   unsigned Opc;
2478   if (IsTyped) {
2479     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2480                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2481   } else if (IsFormat) {
2482     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2483                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2484   } else {
2485     switch (MemSize) {
2486     case 1:
2487       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2488       break;
2489     case 2:
2490       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2491       break;
2492     default:
2493       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2494       break;
2495     }
2496   }
2497 
2498   if (!VIndex)
2499     VIndex = B.buildConstant(S32, 0).getReg(0);
2500 
2501   auto MIB = B.buildInstr(Opc)
2502     .addUse(VData)              // vdata
2503     .addUse(RSrc)               // rsrc
2504     .addUse(VIndex)             // vindex
2505     .addUse(VOffset)            // voffset
2506     .addUse(SOffset)            // soffset
2507     .addImm(ImmOffset);         // offset(imm)
2508 
2509   if (IsTyped)
2510     MIB.addImm(Format);
2511 
2512   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2513      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2514      .addMemOperand(MMO);
2515 
2516   MI.eraseFromParent();
2517   return true;
2518 }
2519 
2520 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2521                                              MachineRegisterInfo &MRI,
2522                                              MachineIRBuilder &B,
2523                                              bool IsFormat,
2524                                              bool IsTyped) const {
2525   B.setInstr(MI);
2526 
2527   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2528   MachineMemOperand *MMO = *MI.memoperands_begin();
2529   const int MemSize = MMO->getSize();
2530   const LLT S32 = LLT::scalar(32);
2531 
2532   Register Dst = MI.getOperand(0).getReg();
2533   Register RSrc = MI.getOperand(2).getReg();
2534 
2535   // The typed intrinsics add an immediate after the registers.
2536   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2537 
2538   // The struct intrinsic variants add one additional operand over raw.
2539   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2540   Register VIndex;
2541   int OpOffset = 0;
2542   if (HasVIndex) {
2543     VIndex = MI.getOperand(3).getReg();
2544     OpOffset = 1;
2545   }
2546 
2547   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2548   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2549 
2550   unsigned Format = 0;
2551   if (IsTyped) {
2552     Format = MI.getOperand(5 + OpOffset).getImm();
2553     ++OpOffset;
2554   }
2555 
2556   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2557   unsigned ImmOffset;
2558   unsigned TotalOffset;
2559 
2560   LLT Ty = MRI.getType(Dst);
2561   LLT EltTy = Ty.getScalarType();
2562   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2563   const bool Unpacked = ST.hasUnpackedD16VMem();
2564 
2565   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2566   if (TotalOffset != 0)
2567     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2568 
2569   unsigned Opc;
2570 
2571   if (IsTyped) {
2572     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2573                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2574   } else if (IsFormat) {
2575     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2576                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2577   } else {
2578     switch (MemSize) {
2579     case 1:
2580       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2581       break;
2582     case 2:
2583       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2584       break;
2585     default:
2586       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2587       break;
2588     }
2589   }
2590 
2591   Register LoadDstReg;
2592 
2593   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2594   LLT UnpackedTy = Ty.changeElementSize(32);
2595 
2596   if (IsExtLoad)
2597     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2598   else if (Unpacked && IsD16 && Ty.isVector())
2599     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2600   else
2601     LoadDstReg = Dst;
2602 
2603   if (!VIndex)
2604     VIndex = B.buildConstant(S32, 0).getReg(0);
2605 
2606   auto MIB = B.buildInstr(Opc)
2607     .addDef(LoadDstReg)         // vdata
2608     .addUse(RSrc)               // rsrc
2609     .addUse(VIndex)             // vindex
2610     .addUse(VOffset)            // voffset
2611     .addUse(SOffset)            // soffset
2612     .addImm(ImmOffset);         // offset(imm)
2613 
2614   if (IsTyped)
2615     MIB.addImm(Format);
2616 
2617   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2618      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2619      .addMemOperand(MMO);
2620 
2621   if (LoadDstReg != Dst) {
2622     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2623 
    // The result for an extending load was widened; truncate back to the
    // original type.
2625     if (IsExtLoad)
2626       B.buildTrunc(Dst, LoadDstReg);
2627     else {
2628       // Repack to original 16-bit vector result
2629       // FIXME: G_TRUNC should work, but legalization currently fails
2630       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2631       SmallVector<Register, 4> Repack;
2632       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2633         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2634       B.buildMerge(Dst, Repack);
2635     }
2636   }
2637 
2638   MI.eraseFromParent();
2639   return true;
2640 }
2641 
2642 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2643                                                MachineIRBuilder &B,
2644                                                bool IsInc) const {
2645   B.setInstr(MI);
2646   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2647                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2648   B.buildInstr(Opc)
2649     .addDef(MI.getOperand(0).getReg())
2650     .addUse(MI.getOperand(2).getReg())
2651     .addUse(MI.getOperand(3).getReg())
2652     .cloneMemRefs(MI);
2653   MI.eraseFromParent();
2654   return true;
2655 }
2656 
2657 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2658   switch (IntrID) {
2659   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2660   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2661     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2662   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2663   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2664     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2665   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2666   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2667     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2668   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2669   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2670     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2671   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2672   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2673     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2674   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2675   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2676     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2677   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2678   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2679     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2680   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2681   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2682     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2683   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2684   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2685     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2686   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2687   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2688     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2689   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2690   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2691     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2692   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2693   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2694     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2695   default:
2696     llvm_unreachable("unhandled atomic opcode");
2697   }
2698 }
2699 
2700 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2701                                                MachineIRBuilder &B,
2702                                                Intrinsic::ID IID) const {
2703   B.setInstr(MI);
2704 
2705   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2706                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2707 
2708   Register Dst = MI.getOperand(0).getReg();
2709   Register VData = MI.getOperand(2).getReg();
2710 
2711   Register CmpVal;
2712   int OpOffset = 0;
2713 
2714   if (IsCmpSwap) {
2715     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2716     ++OpOffset;
2717   }
2718 
2719   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2720   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2721 
2722   // The struct intrinsic variants add one additional operand over raw.
2723   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2724   Register VIndex;
2725   if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
2727     ++OpOffset;
2728   }
2729 
2730   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2731   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2732   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2733 
2734   MachineMemOperand *MMO = *MI.memoperands_begin();
2735 
2736   unsigned ImmOffset;
2737   unsigned TotalOffset;
2738   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2739   if (TotalOffset != 0)
2740     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2741 
2742   if (!VIndex)
2743     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2744 
2745   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2746     .addDef(Dst)
2747     .addUse(VData); // vdata
2748 
2749   if (IsCmpSwap)
2750     MIB.addReg(CmpVal);
2751 
2752   MIB.addUse(RSrc)               // rsrc
2753      .addUse(VIndex)             // vindex
2754      .addUse(VOffset)            // voffset
2755      .addUse(SOffset)            // soffset
2756      .addImm(ImmOffset)          // offset(imm)
2757      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2758      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2759      .addMemOperand(MMO);
2760 
2761   MI.eraseFromParent();
2762   return true;
2763 }
2764 
// FIXME: Needs a GISelChangeObserver, like the custom legalization path.
2766 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2767                                             MachineRegisterInfo &MRI,
2768                                             MachineIRBuilder &B) const {
2769   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2770   auto IntrID = MI.getIntrinsicID();
2771   switch (IntrID) {
2772   case Intrinsic::amdgcn_if:
2773   case Intrinsic::amdgcn_else: {
2774     MachineInstr *Br = nullptr;
2775     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2776       const SIRegisterInfo *TRI
2777         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2778 
2779       B.setInstr(*BrCond);
2780       Register Def = MI.getOperand(1).getReg();
2781       Register Use = MI.getOperand(3).getReg();
2782 
2783       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2784       if (Br)
2785         BrTarget = Br->getOperand(0).getMBB();
2786 
2787       if (IntrID == Intrinsic::amdgcn_if) {
2788         B.buildInstr(AMDGPU::SI_IF)
2789           .addDef(Def)
2790           .addUse(Use)
2791           .addMBB(BrTarget);
2792       } else {
2793         B.buildInstr(AMDGPU::SI_ELSE)
2794           .addDef(Def)
2795           .addUse(Use)
2796           .addMBB(BrTarget)
2797           .addImm(0);
2798       }
2799 
2800       if (Br)
2801         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2802 
2803       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2804       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2805       MI.eraseFromParent();
2806       BrCond->eraseFromParent();
2807       return true;
2808     }
2809 
2810     return false;
2811   }
2812   case Intrinsic::amdgcn_loop: {
2813     MachineInstr *Br = nullptr;
2814     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2815       const SIRegisterInfo *TRI
2816         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2817 
2818       B.setInstr(*BrCond);
2819 
2820       // FIXME: Need to adjust branch targets based on unconditional branch.
2821       Register Reg = MI.getOperand(2).getReg();
2822       B.buildInstr(AMDGPU::SI_LOOP)
2823         .addUse(Reg)
2824         .addMBB(BrCond->getOperand(1).getMBB());
2825       MI.eraseFromParent();
2826       BrCond->eraseFromParent();
2827       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2828       return true;
2829     }
2830 
2831     return false;
2832   }
2833   case Intrinsic::amdgcn_kernarg_segment_ptr:
2834     return legalizePreloadedArgIntrin(
2835       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2836   case Intrinsic::amdgcn_implicitarg_ptr:
2837     return legalizeImplicitArgPtr(MI, MRI, B);
2838   case Intrinsic::amdgcn_workitem_id_x:
2839     return legalizePreloadedArgIntrin(MI, MRI, B,
2840                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2841   case Intrinsic::amdgcn_workitem_id_y:
2842     return legalizePreloadedArgIntrin(MI, MRI, B,
2843                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2844   case Intrinsic::amdgcn_workitem_id_z:
2845     return legalizePreloadedArgIntrin(MI, MRI, B,
2846                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2847   case Intrinsic::amdgcn_workgroup_id_x:
2848     return legalizePreloadedArgIntrin(MI, MRI, B,
2849                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2850   case Intrinsic::amdgcn_workgroup_id_y:
2851     return legalizePreloadedArgIntrin(MI, MRI, B,
2852                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2853   case Intrinsic::amdgcn_workgroup_id_z:
2854     return legalizePreloadedArgIntrin(MI, MRI, B,
2855                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2856   case Intrinsic::amdgcn_dispatch_ptr:
2857     return legalizePreloadedArgIntrin(MI, MRI, B,
2858                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2859   case Intrinsic::amdgcn_queue_ptr:
2860     return legalizePreloadedArgIntrin(MI, MRI, B,
2861                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2862   case Intrinsic::amdgcn_implicit_buffer_ptr:
2863     return legalizePreloadedArgIntrin(
2864       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2865   case Intrinsic::amdgcn_dispatch_id:
2866     return legalizePreloadedArgIntrin(MI, MRI, B,
2867                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2868   case Intrinsic::amdgcn_fdiv_fast:
2869     return legalizeFDIVFastIntrin(MI, MRI, B);
2870   case Intrinsic::amdgcn_is_shared:
2871     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2872   case Intrinsic::amdgcn_is_private:
2873     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2874   case Intrinsic::amdgcn_wavefrontsize: {
2875     B.setInstr(MI);
2876     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2877     MI.eraseFromParent();
2878     return true;
2879   }
2880   case Intrinsic::amdgcn_raw_buffer_store:
2881   case Intrinsic::amdgcn_struct_buffer_store:
2882     return legalizeBufferStore(MI, MRI, B, false, false);
2883   case Intrinsic::amdgcn_raw_buffer_store_format:
2884   case Intrinsic::amdgcn_struct_buffer_store_format:
2885     return legalizeBufferStore(MI, MRI, B, false, true);
2886   case Intrinsic::amdgcn_raw_tbuffer_store:
2887   case Intrinsic::amdgcn_struct_tbuffer_store:
2888     return legalizeBufferStore(MI, MRI, B, true, true);
2889   case Intrinsic::amdgcn_raw_buffer_load:
2890   case Intrinsic::amdgcn_struct_buffer_load:
2891     return legalizeBufferLoad(MI, MRI, B, false, false);
2892   case Intrinsic::amdgcn_raw_buffer_load_format:
2893   case Intrinsic::amdgcn_struct_buffer_load_format:
2894     return legalizeBufferLoad(MI, MRI, B, true, false);
2895   case Intrinsic::amdgcn_raw_tbuffer_load:
2896   case Intrinsic::amdgcn_struct_tbuffer_load:
2897     return legalizeBufferLoad(MI, MRI, B, true, true);
2898   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2899   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2900   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2901   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2902   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2903   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2904   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2905   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2906   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2907   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2908   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2909   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2910   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2911   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2912   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2913   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2914   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2915   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2916   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2917   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2918   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2919   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2920   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2921   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2922   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2923   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2924     return legalizeBufferAtomic(MI, B, IntrID);
2925   case Intrinsic::amdgcn_atomic_inc:
2926     return legalizeAtomicIncDec(MI, B, true);
2927   case Intrinsic::amdgcn_atomic_dec:
2928     return legalizeAtomicIncDec(MI, B, false);
2929   default:
2930     return true;
2931   }
2932 
2933   return true;
2934 }
2935