1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements up to the next power of two.
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
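// For example, this turns <3 x s32> into <4 x s32> and <5 x s16> into
// <8 x s16>; power-of-two element counts come back unchanged.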
51 
// Round the scalar size up to the next power of two bits.
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
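// For example, s17 and s24 both become s32, and s33 becomes s64.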
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
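// For example, s32, s64 and <2 x s32> satisfy this, while s48 and <4 x s16>
// do not (their scalar/element size is not a multiple of 32 bits).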
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
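// For example, <3 x s16> (48 bits) and <5 x s8> (40 bits) are "small odd"
// vectors, while <4 x s16>, <2 x s16> and <3 x s32> are not.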
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
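// For example, when paired with isSmallOddVector above, this rounds
// <3 x s16> up to <4 x s16>.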
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
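// For example, a 128-bit <4 x s32> maps to <2 x s32> (two 64-bit pieces),
// and a 96-bit <3 x s32> also maps to <2 x s32>.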
110 
// Increase the number of vector elements so the total size becomes the next
// multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
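// For example, <3 x s16> (48 bits) is widened to <4 x s16> (64 bits), and
// <5 x s8> (40 bits) is widened to <8 x s8> (64 bits).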
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
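// For example, s96, <2 x s16> and <3 x s32> are register types, while s24,
// s48 and <3 x s16> are not.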
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
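// For example, this matches a G_STORE that truncates an s64 value down to a
// 32-bit (or narrower) memory access.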
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S96 = LLT::scalar(96);
195   const LLT S128 = LLT::scalar(128);
196   const LLT S256 = LLT::scalar(256);
197   const LLT S1024 = LLT::scalar(1024);
198 
199   const LLT V2S16 = LLT::vector(2, 16);
200   const LLT V4S16 = LLT::vector(4, 16);
201 
202   const LLT V2S32 = LLT::vector(2, 32);
203   const LLT V3S32 = LLT::vector(3, 32);
204   const LLT V4S32 = LLT::vector(4, 32);
205   const LLT V5S32 = LLT::vector(5, 32);
206   const LLT V6S32 = LLT::vector(6, 32);
207   const LLT V7S32 = LLT::vector(7, 32);
208   const LLT V8S32 = LLT::vector(8, 32);
209   const LLT V9S32 = LLT::vector(9, 32);
210   const LLT V10S32 = LLT::vector(10, 32);
211   const LLT V11S32 = LLT::vector(11, 32);
212   const LLT V12S32 = LLT::vector(12, 32);
213   const LLT V13S32 = LLT::vector(13, 32);
214   const LLT V14S32 = LLT::vector(14, 32);
215   const LLT V15S32 = LLT::vector(15, 32);
216   const LLT V16S32 = LLT::vector(16, 32);
217   const LLT V32S32 = LLT::vector(32, 32);
218 
219   const LLT V2S64 = LLT::vector(2, 64);
220   const LLT V3S64 = LLT::vector(3, 64);
221   const LLT V4S64 = LLT::vector(4, 64);
222   const LLT V5S64 = LLT::vector(5, 64);
223   const LLT V6S64 = LLT::vector(6, 64);
224   const LLT V7S64 = LLT::vector(7, 64);
225   const LLT V8S64 = LLT::vector(8, 64);
226   const LLT V16S64 = LLT::vector(16, 64);
227 
228   std::initializer_list<LLT> AllS32Vectors =
229     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
230      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
231   std::initializer_list<LLT> AllS64Vectors =
232     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
233 
234   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
235   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
236   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
237   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
238   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
239   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
240   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
241 
242   const LLT CodePtr = FlatPtr;
243 
244   const std::initializer_list<LLT> AddrSpaces64 = {
245     GlobalPtr, ConstantPtr, FlatPtr
246   };
247 
248   const std::initializer_list<LLT> AddrSpaces32 = {
249     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
250   };
251 
252   const std::initializer_list<LLT> FPTypesBase = {
253     S32, S64
254   };
255 
256   const std::initializer_list<LLT> FPTypes16 = {
257     S32, S64, S16
258   };
259 
260   const std::initializer_list<LLT> FPTypesPK16 = {
261     S32, S64, S16, V2S16
262   };
263 
264   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
265 
266   setAction({G_BRCOND, S1}, Legal); // VCC branches
267   setAction({G_BRCOND, S32}, Legal); // SCC branches
268 
269   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
270   // elements for v3s16
271   getActionDefinitionsBuilder(G_PHI)
272     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
273     .legalFor(AllS32Vectors)
274     .legalFor(AllS64Vectors)
275     .legalFor(AddrSpaces64)
276     .legalFor(AddrSpaces32)
277     .clampScalar(0, S32, S256)
278     .widenScalarToNextPow2(0, 32)
279     .clampMaxNumElements(0, S32, 16)
280     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
281     .legalIf(isPointer(0));
282 
283   if (ST.has16BitInsts()) {
284     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
285       .legalFor({S32, S16})
286       .clampScalar(0, S16, S32)
287       .scalarize(0);
288   } else {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32})
291       .clampScalar(0, S32, S32)
292       .scalarize(0);
293   }
294 
295   // FIXME: Not really legal. Placeholder for custom lowering.
296   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
297     .legalFor({S32, S64})
298     .clampScalar(0, S32, S64)
299     .widenScalarToNextPow2(0, 32)
300     .scalarize(0);
301 
302   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
303     .legalFor({S32})
304     .clampScalar(0, S32, S32)
305     .scalarize(0);
306 
307   // Report legal for any types we can handle anywhere. For the cases only legal
308   // on the SALU, RegBankSelect will be able to re-legalize.
309   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
310     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
311     .clampScalar(0, S32, S64)
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
314     .widenScalarToNextPow2(0)
315     .scalarize(0);
316 
317   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
318                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
319     .legalFor({{S32, S1}, {S32, S32}})
320     .clampScalar(0, S32, S32)
321     .scalarize(0); // TODO: Implement.
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
341     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344     .clampScalarOrElt(0, S32, S1024)
345     .legalIf(isMultiple32(0))
346     .widenScalarToNextPow2(0, 32)
347     .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
351     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
352   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
353 
354   auto &FPOpActions = getActionDefinitionsBuilder(
355     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
356     .legalFor({S32, S64});
357   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
358     .customFor({S32, S64});
359   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
360     .customFor({S32, S64});
361 
362   if (ST.has16BitInsts()) {
363     if (ST.hasVOP3PInsts())
364       FPOpActions.legalFor({S16, V2S16});
365     else
366       FPOpActions.legalFor({S16});
367 
368     TrigActions.customFor({S16});
369     FDIVActions.customFor({S16});
370   }
371 
372   auto &MinNumMaxNum = getActionDefinitionsBuilder({
373       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
374 
375   if (ST.hasVOP3PInsts()) {
376     MinNumMaxNum.customFor(FPTypesPK16)
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampMaxNumElements(0, S16, 2)
379       .clampScalar(0, S16, S64)
380       .scalarize(0);
381   } else if (ST.has16BitInsts()) {
382     MinNumMaxNum.customFor(FPTypes16)
383       .clampScalar(0, S16, S64)
384       .scalarize(0);
385   } else {
386     MinNumMaxNum.customFor(FPTypesBase)
387       .clampScalar(0, S32, S64)
388       .scalarize(0);
389   }
390 
391   if (ST.hasVOP3PInsts())
392     FPOpActions.clampMaxNumElements(0, S16, 2);
393 
394   FPOpActions
395     .scalarize(0)
396     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
397 
398   TrigActions
399     .scalarize(0)
400     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
401 
402   FDIVActions
403     .scalarize(0)
404     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
405 
406   getActionDefinitionsBuilder({G_FNEG, G_FABS})
407     .legalFor(FPTypesPK16)
408     .clampMaxNumElements(0, S16, 2)
409     .scalarize(0)
410     .clampScalar(0, S16, S64);
411 
412   if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
414       .legalFor({S32, S64, S16})
415       .scalarize(0)
416       .clampScalar(0, S16, S64);
417   } else {
418     getActionDefinitionsBuilder(G_FSQRT)
419       .legalFor({S32, S64})
420       .scalarize(0)
421       .clampScalar(0, S32, S64);
422 
423     if (ST.hasFractBug()) {
424       getActionDefinitionsBuilder(G_FFLOOR)
425         .customFor({S64})
426         .legalFor({S32, S64})
427         .scalarize(0)
428         .clampScalar(0, S32, S64);
429     } else {
430       getActionDefinitionsBuilder(G_FFLOOR)
431         .legalFor({S32, S64})
432         .scalarize(0)
433         .clampScalar(0, S32, S64);
434     }
435   }
436 
437   getActionDefinitionsBuilder(G_FPTRUNC)
438     .legalFor({{S32, S64}, {S16, S32}})
439     .scalarize(0);
440 
441   getActionDefinitionsBuilder(G_FPEXT)
442     .legalFor({{S64, S32}, {S32, S16}})
443     .lowerFor({{S64, S16}}) // FIXME: Implement
444     .scalarize(0);
445 
446   getActionDefinitionsBuilder(G_FSUB)
447       // Use actual fsub instruction
448       .legalFor({S32})
449       // Must use fadd + fneg
450       .lowerFor({S64, S16, V2S16})
451       .scalarize(0)
452       .clampScalar(0, S32, S64);
453 
454   // Whether this is legal depends on the floating point mode for the function.
455   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
456   if (ST.hasMadF16())
457     FMad.customFor({S32, S16});
458   else
459     FMad.customFor({S32});
460   FMad.scalarize(0)
461       .lower();
462 
463   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
464     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
465                {S32, S1}, {S64, S1}, {S16, S1}})
466     .scalarize(0)
467     .clampScalar(0, S32, S64);
468 
469   // TODO: Split s1->s64 during regbankselect for VALU.
470   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
471     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
472     .lowerFor({{S32, S64}})
473     .lowerIf(typeIs(1, S1))
474     .customFor({{S64, S64}});
475   if (ST.has16BitInsts())
476     IToFP.legalFor({{S16, S16}});
477   IToFP.clampScalar(1, S32, S64)
478        .scalarize(0);
479 
480   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
481     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
482     .customFor({{S64, S64}});
483   if (ST.has16BitInsts())
484     FPToI.legalFor({{S16, S16}});
485   else
486     FPToI.minScalar(1, S32);
487 
488   FPToI.minScalar(0, S32)
489        .scalarize(0)
490        .lower();
491 
492   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
493     .scalarize(0)
494     .lower();
495 
496   if (ST.has16BitInsts()) {
497     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
498       .legalFor({S16, S32, S64})
499       .clampScalar(0, S16, S64)
500       .scalarize(0);
501   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
502     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
503       .legalFor({S32, S64})
504       .clampScalar(0, S32, S64)
505       .scalarize(0);
506   } else {
507     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
508       .legalFor({S32})
509       .customFor({S64})
510       .clampScalar(0, S32, S64)
511       .scalarize(0);
512   }
513 
514   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
515     .scalarize(0)
516     .alwaysLegal();
517 
518   auto &CmpBuilder =
519     getActionDefinitionsBuilder(G_ICMP)
520     // The compare output type differs based on the register bank of the output,
521     // so make both s1 and s32 legal.
522     //
523     // Scalar compares producing output in scc will be promoted to s32, as that
524     // is the allocatable register type that will be needed for the copy from
    // scc. This promotion happens during RegBankSelect, and we assume that
    // nothing before then will try to use the s32 result type.
527     //
528     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
529     // bank.
530     .legalForCartesianProduct(
531       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
532     .legalForCartesianProduct(
533       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
534   if (ST.has16BitInsts()) {
535     CmpBuilder.legalFor({{S1, S16}});
536   }
537 
538   CmpBuilder
539     .widenScalarToNextPow2(1)
540     .clampScalar(1, S32, S64)
541     .scalarize(0)
542     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
543 
544   getActionDefinitionsBuilder(G_FCMP)
545     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
546     .widenScalarToNextPow2(1)
547     .clampScalar(1, S32, S64)
548     .scalarize(0);
549 
550   // FIXME: fpow has a selection pattern that should move to custom lowering.
551   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
552   if (ST.has16BitInsts())
553     Exp2Ops.legalFor({S32, S16});
554   else
555     Exp2Ops.legalFor({S32});
556   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
557   Exp2Ops.scalarize(0);
558 
559   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
560   if (ST.has16BitInsts())
561     ExpOps.customFor({{S32}, {S16}});
562   else
563     ExpOps.customFor({S32});
564   ExpOps.clampScalar(0, MinScalarFPTy, S32)
565         .scalarize(0);
566 
567   // The 64-bit versions produce 32-bit results, but only on the SALU.
568   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
569                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
570                                G_CTPOP})
571     .legalFor({{S32, S32}, {S32, S64}})
572     .clampScalar(0, S32, S32)
573     .clampScalar(1, S32, S64)
574     .scalarize(0)
575     .widenScalarToNextPow2(0, 32)
576     .widenScalarToNextPow2(1, 32);
577 
578   // TODO: Expand for > s32
579   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
580     .legalFor({S32})
581     .clampScalar(0, S32, S32)
582     .scalarize(0);
583 
584   if (ST.has16BitInsts()) {
585     if (ST.hasVOP3PInsts()) {
586       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
587         .legalFor({S32, S16, V2S16})
588         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
589         .clampMaxNumElements(0, S16, 2)
590         .clampScalar(0, S16, S32)
591         .widenScalarToNextPow2(0)
592         .scalarize(0);
593     } else {
594       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
595         .legalFor({S32, S16})
596         .widenScalarToNextPow2(0)
597         .clampScalar(0, S16, S32)
598         .scalarize(0);
599     }
600   } else {
601     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
602       .legalFor({S32})
603       .clampScalar(0, S32, S32)
604       .widenScalarToNextPow2(0)
605       .scalarize(0);
606   }
607 
608   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
609     return [=](const LegalityQuery &Query) {
610       return Query.Types[TypeIdx0].getSizeInBits() <
611              Query.Types[TypeIdx1].getSizeInBits();
612     };
613   };
614 
615   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
616     return [=](const LegalityQuery &Query) {
617       return Query.Types[TypeIdx0].getSizeInBits() >
618              Query.Types[TypeIdx1].getSizeInBits();
619     };
620   };
621 
622   getActionDefinitionsBuilder(G_INTTOPTR)
623     // List the common cases
624     .legalForCartesianProduct(AddrSpaces64, {S64})
625     .legalForCartesianProduct(AddrSpaces32, {S32})
626     .scalarize(0)
627     // Accept any address space as long as the size matches
628     .legalIf(sameSize(0, 1))
629     .widenScalarIf(smallerThan(1, 0),
630       [](const LegalityQuery &Query) {
631         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
632       })
633     .narrowScalarIf(greaterThan(1, 0),
634       [](const LegalityQuery &Query) {
635         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
636       });
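  // Note: with the rules above, an inttoptr from s32 to a 64-bit flat pointer
  // first widens the integer operand to s64, and an s128 source is narrowed
  // to s64, after which the sameSize rule applies.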
637 
638   getActionDefinitionsBuilder(G_PTRTOINT)
639     // List the common cases
640     .legalForCartesianProduct(AddrSpaces64, {S64})
641     .legalForCartesianProduct(AddrSpaces32, {S32})
642     .scalarize(0)
643     // Accept any address space as long as the size matches
644     .legalIf(sameSize(0, 1))
645     .widenScalarIf(smallerThan(0, 1),
646       [](const LegalityQuery &Query) {
647         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
648       })
649     .narrowScalarIf(
650       greaterThan(0, 1),
651       [](const LegalityQuery &Query) {
652         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
653       });
654 
655   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
656     .scalarize(0)
657     .custom();
658 
659   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
660   // handle some operations by just promoting the register during
661   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
662   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
663     switch (AS) {
664     // FIXME: Private element size.
665     case AMDGPUAS::PRIVATE_ADDRESS:
666       return 32;
667     // FIXME: Check subtarget
668     case AMDGPUAS::LOCAL_ADDRESS:
669       return ST.useDS128() ? 128 : 64;
670 
671     // Treat constant and global as identical. SMRD loads are sometimes usable
672     // for global loads (ideally constant address space should be eliminated)
673     // depending on the context. Legality cannot be context dependent, but
674     // RegBankSelect can split the load as necessary depending on the pointer
675     // register bank/uniformity and if the memory is invariant or not written in
676     // a kernel.
677     case AMDGPUAS::CONSTANT_ADDRESS:
678     case AMDGPUAS::GLOBAL_ADDRESS:
679       return IsLoad ? 512 : 128;
680     default:
681       return 128;
682     }
683   };
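  // For example, with these limits a 256-bit load from the global or constant
  // address space is not split for size reasons (load limit 512 bits), while
  // a 256-bit global store exceeds the 128-bit store limit and must be
  // broken up.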
684 
685   const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
686     const LLT DstTy = Query.Types[0];
687 
688     // Split vector extloads.
689     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
690     unsigned Align = Query.MMODescrs[0].AlignInBits;
691 
692     if (MemSize < DstTy.getSizeInBits())
693       MemSize = std::max(MemSize, Align);
694 
695     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
696       return true;
697 
698     const LLT PtrTy = Query.Types[1];
699     unsigned AS = PtrTy.getAddressSpace();
700     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
701       return true;
702 
703     // Catch weird sized loads that don't evenly divide into the access sizes
704     // TODO: May be able to widen depending on alignment etc.
705     unsigned NumRegs = MemSize / 32;
706     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
707       return true;
708 
709     if (Align < MemSize) {
710       const SITargetLowering *TLI = ST.getTargetLowering();
711       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
712     }
713 
714     return false;
715   };
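  // For example, a 96-bit (3-dword) access is split on subtargets without
  // dwordx3 load/store support, and an access whose alignment (in bits) is
  // below its size is split unless the target reports that the misaligned
  // access is allowed.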
716 
717   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
718   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
719   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
720 
721   // TODO: Refine based on subtargets which support unaligned access or 128-bit
722   // LDS
723   // TODO: Unsupported flat for SI.
724 
725   for (unsigned Op : {G_LOAD, G_STORE}) {
726     const bool IsStore = Op == G_STORE;
727 
728     auto &Actions = getActionDefinitionsBuilder(Op);
729     // Whitelist the common cases.
730     // TODO: Pointer loads
731     // TODO: Wide constant loads
732     // TODO: Only CI+ has 3x loads
733     // TODO: Loads to s16 on gfx9
734     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
735                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
736                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
737                                       {S96, GlobalPtr, 96, GlobalAlign32},
738                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
739                                       {S128, GlobalPtr, 128, GlobalAlign32},
740                                       {S64, GlobalPtr, 64, GlobalAlign32},
741                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
742                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
743                                       {S32, GlobalPtr, 8, GlobalAlign8},
744                                       {S32, GlobalPtr, 16, GlobalAlign16},
745 
746                                       {S32, LocalPtr, 32, 32},
747                                       {S64, LocalPtr, 64, 32},
748                                       {V2S32, LocalPtr, 64, 32},
749                                       {S32, LocalPtr, 8, 8},
750                                       {S32, LocalPtr, 16, 16},
751                                       {V2S16, LocalPtr, 32, 32},
752 
753                                       {S32, PrivatePtr, 32, 32},
754                                       {S32, PrivatePtr, 8, 8},
755                                       {S32, PrivatePtr, 16, 16},
756                                       {V2S16, PrivatePtr, 32, 32},
757 
758                                       {S32, FlatPtr, 32, GlobalAlign32},
759                                       {S32, FlatPtr, 16, GlobalAlign16},
760                                       {S32, FlatPtr, 8, GlobalAlign8},
761                                       {V2S16, FlatPtr, 32, GlobalAlign32},
762 
763                                       {S32, ConstantPtr, 32, GlobalAlign32},
764                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
765                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
766                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
767                                       {S64, ConstantPtr, 64, GlobalAlign32},
768                                       {S128, ConstantPtr, 128, GlobalAlign32},
769                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
770     Actions
771         .customIf(typeIs(1, Constant32Ptr))
772         .narrowScalarIf(
773             [=](const LegalityQuery &Query) -> bool {
774               return !Query.Types[0].isVector() &&
775                      needToSplitMemOp(Query, Op == G_LOAD);
776             },
777             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
778               const LLT DstTy = Query.Types[0];
779               const LLT PtrTy = Query.Types[1];
780 
781               const unsigned DstSize = DstTy.getSizeInBits();
782               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
783 
784               // Split extloads.
785               if (DstSize > MemSize)
786                 return std::make_pair(0, LLT::scalar(MemSize));
787 
788               if (DstSize > 32 && (DstSize % 32 != 0)) {
789                 // FIXME: Need a way to specify non-extload of larger size if
790                 // suitably aligned.
791                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
792               }
793 
794               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
795                                                      Op == G_LOAD);
796               if (MemSize > MaxSize)
797                 return std::make_pair(0, LLT::scalar(MaxSize));
798 
799               unsigned Align = Query.MMODescrs[0].AlignInBits;
800               return std::make_pair(0, LLT::scalar(Align));
801             })
802         .fewerElementsIf(
803             [=](const LegalityQuery &Query) -> bool {
804               return Query.Types[0].isVector() &&
805                      needToSplitMemOp(Query, Op == G_LOAD);
806             },
807             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
808               const LLT DstTy = Query.Types[0];
809               const LLT PtrTy = Query.Types[1];
810 
811               LLT EltTy = DstTy.getElementType();
812               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
813                                                      Op == G_LOAD);
814 
815               // Split if it's too large for the address space.
816               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
817                 unsigned NumElts = DstTy.getNumElements();
818                 unsigned EltSize = EltTy.getSizeInBits();
819 
820                 if (MaxSize % EltSize == 0) {
821                   return std::make_pair(
822                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
823                 }
824 
825                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
826 
827                 // FIXME: Refine when odd breakdowns handled
828                 // The scalars will need to be re-legalized.
829                 if (NumPieces == 1 || NumPieces >= NumElts ||
830                     NumElts % NumPieces != 0)
831                   return std::make_pair(0, EltTy);
832 
833                 return std::make_pair(0,
834                                       LLT::vector(NumElts / NumPieces, EltTy));
835               }
836 
837               // Need to split because of alignment.
838               unsigned Align = Query.MMODescrs[0].AlignInBits;
839               unsigned EltSize = EltTy.getSizeInBits();
840               if (EltSize > Align &&
841                   (EltSize / Align < DstTy.getNumElements())) {
842                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
843               }
844 
845               // May need relegalization for the scalars.
846               return std::make_pair(0, EltTy);
847             })
848         .minScalar(0, S32);
849 
850     if (IsStore)
851       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
852 
853     // TODO: Need a bitcast lower option?
854     Actions
855         .legalIf([=](const LegalityQuery &Query) {
856           const LLT Ty0 = Query.Types[0];
857           unsigned Size = Ty0.getSizeInBits();
858           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
859           unsigned Align = Query.MMODescrs[0].AlignInBits;
860 
861           // FIXME: Widening store from alignment not valid.
862           if (MemSize < Size)
863             MemSize = std::max(MemSize, Align);
864 
865           // No extending vector loads.
866           if (Size > MemSize && Ty0.isVector())
867             return false;
868 
869           switch (MemSize) {
870           case 8:
871           case 16:
872             return Size == 32;
873           case 32:
874           case 64:
875           case 128:
876             return true;
877           case 96:
878             return ST.hasDwordx3LoadStores();
879           case 256:
880           case 512:
881             return true;
882           default:
883             return false;
884           }
885         })
886         .widenScalarToNextPow2(0)
887         // TODO: v3s32->v4s32 with alignment
888         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
889   }
890 
891   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
892                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
893                                                   {S32, GlobalPtr, 16, 2 * 8},
894                                                   {S32, LocalPtr, 8, 8},
895                                                   {S32, LocalPtr, 16, 16},
896                                                   {S32, PrivatePtr, 8, 8},
897                                                   {S32, PrivatePtr, 16, 16},
898                                                   {S32, ConstantPtr, 8, 8},
899                                                   {S32, ConstantPtr, 16, 2 * 8}});
900   if (ST.hasFlatAddressSpace()) {
901     ExtLoads.legalForTypesWithMemDesc(
902         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
903   }
904 
905   ExtLoads.clampScalar(0, S32, S32)
906           .widenScalarToNextPow2(0)
907           .unsupportedIfMemSizeNotPow2()
908           .lower();
909 
910   auto &Atomics = getActionDefinitionsBuilder(
911     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
912      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
913      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
914      G_ATOMICRMW_UMIN})
915     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
916                {S64, GlobalPtr}, {S64, LocalPtr}});
917   if (ST.hasFlatAddressSpace()) {
918     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
919   }
920 
921   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
922     .legalFor({{S32, LocalPtr}});
923 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
926   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
927     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
928                 {S32, FlatPtr}, {S64, FlatPtr}})
929     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
930                {S32, RegionPtr}, {S64, RegionPtr}});
931   // TODO: Pointer types, any 32-bit or 64-bit vector
932 
933   // Condition should be s32 for scalar, s1 for vector.
934   getActionDefinitionsBuilder(G_SELECT)
935     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
936           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
937           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
938     .clampScalar(0, S16, S64)
939     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
940     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
941     .scalarize(1)
942     .clampMaxNumElements(0, S32, 2)
943     .clampMaxNumElements(0, LocalPtr, 2)
944     .clampMaxNumElements(0, PrivatePtr, 2)
945     .scalarize(0)
946     .widenScalarToNextPow2(0)
947     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
948 
949   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
950   // be more flexible with the shift amount type.
951   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
952     .legalFor({{S32, S32}, {S64, S32}});
953   if (ST.has16BitInsts()) {
954     if (ST.hasVOP3PInsts()) {
955       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
956             .clampMaxNumElements(0, S16, 2);
957     } else
958       Shifts.legalFor({{S16, S32}, {S16, S16}});
959 
960     // TODO: Support 16-bit shift amounts
961     Shifts.clampScalar(1, S32, S32);
962     Shifts.clampScalar(0, S16, S64);
963     Shifts.widenScalarToNextPow2(0, 16);
964   } else {
965     // Make sure we legalize the shift amount type first, as the general
966     // expansion for the shifted type will produce much worse code if it hasn't
967     // been truncated already.
968     Shifts.clampScalar(1, S32, S32);
969     Shifts.clampScalar(0, S32, S64);
970     Shifts.widenScalarToNextPow2(0, 32);
971   }
972   Shifts.scalarize(0);
973 
974   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
975     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
976     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
977     unsigned IdxTypeIdx = 2;
978 
979     getActionDefinitionsBuilder(Op)
980       .customIf([=](const LegalityQuery &Query) {
981           const LLT EltTy = Query.Types[EltTypeIdx];
982           const LLT VecTy = Query.Types[VecTypeIdx];
983           const LLT IdxTy = Query.Types[IdxTypeIdx];
984           return (EltTy.getSizeInBits() == 16 ||
985                   EltTy.getSizeInBits() % 32 == 0) &&
986                  VecTy.getSizeInBits() % 32 == 0 &&
987                  VecTy.getSizeInBits() <= 1024 &&
988                  IdxTy.getSizeInBits() == 32;
989         })
990       .clampScalar(EltTypeIdx, S32, S64)
991       .clampScalar(VecTypeIdx, S32, S64)
992       .clampScalar(IdxTypeIdx, S32, S32);
993   }
994 
995   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
996     .unsupportedIf([=](const LegalityQuery &Query) {
997         const LLT &EltTy = Query.Types[1].getElementType();
998         return Query.Types[0] != EltTy;
999       });
1000 
1001   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1002     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1003     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1004 
1005     // FIXME: Doesn't handle extract of illegal sizes.
1006     getActionDefinitionsBuilder(Op)
1007       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1008       // FIXME: Multiples of 16 should not be legal.
1009       .legalIf([=](const LegalityQuery &Query) {
1010           const LLT BigTy = Query.Types[BigTyIdx];
1011           const LLT LitTy = Query.Types[LitTyIdx];
1012           return (BigTy.getSizeInBits() % 32 == 0) &&
1013                  (LitTy.getSizeInBits() % 16 == 0);
1014         })
1015       .widenScalarIf(
1016         [=](const LegalityQuery &Query) {
1017           const LLT BigTy = Query.Types[BigTyIdx];
1018           return (BigTy.getScalarSizeInBits() < 16);
1019         },
1020         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1021       .widenScalarIf(
1022         [=](const LegalityQuery &Query) {
1023           const LLT LitTy = Query.Types[LitTyIdx];
1024           return (LitTy.getScalarSizeInBits() < 16);
1025         },
1026         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1027       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1028       .widenScalarToNextPow2(BigTyIdx, 32);
1029 
1030   }
1031 
1032   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1033     .legalForCartesianProduct(AllS32Vectors, {S32})
1034     .legalForCartesianProduct(AllS64Vectors, {S64})
1035     .clampNumElements(0, V16S32, V32S32)
1036     .clampNumElements(0, V2S64, V16S64)
1037     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1038 
1039   if (ST.hasScalarPackInsts()) {
1040     BuildVector
1041       // FIXME: Should probably widen s1 vectors straight to s32
1042       .minScalarOrElt(0, S16)
1043       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1044       .minScalar(1, S32);
1045 
1046     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1047       .legalFor({V2S16, S32})
1048       .lower();
1050   } else {
1051     BuildVector.customFor({V2S16, S16});
1052     BuildVector.minScalarOrElt(0, S32);
1053 
1054     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1055       .customFor({V2S16, S32})
1056       .lower();
1057   }
1058 
1059   BuildVector.legalIf(isRegisterType(0));
1060 
1061   // FIXME: Clamp maximum size
1062   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1063     .legalIf(isRegisterType(0));
1064 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1066   // pre-legalize.
1067   if (ST.hasVOP3PInsts()) {
1068     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1069       .customFor({V2S16, V2S16})
1070       .lower();
1071   } else
1072     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1073 
1074   // Merge/Unmerge
1075   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1076     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1077     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1078 
1079     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1080       const LLT &Ty = Query.Types[TypeIdx];
1081       if (Ty.isVector()) {
1082         const LLT &EltTy = Ty.getElementType();
1083         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1084           return true;
1085         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1086           return true;
1087       }
1088       return false;
1089     };
1090 
1091     auto &Builder = getActionDefinitionsBuilder(Op)
1092       // Try to widen to s16 first for small types.
1093       // TODO: Only do this on targets with legal s16 shifts
1094       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1095 
1096       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1097       .lowerFor({{S16, V2S16}})
1098       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1099       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1100                            elementTypeIs(1, S16)),
1101                        changeTo(1, V2S16))
1102       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1103       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1104       // valid.
1105       .clampScalar(LitTyIdx, S32, S256)
1106       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1107       // Break up vectors with weird elements into scalars
1108       .fewerElementsIf(
1109         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1110         scalarize(0))
1111       .fewerElementsIf(
1112         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1113         scalarize(1))
1114       .clampScalar(BigTyIdx, S32, S1024);
1115 
1116     if (Op == G_MERGE_VALUES) {
1117       Builder.widenScalarIf(
1118         // TODO: Use 16-bit shifts if legal for 8-bit values?
1119         [=](const LegalityQuery &Query) {
1120           const LLT Ty = Query.Types[LitTyIdx];
1121           return Ty.getSizeInBits() < 32;
1122         },
1123         changeTo(LitTyIdx, S32));
1124     }
1125 
1126     Builder.widenScalarIf(
1127       [=](const LegalityQuery &Query) {
1128         const LLT Ty = Query.Types[BigTyIdx];
1129         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1130           Ty.getSizeInBits() % 16 != 0;
1131       },
1132       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
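        // For example, a 65-bit scalar widens to s128, while a 260-bit scalar
        // widens to s320 (the next multiple of 64) rather than s512.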
1135         const LLT &Ty = Query.Types[BigTyIdx];
1136         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1137         if (NewSizeInBits >= 256) {
1138           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1139           if (RoundedTo < NewSizeInBits)
1140             NewSizeInBits = RoundedTo;
1141         }
1142         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1143       })
1144       .legalIf([=](const LegalityQuery &Query) {
1145           const LLT &BigTy = Query.Types[BigTyIdx];
1146           const LLT &LitTy = Query.Types[LitTyIdx];
1147 
1148           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1149             return false;
1150           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1151             return false;
1152 
1153           return BigTy.getSizeInBits() % 16 == 0 &&
1154                  LitTy.getSizeInBits() % 16 == 0 &&
1155                  BigTy.getSizeInBits() <= 1024;
1156         })
1157       // Any vectors left are the wrong size. Scalarize them.
1158       .scalarize(0)
1159       .scalarize(1);
1160   }
1161 
1162   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1163   // RegBankSelect.
1164   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1165     .legalFor({{S32}, {S64}});
1166 
1167   if (ST.hasVOP3PInsts()) {
1168     SextInReg.lowerFor({{V2S16}})
1169       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1170       // get more vector shift opportunities, since we'll get those when
1171       // expanded.
1172       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1173   } else if (ST.has16BitInsts()) {
1174     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1175   } else {
1176     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1178     SextInReg.lowerFor({{S32}, {S64}});
1179   }
1180 
1181   SextInReg
1182     .scalarize(0)
1183     .clampScalar(0, S32, S64)
1184     .lower();
1185 
1186   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1187     .legalFor({S64});
1188 
1189   getActionDefinitionsBuilder({
1190       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1191       G_FCOPYSIGN,
1192 
1193       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1194       G_READ_REGISTER,
1195       G_WRITE_REGISTER,
1196 
1197       G_SADDO, G_SSUBO,
1198 
1199        // TODO: Implement
1200       G_FMINIMUM, G_FMAXIMUM
1201     }).lower();
1202 
1203   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1204         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1205         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1206     .unsupported();
1207 
1208   computeTables();
1209   verify(*ST.getInstrInfo());
1210 }
1211 
1212 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1213                                          MachineRegisterInfo &MRI,
1214                                          MachineIRBuilder &B,
1215                                          GISelChangeObserver &Observer) const {
1216   switch (MI.getOpcode()) {
1217   case TargetOpcode::G_ADDRSPACE_CAST:
1218     return legalizeAddrSpaceCast(MI, MRI, B);
1219   case TargetOpcode::G_FRINT:
1220     return legalizeFrint(MI, MRI, B);
1221   case TargetOpcode::G_FCEIL:
1222     return legalizeFceil(MI, MRI, B);
1223   case TargetOpcode::G_INTRINSIC_TRUNC:
1224     return legalizeIntrinsicTrunc(MI, MRI, B);
1225   case TargetOpcode::G_SITOFP:
1226     return legalizeITOFP(MI, MRI, B, true);
1227   case TargetOpcode::G_UITOFP:
1228     return legalizeITOFP(MI, MRI, B, false);
1229   case TargetOpcode::G_FPTOSI:
1230     return legalizeFPTOI(MI, MRI, B, true);
1231   case TargetOpcode::G_FPTOUI:
1232     return legalizeFPTOI(MI, MRI, B, false);
1233   case TargetOpcode::G_FMINNUM:
1234   case TargetOpcode::G_FMAXNUM:
1235   case TargetOpcode::G_FMINNUM_IEEE:
1236   case TargetOpcode::G_FMAXNUM_IEEE:
1237     return legalizeMinNumMaxNum(MI, MRI, B);
1238   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1239     return legalizeExtractVectorElt(MI, MRI, B);
1240   case TargetOpcode::G_INSERT_VECTOR_ELT:
1241     return legalizeInsertVectorElt(MI, MRI, B);
1242   case TargetOpcode::G_SHUFFLE_VECTOR:
1243     return legalizeShuffleVector(MI, MRI, B);
1244   case TargetOpcode::G_FSIN:
1245   case TargetOpcode::G_FCOS:
1246     return legalizeSinCos(MI, MRI, B);
1247   case TargetOpcode::G_GLOBAL_VALUE:
1248     return legalizeGlobalValue(MI, MRI, B);
1249   case TargetOpcode::G_LOAD:
1250     return legalizeLoad(MI, MRI, B, Observer);
1251   case TargetOpcode::G_FMAD:
1252     return legalizeFMad(MI, MRI, B);
1253   case TargetOpcode::G_FDIV:
1254     return legalizeFDIV(MI, MRI, B);
1255   case TargetOpcode::G_ATOMIC_CMPXCHG:
1256     return legalizeAtomicCmpXChg(MI, MRI, B);
1257   case TargetOpcode::G_FLOG:
1258     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1259   case TargetOpcode::G_FLOG10:
1260     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1261   case TargetOpcode::G_FEXP:
1262     return legalizeFExp(MI, B);
1263   case TargetOpcode::G_FFLOOR:
1264     return legalizeFFloor(MI, MRI, B);
1265   case TargetOpcode::G_BUILD_VECTOR:
1266     return legalizeBuildVector(MI, MRI, B);
1267   default:
1268     return false;
1269   }
1270 
1271   llvm_unreachable("expected switch to return");
1272 }
1273 
1274 Register AMDGPULegalizerInfo::getSegmentAperture(
1275   unsigned AS,
1276   MachineRegisterInfo &MRI,
1277   MachineIRBuilder &B) const {
1278   MachineFunction &MF = B.getMF();
1279   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1280   const LLT S32 = LLT::scalar(32);
1281 
1282   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1283 
1284   if (ST.hasApertureRegs()) {
1285     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1286     // getreg.
1287     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1288         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1289         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1290     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1291         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1292         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1293     unsigned Encoding =
1294         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1295         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1296         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1297 
1298     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1299 
1300     B.buildInstr(AMDGPU::S_GETREG_B32)
1301       .addDef(GetReg)
1302       .addImm(Encoding);
1303     MRI.setType(GetReg, S32);
1304 
1305     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1306     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1307   }
1308 
1309   Register QueuePtr = MRI.createGenericVirtualRegister(
1310     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1311 
1312   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1313   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1314     return Register();
1315 
1316   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1317   // private_segment_aperture_base_hi.
1318   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1319 
1320   // TODO: can we be smarter about machine pointer info?
1321   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1322   MachineMemOperand *MMO = MF.getMachineMemOperand(
1323     PtrInfo,
1324     MachineMemOperand::MOLoad |
1325     MachineMemOperand::MODereferenceable |
1326     MachineMemOperand::MOInvariant,
1327     4,
1328     MinAlign(64, StructOffset));
1329 
1330   Register LoadAddr;
1331 
1332   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1333   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1334 }
1335 
1336 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1337   MachineInstr &MI, MachineRegisterInfo &MRI,
1338   MachineIRBuilder &B) const {
1339   MachineFunction &MF = B.getMF();
1340 
1341   B.setInstr(MI);
1342 
1343   const LLT S32 = LLT::scalar(32);
1344   Register Dst = MI.getOperand(0).getReg();
1345   Register Src = MI.getOperand(1).getReg();
1346 
1347   LLT DstTy = MRI.getType(Dst);
1348   LLT SrcTy = MRI.getType(Src);
1349   unsigned DestAS = DstTy.getAddressSpace();
1350   unsigned SrcAS = SrcTy.getAddressSpace();
1351 
1352   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1353   // vector element.
1354   assert(!DstTy.isVector());
1355 
1356   const AMDGPUTargetMachine &TM
1357     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1358 
1359   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1360   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1361     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1362     return true;
1363   }
1364 
1365   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1366     // Truncate.
1367     B.buildExtract(Dst, Src, 0);
1368     MI.eraseFromParent();
1369     return true;
1370   }
1371 
1372   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1373     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1374     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1375 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1379     auto HighAddr = B.buildConstant(
1380       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1381     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1382     MI.eraseFromParent();
1383     return true;
1384   }
1385 
1386   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1387     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1388            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1389     unsigned NullVal = TM.getNullPointerValue(DestAS);
1390 
1391     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1392     auto FlatNull = B.buildConstant(SrcTy, 0);
1393 
1394     // Extract low 32-bits of the pointer.
1395     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1396 
1397     auto CmpRes =
1398         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1399     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1400 
1401     MI.eraseFromParent();
1402     return true;
1403   }
1404 
1405   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1406     return false;
1407 
1408   if (!ST.hasFlatAddressSpace())
1409     return false;
1410 
1411   auto SegmentNull =
1412       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1413   auto FlatNull =
1414       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1415 
1416   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1417   if (!ApertureReg.isValid())
1418     return false;
1419 
1420   auto CmpRes =
1421       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1422 
1423   // Coerce the type of the low half of the result so we can use merge_values.
1424   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1425 
1426   // TODO: Should we allow mismatched types but matching sizes in merges to
1427   // avoid the ptrtoint?
1428   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1429   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1430 
1431   MI.eraseFromParent();
1432   return true;
1433 }
1434 
1435 bool AMDGPULegalizerInfo::legalizeFrint(
1436   MachineInstr &MI, MachineRegisterInfo &MRI,
1437   MachineIRBuilder &B) const {
1438   B.setInstr(MI);
1439 
1440   Register Src = MI.getOperand(1).getReg();
1441   LLT Ty = MRI.getType(Src);
1442   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1443 
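  // C1 is 2^52: adding and then subtracting copysign(2^52, x) forces the
  // fractional bits to be rounded away in double precision. C2 is the largest
  // double below 2^52; any input with a larger magnitude is already an
  // integer and is returned unchanged by the final select.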
1444   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1445   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1446 
1447   auto C1 = B.buildFConstant(Ty, C1Val);
1448   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1449 
1450   // TODO: Should this propagate fast-math-flags?
1451   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1452   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1453 
1454   auto C2 = B.buildFConstant(Ty, C2Val);
1455   auto Fabs = B.buildFAbs(Ty, Src);
1456 
1457   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1458   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1460 }
1461 
1462 bool AMDGPULegalizerInfo::legalizeFceil(
1463   MachineInstr &MI, MachineRegisterInfo &MRI,
1464   MachineIRBuilder &B) const {
1465   B.setInstr(MI);
1466 
1467   const LLT S1 = LLT::scalar(1);
1468   const LLT S64 = LLT::scalar(64);
1469 
1470   Register Src = MI.getOperand(1).getReg();
1471   assert(MRI.getType(Src) == S64);
1472 
1473   // result = trunc(src)
1474   // if (src > 0.0 && src != result)
1475   //   result += 1.0
1476 
1477   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1478 
1479   const auto Zero = B.buildFConstant(S64, 0.0);
1480   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1484   auto Add = B.buildSelect(S64, And, One, Zero);
1485 
1486   // TODO: Should this propagate fast-math-flags?
1487   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1489 }
1490 
1491 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1492                                               MachineIRBuilder &B) {
1493   const unsigned FractBits = 52;
1494   const unsigned ExpBits = 11;
1495   LLT S32 = LLT::scalar(32);
1496 
1497   auto Const0 = B.buildConstant(S32, FractBits - 32);
1498   auto Const1 = B.buildConstant(S32, ExpBits);
1499 
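  // ubfe extracts the 11-bit biased exponent, which starts at bit
  // 52 - 32 = 20 of the high dword; subtracting 1023 removes the IEEE-754
  // double-precision bias.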
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1503 
1504   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1505 }
1506 
1507 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1508   MachineInstr &MI, MachineRegisterInfo &MRI,
1509   MachineIRBuilder &B) const {
1510   B.setInstr(MI);
1511 
1512   const LLT S1 = LLT::scalar(1);
1513   const LLT S32 = LLT::scalar(32);
1514   const LLT S64 = LLT::scalar(64);
1515 
1516   Register Src = MI.getOperand(1).getReg();
1517   assert(MRI.getType(Src) == S64);
1518 
1519   // TODO: Should this use extract since the low half is unused?
1520   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1521   Register Hi = Unmerge.getReg(1);
1522 
1523   // Extract the upper half, since this is where we will find the sign and
1524   // exponent.
1525   auto Exp = extractF64Exponent(Hi, B);
1526 
1527   const unsigned FractBits = 52;
1528 
1529   // Extract the sign bit.
1530   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1531   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1532 
1533   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1534 
1535   const auto Zero32 = B.buildConstant(S32, 0);
1536 
1537   // Extend back to 64-bits.
1538   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1539 
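  // Shifting the fraction mask right by the unbiased exponent leaves set bits
  // only below the binary point; clearing those bits in the source truncates
  // toward zero. An exponent < 0 produces a signed zero, and an exponent > 51
  // means the value is already an integer.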
1540   auto Shr = B.buildAShr(S64, FractMask, Exp);
1541   auto Not = B.buildNot(S64, Shr);
1542   auto Tmp0 = B.buildAnd(S64, Src, Not);
1543   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1544 
1545   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1546   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1547 
1548   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1549   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1551 }
1552 
1553 bool AMDGPULegalizerInfo::legalizeITOFP(
1554   MachineInstr &MI, MachineRegisterInfo &MRI,
1555   MachineIRBuilder &B, bool Signed) const {
1556   B.setInstr(MI);
1557 
1558   Register Dst = MI.getOperand(0).getReg();
1559   Register Src = MI.getOperand(1).getReg();
1560 
1561   const LLT S64 = LLT::scalar(64);
1562   const LLT S32 = LLT::scalar(32);
1563 
1564   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1565 
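  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 with ldexp, then the unsigned
  // low half is added in.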
1566   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1567 
1568   auto CvtHi = Signed ?
1569     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1570     B.buildUITOFP(S64, Unmerge.getReg(1));
1571 
1572   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1573 
1574   auto ThirtyTwo = B.buildConstant(S32, 32);
1575   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1576     .addUse(CvtHi.getReg(0))
1577     .addUse(ThirtyTwo.getReg(0));
1578 
1579   // TODO: Should this propagate fast-math-flags?
1580   B.buildFAdd(Dst, LdExp, CvtLo);
1581   MI.eraseFromParent();
1582   return true;
1583 }
1584 
1585 // TODO: Copied from DAG implementation. Verify logic and document how this
1586 // actually works.
1587 bool AMDGPULegalizerInfo::legalizeFPTOI(
1588   MachineInstr &MI, MachineRegisterInfo &MRI,
1589   MachineIRBuilder &B, bool Signed) const {
1590   B.setInstr(MI);
1591 
1592   Register Dst = MI.getOperand(0).getReg();
1593   Register Src = MI.getOperand(1).getReg();
1594 
1595   const LLT S64 = LLT::scalar(64);
1596   const LLT S32 = LLT::scalar(32);
1597 
1598   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1599 
1600   unsigned Flags = MI.getFlags();
1601 
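  // Split the conversion into two 32-bit pieces: K0 = 2^-32 scales the
  // truncated value down to form the high half, and the fma with K1 = -2^32
  // recovers the remaining low 32 bits.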
1602   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1603   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1604   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1605 
1606   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1607   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1608   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1609 
1610   auto Hi = Signed ?
1611     B.buildFPTOSI(S32, FloorMul) :
1612     B.buildFPTOUI(S32, FloorMul);
1613   auto Lo = B.buildFPTOUI(S32, Fma);
1614 
1615   B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
1616   MI.eraseFromParent();
1617 
1618   return true;
1619 }
1620 
1621 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1622   MachineInstr &MI, MachineRegisterInfo &MRI,
1623   MachineIRBuilder &B) const {
1624   MachineFunction &MF = B.getMF();
1625   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1626 
1627   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1628                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1629 
1630   // With ieee_mode disabled, the instructions have the correct behavior
1631   // already for G_FMINNUM/G_FMAXNUM
1632   if (!MFI->getMode().IEEE)
1633     return !IsIEEEOp;
1634 
1635   if (IsIEEEOp)
1636     return true;
1637 
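  // In IEEE mode the plain G_FMINNUM/G_FMAXNUM must be expanded; the generic
  // lowering rewrites them in terms of the IEEE variants.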
1638   MachineIRBuilder HelperBuilder(MI);
1639   GISelObserverWrapper DummyObserver;
1640   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1641   HelperBuilder.setInstr(MI);
1642   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1643 }
1644 
1645 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1646   MachineInstr &MI, MachineRegisterInfo &MRI,
1647   MachineIRBuilder &B) const {
1648   // TODO: Should move some of this into LegalizerHelper.
1649 
1650   // TODO: Promote dynamic indexing of s16 to s32
1651   // TODO: Dynamic s64 indexing is only legal for SGPR.
1652   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1653   if (!IdxVal) // Dynamic case will be selected to register indexing.
1654     return true;
1655 
1656   Register Dst = MI.getOperand(0).getReg();
1657   Register Vec = MI.getOperand(1).getReg();
1658 
1659   LLT VecTy = MRI.getType(Vec);
1660   LLT EltTy = VecTy.getElementType();
1661   assert(EltTy == MRI.getType(Dst));
1662 
1663   B.setInstr(MI);
1664 
1665   if (IdxVal.getValue() < VecTy.getNumElements())
1666     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1667   else
1668     B.buildUndef(Dst);
1669 
1670   MI.eraseFromParent();
1671   return true;
1672 }
1673 
1674 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1675   MachineInstr &MI, MachineRegisterInfo &MRI,
1676   MachineIRBuilder &B) const {
1677   // TODO: Should move some of this into LegalizerHelper.
1678 
1679   // TODO: Promote dynamic indexing of s16 to s32
1680   // TODO: Dynamic s64 indexing is only legal for SGPR.
1681   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1682   if (!IdxVal) // Dynamic case will be selected to register indexing.
1683     return true;
1684 
1685   Register Dst = MI.getOperand(0).getReg();
1686   Register Vec = MI.getOperand(1).getReg();
1687   Register Ins = MI.getOperand(2).getReg();
1688 
1689   LLT VecTy = MRI.getType(Vec);
1690   LLT EltTy = VecTy.getElementType();
1691   assert(EltTy == MRI.getType(Ins));
1692 
1693   B.setInstr(MI);
1694 
1695   if (IdxVal.getValue() < VecTy.getNumElements())
1696     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1697   else
1698     B.buildUndef(Dst);
1699 
1700   MI.eraseFromParent();
1701   return true;
1702 }
1703 
1704 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1705   assert(Mask.size() == 2);
1706 
1707   // If one half is undef, the other is trivially in the same reg.
1708   if (Mask[0] == -1 || Mask[1] == -1)
1709     return true;
1710   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1711          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1712 }
1713 
1714 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1715   MachineInstr &MI, MachineRegisterInfo &MRI,
1716   MachineIRBuilder &B) const {
1717   const LLT V2S16 = LLT::vector(2, 16);
1718 
1719   Register Dst = MI.getOperand(0).getReg();
1720   Register Src0 = MI.getOperand(1).getReg();
1721   LLT DstTy = MRI.getType(Dst);
1722   LLT SrcTy = MRI.getType(Src0);
1723 
1724   if (SrcTy == V2S16 && DstTy == V2S16 &&
1725       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1726     return true;
1727 
1728   MachineIRBuilder HelperBuilder(MI);
1729   GISelObserverWrapper DummyObserver;
1730   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1731   HelperBuilder.setInstr(MI);
1732   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeSinCos(
1736   MachineInstr &MI, MachineRegisterInfo &MRI,
1737   MachineIRBuilder &B) const {
1738   B.setInstr(MI);
1739 
1740   Register DstReg = MI.getOperand(0).getReg();
1741   Register SrcReg = MI.getOperand(1).getReg();
1742   LLT Ty = MRI.getType(DstReg);
1743   unsigned Flags = MI.getFlags();
1744 
1745   Register TrigVal;
1746   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1747   if (ST.hasTrigReducedRange()) {
1748     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1749     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1750       .addUse(MulVal.getReg(0))
1751       .setMIFlags(Flags).getReg(0);
1752   } else
1753     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1754 
1755   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1756     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1757   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1758     .addUse(TrigVal)
1759     .setMIFlags(Flags);
1760   MI.eraseFromParent();
1761   return true;
1762 }
1763 
1764 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1765   Register DstReg, LLT PtrTy,
1766   MachineIRBuilder &B, const GlobalValue *GV,
1767   unsigned Offset, unsigned GAFlags) const {
1768   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1769   // to the following code sequence:
1770   //
1771   // For constant address space:
1772   //   s_getpc_b64 s[0:1]
1773   //   s_add_u32 s0, s0, $symbol
1774   //   s_addc_u32 s1, s1, 0
1775   //
1776   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1777   //   a fixup or relocation is emitted to replace $symbol with a literal
1778   //   constant, which is a pc-relative offset from the encoding of the $symbol
1779   //   operand to the global variable.
1780   //
1781   // For global address space:
1782   //   s_getpc_b64 s[0:1]
1783   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1784   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1785   //
1786   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1787   //   fixups or relocations are emitted to replace $symbol@*@lo and
1788   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1789   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1790   //   operand to the global variable.
1791   //
1792   // What we want here is an offset from the value returned by s_getpc
1793   // (which is the address of the s_add_u32 instruction) to the global
1794   // variable, but since the encoding of $symbol starts 4 bytes after the start
1795   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1796   // small. This requires us to add 4 to the global variable offset in order to
1797   // compute the correct address.
1798 
1799   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1800 
1801   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1802     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1803 
1804   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1805     .addDef(PCReg);
1806 
1807   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1808   if (GAFlags == SIInstrInfo::MO_NONE)
1809     MIB.addImm(0);
1810   else
1811     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1812 
1813   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1814 
1815   if (PtrTy.getSizeInBits() == 32)
1816     B.buildExtract(DstReg, PCReg, 0);
1817   return true;
}
1819 
1820 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   Register DstReg = MI.getOperand(0).getReg();
1824   LLT Ty = MRI.getType(DstReg);
1825   unsigned AS = Ty.getAddressSpace();
1826 
1827   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1828   MachineFunction &MF = B.getMF();
1829   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1830   B.setInstr(MI);
1831 
1832   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1833     if (!MFI->isEntryFunction()) {
1834       const Function &Fn = MF.getFunction();
1835       DiagnosticInfoUnsupported BadLDSDecl(
1836         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1837       Fn.getContext().diagnose(BadLDSDecl);
1838     }
1839 
1840     // TODO: We could emit code to handle the initialization somewhere.
1841     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1842       const SITargetLowering *TLI = ST.getTargetLowering();
1843       if (!TLI->shouldUseLDSConstAddress(GV)) {
1844         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1846       }
1847 
1848       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1849       MI.eraseFromParent();
1850       return true;
1851     }
1852 
1853     const Function &Fn = MF.getFunction();
1854     DiagnosticInfoUnsupported BadInit(
1855       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1856     Fn.getContext().diagnose(BadInit);
1857     return true;
1858   }
1859 
1860   const SITargetLowering *TLI = ST.getTargetLowering();
1861 
1862   if (TLI->shouldEmitFixup(GV)) {
1863     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1864     MI.eraseFromParent();
1865     return true;
1866   }
1867 
1868   if (TLI->shouldEmitPCReloc(GV)) {
1869     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1870     MI.eraseFromParent();
1871     return true;
1872   }
1873 
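  // Otherwise go through the GOT: materialize the pc-relative address of the
  // GOT entry and load the global's address from it.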
1874   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1875   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1876 
1877   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1878     MachinePointerInfo::getGOT(MF),
1879     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1880     MachineMemOperand::MOInvariant,
1881     8 /*Size*/, 8 /*Align*/);
1882 
1883   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1884 
1885   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1887     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1888     B.buildExtract(DstReg, Load, 0);
1889   } else
1890     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1891 
1892   MI.eraseFromParent();
1893   return true;
1894 }
1895 
1896 bool AMDGPULegalizerInfo::legalizeLoad(
1897   MachineInstr &MI, MachineRegisterInfo &MRI,
1898   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1899   B.setInstr(MI);
1900   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1901   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1902   Observer.changingInstr(MI);
1903   MI.getOperand(1).setReg(Cast.getReg(0));
1904   Observer.changedInstr(MI);
1905   return true;
1906 }
1907 
1908 bool AMDGPULegalizerInfo::legalizeFMad(
1909   MachineInstr &MI, MachineRegisterInfo &MRI,
1910   MachineIRBuilder &B) const {
1911   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1912   assert(Ty.isScalar());
1913 
1914   MachineFunction &MF = B.getMF();
1915   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1916 
1917   // TODO: Always legal with future ftz flag.
1918   // FIXME: Do we need just output?
1919   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
1920     return true;
1921   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
1922     return true;
1923 
1924   MachineIRBuilder HelperBuilder(MI);
1925   GISelObserverWrapper DummyObserver;
1926   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1927   HelperBuilder.setMBB(*MI.getParent());
1928   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1929 }
1930 
1931 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1932   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1933   Register DstReg = MI.getOperand(0).getReg();
1934   Register PtrReg = MI.getOperand(1).getReg();
1935   Register CmpVal = MI.getOperand(2).getReg();
1936   Register NewVal = MI.getOperand(3).getReg();
1937 
1938   assert(SITargetLowering::isFlatGlobalAddrSpace(
1939            MRI.getType(PtrReg).getAddressSpace()) &&
1940          "this should not have been custom lowered");
1941 
1942   LLT ValTy = MRI.getType(CmpVal);
1943   LLT VecTy = LLT::vector(2, ValTy);
1944 
1945   B.setInstr(MI);
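  // The target cmpxchg pseudo takes the new value and the compare value
  // packed into a single vector data operand.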
1946   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1947 
1948   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1949     .addDef(DstReg)
1950     .addUse(PtrReg)
1951     .addUse(PackedVal)
1952     .setMemRefs(MI.memoperands());
1953 
1954   MI.eraseFromParent();
1955   return true;
1956 }
1957 
1958 bool AMDGPULegalizerInfo::legalizeFlog(
1959   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1960   Register Dst = MI.getOperand(0).getReg();
1961   Register Src = MI.getOperand(1).getReg();
1962   LLT Ty = B.getMRI()->getType(Dst);
1963   unsigned Flags = MI.getFlags();
1964   B.setInstr(MI);
1965 
1966   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1967   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1968 
1969   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1970   MI.eraseFromParent();
1971   return true;
1972 }
1973 
1974 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
1975                                        MachineIRBuilder &B) const {
1976   Register Dst = MI.getOperand(0).getReg();
1977   Register Src = MI.getOperand(1).getReg();
1978   unsigned Flags = MI.getFlags();
1979   LLT Ty = B.getMRI()->getType(Dst);
1980   B.setInstr(MI);
1981 
1982   auto K = B.buildFConstant(Ty, numbers::log2e);
1983   auto Mul = B.buildFMul(Ty, Src, K, Flags);
1984   B.buildFExp2(Dst, Mul, Flags);
1985   MI.eraseFromParent();
1986   return true;
1987 }
1988 
1989 // Find a source register, ignoring any possible source modifiers.
1990 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
1991   Register ModSrc = OrigSrc;
1992   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
1993     ModSrc = SrcFNeg->getOperand(1).getReg();
1994     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
1995       ModSrc = SrcFAbs->getOperand(1).getReg();
1996   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
1997     ModSrc = SrcFAbs->getOperand(1).getReg();
1998   return ModSrc;
1999 }
2000 
2001 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2002                                          MachineRegisterInfo &MRI,
2003                                          MachineIRBuilder &B) const {
2004   B.setInstr(MI);
2005 
2006   const LLT S1 = LLT::scalar(1);
2007   const LLT S64 = LLT::scalar(64);
2008   Register Dst = MI.getOperand(0).getReg();
2009   Register OrigSrc = MI.getOperand(1).getReg();
2010   unsigned Flags = MI.getFlags();
2011   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2012          "this should not have been custom lowered");
2013 
2014   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2015   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2016   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2017   // V_FRACT bug is:
2018   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2019   //
2020   // Convert floor(x) to (x - fract(x))
2021 
2022   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2023     .addUse(OrigSrc)
2024     .setMIFlags(Flags);
2025 
2026   // Give source modifier matching some assistance before obscuring a foldable
2027   // pattern.
2028 
2029   // TODO: We can avoid the neg on the fract? The input sign to fract
2030   // shouldn't matter?
2031   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2032 
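  // The clamp constant below is the largest double strictly less than 1.0
  // (the 0.99999999999999999 in the formula above).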
2033   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2034 
2035   Register Min = MRI.createGenericVirtualRegister(S64);
2036 
2037   // We don't need to concern ourselves with the snan handling difference, so
2038   // use the one which will directly select.
2039   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2040   if (MFI->getMode().IEEE)
2041     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2042   else
2043     B.buildFMinNum(Min, Fract, Const, Flags);
2044 
2045   Register CorrectedFract = Min;
2046   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2048     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2049   }
2050 
2051   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2052   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2053 
2054   MI.eraseFromParent();
2055   return true;
2056 }
2057 
2058 // Turn an illegal packed v2s16 build vector into bit operations.
2059 // TODO: This should probably be a bitcast action in LegalizerHelper.
2060 bool AMDGPULegalizerInfo::legalizeBuildVector(
2061   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2062   Register Dst = MI.getOperand(0).getReg();
2063   LLT DstTy = MRI.getType(Dst);
2064   const LLT S32 = LLT::scalar(32);
2065   const LLT V2S16 = LLT::vector(2, 16);
2066   (void)DstTy;
2067   (void)V2S16;
2068   assert(DstTy == V2S16);
2069 
2070   Register Src0 = MI.getOperand(1).getReg();
2071   Register Src1 = MI.getOperand(2).getReg();
2072   assert(MRI.getType(Src0) == LLT::scalar(16));
2073 
2074   B.setInstr(MI);
2075   auto Merge = B.buildMerge(S32, {Src0, Src1});
2076   B.buildBitcast(Dst, Merge);
2077 
2078   MI.eraseFromParent();
2079   return true;
2080 }
2081 
// Return the use branch instruction, or null if the usage is invalid.
2083 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2084                                        MachineRegisterInfo &MRI,
2085                                        MachineInstr *&Br) {
2086   Register CondDef = MI.getOperand(0).getReg();
2087   if (!MRI.hasOneNonDBGUse(CondDef))
2088     return nullptr;
2089 
2090   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2091   if (UseMI.getParent() != MI.getParent() ||
2092       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2093     return nullptr;
2094 
2095   // Make sure the cond br is followed by a G_BR
2096   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2097   if (Next != MI.getParent()->end()) {
2098     if (Next->getOpcode() != AMDGPU::G_BR)
2099       return nullptr;
2100     Br = &*Next;
2101   }
2102 
2103   return &UseMI;
2104 }
2105 
2106 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2107                                                 Register Reg, LLT Ty) const {
2108   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2109   if (LiveIn)
2110     return LiveIn;
2111 
2112   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2113   MRI.addLiveIn(Reg, NewReg);
2114   return NewReg;
2115 }
2116 
2117 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2118                                          const ArgDescriptor *Arg) const {
2119   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2120     return false; // TODO: Handle these
2121 
2122   assert(Arg->getRegister().isPhysical());
2123 
2124   MachineRegisterInfo &MRI = *B.getMRI();
2125 
2126   LLT Ty = MRI.getType(DstReg);
2127   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2128 
2129   if (Arg->isMasked()) {
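    // A masked argument occupies only a bit field of its register (e.g. the
    // packed work-item IDs), so shift and mask to extract this argument.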
2130     // TODO: Should we try to emit this once in the entry block?
2131     const LLT S32 = LLT::scalar(32);
2132     const unsigned Mask = Arg->getMask();
2133     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2134 
2135     Register AndMaskSrc = LiveIn;
2136 
2137     if (Shift != 0) {
2138       auto ShiftAmt = B.buildConstant(S32, Shift);
2139       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2140     }
2141 
2142     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2143   } else
2144     B.buildCopy(DstReg, LiveIn);
2145 
  // Insert the argument copy if it doesn't already exist.
2147   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2148   if (!MRI.getVRegDef(LiveIn)) {
2149     // FIXME: Should have scoped insert pt
2150     MachineBasicBlock &OrigInsBB = B.getMBB();
2151     auto OrigInsPt = B.getInsertPt();
2152 
2153     MachineBasicBlock &EntryMBB = B.getMF().front();
2154     EntryMBB.addLiveIn(Arg->getRegister());
2155     B.setInsertPt(EntryMBB, EntryMBB.begin());
2156     B.buildCopy(LiveIn, Arg->getRegister());
2157 
2158     B.setInsertPt(OrigInsBB, OrigInsPt);
2159   }
2160 
2161   return true;
2162 }
2163 
2164 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2165   MachineInstr &MI,
2166   MachineRegisterInfo &MRI,
2167   MachineIRBuilder &B,
2168   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2169   B.setInstr(MI);
2170 
2171   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2172 
2173   const ArgDescriptor *Arg;
2174   const TargetRegisterClass *RC;
2175   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2176   if (!Arg) {
2177     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2178     return false;
2179   }
2180 
2181   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2182     MI.eraseFromParent();
2183     return true;
2184   }
2185 
2186   return false;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2190                                        MachineRegisterInfo &MRI,
2191                                        MachineIRBuilder &B) const {
2192   B.setInstr(MI);
2193   Register Dst = MI.getOperand(0).getReg();
2194   LLT DstTy = MRI.getType(Dst);
2195   LLT S16 = LLT::scalar(16);
2196   LLT S32 = LLT::scalar(32);
2197   LLT S64 = LLT::scalar(64);
2198 
2199   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2200     return true;
2201 
2202   if (DstTy == S16)
2203     return legalizeFDIV16(MI, MRI, B);
2204   if (DstTy == S32)
2205     return legalizeFDIV32(MI, MRI, B);
2206   if (DstTy == S64)
2207     return legalizeFDIV64(MI, MRI, B);
2208 
2209   return false;
2210 }
2211 
2212 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2213                                                  MachineRegisterInfo &MRI,
2214                                                  MachineIRBuilder &B) const {
2215   Register Res = MI.getOperand(0).getReg();
2216   Register LHS = MI.getOperand(1).getReg();
2217   Register RHS = MI.getOperand(2).getReg();
2218 
2219   uint16_t Flags = MI.getFlags();
2220 
2221   LLT ResTy = MRI.getType(Res);
2222   LLT S32 = LLT::scalar(32);
2223   LLT S64 = LLT::scalar(64);
2224 
2225   const MachineFunction &MF = B.getMF();
2226   bool Unsafe =
2227     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2228 
2229   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2230     return false;
2231 
2232   if (!Unsafe && ResTy == S32 &&
2233       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2234     return false;
2235 
2236   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2237     // 1 / x -> RCP(x)
2238     if (CLHS->isExactlyValue(1.0)) {
2239       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2240         .addUse(RHS)
2241         .setMIFlags(Flags);
2242 
2243       MI.eraseFromParent();
2244       return true;
2245     }
2246 
2247     // -1 / x -> RCP( FNEG(x) )
2248     if (CLHS->isExactlyValue(-1.0)) {
2249       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2250       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2251         .addUse(FNeg.getReg(0))
2252         .setMIFlags(Flags);
2253 
2254       MI.eraseFromParent();
2255       return true;
2256     }
2257   }
2258 
2259   // x / y -> x * (1.0 / y)
2260   if (Unsafe) {
2261     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2262       .addUse(RHS)
2263       .setMIFlags(Flags);
2264     B.buildFMul(Res, LHS, RCP, Flags);
2265 
2266     MI.eraseFromParent();
2267     return true;
2268   }
2269 
2270   return false;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2274                                          MachineRegisterInfo &MRI,
2275                                          MachineIRBuilder &B) const {
2276   B.setInstr(MI);
2277   Register Res = MI.getOperand(0).getReg();
2278   Register LHS = MI.getOperand(1).getReg();
2279   Register RHS = MI.getOperand(2).getReg();
2280 
2281   uint16_t Flags = MI.getFlags();
2282 
2283   LLT S16 = LLT::scalar(16);
2284   LLT S32 = LLT::scalar(32);
2285 
2286   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2287   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2288 
2289   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2290     .addUse(RHSExt.getReg(0))
2291     .setMIFlags(Flags);
2292 
2293   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2294   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2295 
2296   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2297     .addUse(RDst.getReg(0))
2298     .addUse(RHS)
2299     .addUse(LHS)
2300     .setMIFlags(Flags);
2301 
2302   MI.eraseFromParent();
2303   return true;
2304 }
2305 
2306 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2307 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2308 static void toggleSPDenormMode(bool Enable,
2309                                MachineIRBuilder &B,
2310                                const GCNSubtarget &ST,
2311                                AMDGPU::SIModeRegisterDefaults Mode) {
2312   // Set SP denorm mode to this value.
2313   unsigned SPDenormMode =
2314     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2315 
2316   if (ST.hasDenormModeInst()) {
2317     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2318     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2319 
2320     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2321     B.buildInstr(AMDGPU::S_DENORM_MODE)
2322       .addImm(NewDenormModeValue);
2323 
2324   } else {
2325     // Select FP32 bit field in mode register.
2326     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2327                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2328                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
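    // That is hwreg(HW_REG_MODE, 4, 2): OFFSET = 4 and WIDTH_M1 = 1 select
    // the two FP32 denorm-mode bits of the MODE register.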
2329 
2330     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2331       .addImm(SPDenormMode)
2332       .addImm(SPDenormModeBitField);
2333   }
2334 }
2335 
2336 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2337                                          MachineRegisterInfo &MRI,
2338                                          MachineIRBuilder &B) const {
2339   B.setInstr(MI);
2340   Register Res = MI.getOperand(0).getReg();
2341   Register LHS = MI.getOperand(1).getReg();
2342   Register RHS = MI.getOperand(2).getReg();
2343   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2344   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2345 
2346   uint16_t Flags = MI.getFlags();
2347 
2348   LLT S32 = LLT::scalar(32);
2349   LLT S1 = LLT::scalar(1);
2350 
2351   auto One = B.buildFConstant(S32, 1.0f);
2352 
2353   auto DenominatorScaled =
2354     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2355       .addUse(RHS)
2356       .addUse(LHS)
2357       .addImm(1)
2358       .setMIFlags(Flags);
2359   auto NumeratorScaled =
2360     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2361       .addUse(LHS)
2362       .addUse(RHS)
2363       .addImm(0)
2364       .setMIFlags(Flags);
2365 
2366   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2367     .addUse(DenominatorScaled.getReg(0))
2368     .setMIFlags(Flags);
2369   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2370 
2371   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2372   // aren't modeled as reading it.
2373   if (!Mode.allFP32Denormals())
2374     toggleSPDenormMode(true, B, ST, Mode);
2375 
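  // Roughly: one Newton-Raphson step refines the reciprocal estimate, a
  // quotient estimate is formed from it, and two fused multiply-add
  // corrections feed div_fmas/div_fixup, which apply the final scaling and
  // special-case handling.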
2376   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2377   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2378   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2379   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2380   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2381   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2382 
2383   if (!Mode.allFP32Denormals())
2384     toggleSPDenormMode(false, B, ST, Mode);
2385 
2386   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2387     .addUse(Fma4.getReg(0))
2388     .addUse(Fma1.getReg(0))
2389     .addUse(Fma3.getReg(0))
2390     .addUse(NumeratorScaled.getReg(1))
2391     .setMIFlags(Flags);
2392 
2393   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2394     .addUse(Fmas.getReg(0))
2395     .addUse(RHS)
2396     .addUse(LHS)
2397     .setMIFlags(Flags);
2398 
2399   MI.eraseFromParent();
2400   return true;
2401 }
2402 
2403 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2404                                          MachineRegisterInfo &MRI,
2405                                          MachineIRBuilder &B) const {
2406   B.setInstr(MI);
2407   Register Res = MI.getOperand(0).getReg();
2408   Register LHS = MI.getOperand(1).getReg();
2409   Register RHS = MI.getOperand(2).getReg();
2410 
2411   uint16_t Flags = MI.getFlags();
2412 
2413   LLT S64 = LLT::scalar(64);
2414   LLT S1 = LLT::scalar(1);
2415 
2416   auto One = B.buildFConstant(S64, 1.0);
2417 
2418   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2419     .addUse(LHS)
2420     .addUse(RHS)
2421     .addImm(1)
2422     .setMIFlags(Flags);
2423 
2424   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2425 
2426   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2427     .addUse(DivScale0.getReg(0))
2428     .setMIFlags(Flags);
2429 
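  // Refine the reciprocal and the quotient with fused multiply-adds;
  // div_fmas and div_fixup then undo the div_scale scaling and handle the
  // special cases (infinities, zeros and NaNs).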
2430   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2431   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2432   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2433 
2434   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2435     .addUse(LHS)
2436     .addUse(RHS)
2437     .addImm(0)
2438     .setMIFlags(Flags);
2439 
2440   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2442   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2443 
2444   Register Scale;
2445   if (!ST.hasUsableDivScaleConditionOutput()) {
2446     // Workaround a hardware bug on SI where the condition output from div_scale
2447     // is not usable.
2448 
2449     LLT S32 = LLT::scalar(32);
2450 
2451     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2452     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2453     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2454     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2455 
2456     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2457                               Scale1Unmerge.getReg(1));
2458     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2459                               Scale0Unmerge.getReg(1));
2460     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2461   } else {
2462     Scale = DivScale1.getReg(1);
2463   }
2464 
2465   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2466     .addUse(Fma4.getReg(0))
2467     .addUse(Fma3.getReg(0))
2468     .addUse(Mul.getReg(0))
2469     .addUse(Scale)
2470     .setMIFlags(Flags);
2471 
2472   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2473     .addUse(Fmas.getReg(0))
2474     .addUse(RHS)
2475     .addUse(LHS)
2476     .setMIFlags(Flags);
2477 
2478   MI.eraseFromParent();
2479   return true;
2480 }
2481 
2482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2483                                                  MachineRegisterInfo &MRI,
2484                                                  MachineIRBuilder &B) const {
2485   B.setInstr(MI);
2486   Register Res = MI.getOperand(0).getReg();
2487   Register LHS = MI.getOperand(2).getReg();
2488   Register RHS = MI.getOperand(3).getReg();
2489   uint16_t Flags = MI.getFlags();
2490 
2491   LLT S32 = LLT::scalar(32);
2492   LLT S1 = LLT::scalar(1);
2493 
2494   auto Abs = B.buildFAbs(S32, RHS, Flags);
2495   const APFloat C0Val(1.0f);
2496 
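  // C0 is 2^96 and C1 is 2^-32 as f32 bit patterns: when |RHS| is very large,
  // pre-scale it by 2^-32 so the reciprocal stays in range, then multiply the
  // result by the same factor to compensate.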
2497   auto C0 = B.buildConstant(S32, 0x6f800000);
2498   auto C1 = B.buildConstant(S32, 0x2f800000);
2499   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2500 
2501   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2502   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2503 
2504   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2505 
2506   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2507     .addUse(Mul0.getReg(0))
2508     .setMIFlags(Flags);
2509 
2510   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2511 
2512   B.buildFMul(Res, Sel, Mul1, Flags);
2513 
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2519                                                  MachineRegisterInfo &MRI,
2520                                                  MachineIRBuilder &B) const {
2521   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2522   if (!MFI->isEntryFunction()) {
2523     return legalizePreloadedArgIntrin(MI, MRI, B,
2524                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2525   }
2526 
2527   B.setInstr(MI);
2528 
2529   uint64_t Offset =
2530     ST.getTargetLowering()->getImplicitParameterOffset(
2531       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2532   Register DstReg = MI.getOperand(0).getReg();
2533   LLT DstTy = MRI.getType(DstReg);
2534   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2535 
2536   const ArgDescriptor *Arg;
2537   const TargetRegisterClass *RC;
2538   std::tie(Arg, RC)
2539     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2540   if (!Arg)
2541     return false;
2542 
2543   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2544   if (!loadInputValue(KernargPtrReg, B, Arg))
2545     return false;
2546 
2547   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2548   MI.eraseFromParent();
2549   return true;
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2553                                               MachineRegisterInfo &MRI,
2554                                               MachineIRBuilder &B,
2555                                               unsigned AddrSpace) const {
2556   B.setInstr(MI);
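  // A flat pointer is in the queried segment iff its high 32 bits match that
  // segment's aperture base.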
2557   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2558   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2559   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2560   MI.eraseFromParent();
2561   return true;
2562 }
2563 
2564 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2565 // offset (the offset that is included in bounds checking and swizzling, to be
2566 // split between the instruction's voffset and immoffset fields) and soffset
2567 // (the offset that is excluded from bounds checking and swizzling, to go in
2568 // the instruction's soffset field).  This function takes the first kind of
2569 // offset and figures out how to split it between voffset and immoffset.
2570 std::tuple<Register, unsigned, unsigned>
2571 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2572                                         Register OrigOffset) const {
2573   const unsigned MaxImm = 4095;
2574   Register BaseReg;
2575   unsigned TotalConstOffset;
2576   MachineInstr *OffsetDef;
2577   const LLT S32 = LLT::scalar(32);
2578 
2579   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2580     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2581 
2582   unsigned ImmOffset = TotalConstOffset;
2583 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits (value & 4095) in the immoffset field, so that the value that
  // is copied/added for the voffset field is a multiple of 4096 and stands
  // more chance of being CSEd with the copy/add for another similar load/store.
2588   // However, do not do that rounding down to a multiple of 4096 if that is a
2589   // negative number, as it appears to be illegal to have a negative offset
2590   // in the vgpr, even if adding the immediate offset makes it positive.
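  // For example, a constant offset of 5000 is split into Overflow = 4096
  // (added into the voffset register) and ImmOffset = 904.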
2591   unsigned Overflow = ImmOffset & ~MaxImm;
2592   ImmOffset -= Overflow;
2593   if ((int32_t)Overflow < 0) {
2594     Overflow += ImmOffset;
2595     ImmOffset = 0;
2596   }
2597 
2598   if (Overflow != 0) {
2599     if (!BaseReg) {
2600       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2601     } else {
2602       auto OverflowVal = B.buildConstant(S32, Overflow);
2603       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2604     }
2605   }
2606 
2607   if (!BaseReg)
2608     BaseReg = B.buildConstant(S32, 0).getReg(0);
2609 
2610   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2611 }
2612 
2613 /// Handle register layout difference for f16 images for some subtargets.
2614 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2615                                              MachineRegisterInfo &MRI,
2616                                              Register Reg) const {
2617   if (!ST.hasUnpackedD16VMem())
2618     return Reg;
2619 
2620   const LLT S16 = LLT::scalar(16);
2621   const LLT S32 = LLT::scalar(32);
2622   LLT StoreVT = MRI.getType(Reg);
2623   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2624 
2625   auto Unmerge = B.buildUnmerge(S16, Reg);
2626 
2627   SmallVector<Register, 4> WideRegs;
2628   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2629     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2630 
2631   int NumElts = StoreVT.getNumElements();
2632 
2633   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2634 }
2635 
2636 Register AMDGPULegalizerInfo::fixStoreSourceType(
2637   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2638   MachineRegisterInfo *MRI = B.getMRI();
2639   LLT Ty = MRI->getType(VData);
2640 
2641   const LLT S16 = LLT::scalar(16);
2642 
2643   // Fixup illegal register types for i8 stores.
2644   if (Ty == LLT::scalar(8) || Ty == S16) {
2645     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2646     return AnyExt;
2647   }
2648 
2649   if (Ty.isVector()) {
2650     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2651       if (IsFormat)
2652         return handleD16VData(B, *MRI, VData);
2653     }
2654   }
2655 
2656   return VData;
2657 }
2658 
2659 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2660                                               MachineRegisterInfo &MRI,
2661                                               MachineIRBuilder &B,
2662                                               bool IsTyped,
2663                                               bool IsFormat) const {
2664   B.setInstr(MI);
2665 
2666   Register VData = MI.getOperand(1).getReg();
2667   LLT Ty = MRI.getType(VData);
2668   LLT EltTy = Ty.getScalarType();
2669   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2670   const LLT S32 = LLT::scalar(32);
2671 
2672   VData = fixStoreSourceType(B, VData, IsFormat);
2673   Register RSrc = MI.getOperand(2).getReg();
2674 
2675   MachineMemOperand *MMO = *MI.memoperands_begin();
2676   const int MemSize = MMO->getSize();
2677 
2678   unsigned ImmOffset;
2679   unsigned TotalOffset;
2680 
2681   // The typed intrinsics add an immediate after the registers.
2682   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2683 
2684   // The struct intrinsic variants add one additional operand over raw.
2685   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2686   Register VIndex;
2687   int OpOffset = 0;
2688   if (HasVIndex) {
2689     VIndex = MI.getOperand(3).getReg();
2690     OpOffset = 1;
2691   }
2692 
2693   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2694   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2695 
2696   unsigned Format = 0;
2697   if (IsTyped) {
2698     Format = MI.getOperand(5 + OpOffset).getImm();
2699     ++OpOffset;
2700   }
2701 
2702   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2703 
2704   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2705   if (TotalOffset != 0)
2706     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2707 
2708   unsigned Opc;
2709   if (IsTyped) {
2710     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2711                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2712   } else if (IsFormat) {
2713     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2714                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2715   } else {
2716     switch (MemSize) {
2717     case 1:
2718       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2719       break;
2720     case 2:
2721       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2722       break;
2723     default:
2724       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2725       break;
2726     }
2727   }
2728 
2729   if (!VIndex)
2730     VIndex = B.buildConstant(S32, 0).getReg(0);
2731 
2732   auto MIB = B.buildInstr(Opc)
2733     .addUse(VData)              // vdata
2734     .addUse(RSrc)               // rsrc
2735     .addUse(VIndex)             // vindex
2736     .addUse(VOffset)            // voffset
2737     .addUse(SOffset)            // soffset
2738     .addImm(ImmOffset);         // offset(imm)
2739 
2740   if (IsTyped)
2741     MIB.addImm(Format);
2742 
2743   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2744      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2745      .addMemOperand(MMO);
2746 
2747   MI.eraseFromParent();
2748   return true;
2749 }
2750 
2751 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2752                                              MachineRegisterInfo &MRI,
2753                                              MachineIRBuilder &B,
2754                                              bool IsFormat,
2755                                              bool IsTyped) const {
2756   B.setInstr(MI);
2757 
2758   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2759   MachineMemOperand *MMO = *MI.memoperands_begin();
2760   const int MemSize = MMO->getSize();
2761   const LLT S32 = LLT::scalar(32);
2762 
2763   Register Dst = MI.getOperand(0).getReg();
2764   Register RSrc = MI.getOperand(2).getReg();
2765 
2766   // The typed intrinsics add an immediate after the registers.
2767   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2768 
2769   // The struct intrinsic variants add one additional operand over raw.
2770   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2771   Register VIndex;
2772   int OpOffset = 0;
2773   if (HasVIndex) {
2774     VIndex = MI.getOperand(3).getReg();
2775     OpOffset = 1;
2776   }
2777 
2778   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2779   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2780 
2781   unsigned Format = 0;
2782   if (IsTyped) {
2783     Format = MI.getOperand(5 + OpOffset).getImm();
2784     ++OpOffset;
2785   }
2786 
2787   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2788   unsigned ImmOffset;
2789   unsigned TotalOffset;
2790 
2791   LLT Ty = MRI.getType(Dst);
2792   LLT EltTy = Ty.getScalarType();
2793   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2794   const bool Unpacked = ST.hasUnpackedD16VMem();
2795 
2796   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2797   if (TotalOffset != 0)
2798     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2799 
2800   unsigned Opc;
2801 
2802   if (IsTyped) {
2803     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2804                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2805   } else if (IsFormat) {
2806     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2807                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2808   } else {
2809     switch (MemSize) {
2810     case 1:
2811       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2812       break;
2813     case 2:
2814       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2815       break;
2816     default:
2817       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2818       break;
2819     }
2820   }
2821 
2822   Register LoadDstReg;
2823 
2824   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2825   LLT UnpackedTy = Ty.changeElementSize(32);
2826 
2827   if (IsExtLoad)
2828     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2829   else if (Unpacked && IsD16 && Ty.isVector())
2830     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2831   else
2832     LoadDstReg = Dst;
2833 
2834   if (!VIndex)
2835     VIndex = B.buildConstant(S32, 0).getReg(0);
2836 
2837   auto MIB = B.buildInstr(Opc)
2838     .addDef(LoadDstReg)         // vdata
2839     .addUse(RSrc)               // rsrc
2840     .addUse(VIndex)             // vindex
2841     .addUse(VOffset)            // voffset
2842     .addUse(SOffset)            // soffset
2843     .addImm(ImmOffset);         // offset(imm)
2844 
2845   if (IsTyped)
2846     MIB.addImm(Format);
2847 
2848   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2849      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2850      .addMemOperand(MMO);
2851 
2852   if (LoadDstReg != Dst) {
2853     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2854 
    // The result for an extending load was widened; truncate it back down.
2856     if (IsExtLoad)
2857       B.buildTrunc(Dst, LoadDstReg);
2858     else {
2859       // Repack to original 16-bit vector result
2860       // FIXME: G_TRUNC should work, but legalization currently fails
2861       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2862       SmallVector<Register, 4> Repack;
2863       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2864         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2865       B.buildMerge(Dst, Repack);
2866     }
2867   }
2868 
2869   MI.eraseFromParent();
2870   return true;
2871 }
2872 
2873 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2874                                                MachineIRBuilder &B,
2875                                                bool IsInc) const {
2876   B.setInstr(MI);
2877   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2878                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2879   B.buildInstr(Opc)
2880     .addDef(MI.getOperand(0).getReg())
2881     .addUse(MI.getOperand(2).getReg())
2882     .addUse(MI.getOperand(3).getReg())
2883     .cloneMemRefs(MI);
2884   MI.eraseFromParent();
2885   return true;
2886 }
2887 
2888 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2889   switch (IntrID) {
2890   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2891   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2892     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2893   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2894   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2896   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2897   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2898     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2899   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2900   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2901     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2902   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2903   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2904     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2905   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2906   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2907     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2908   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2909   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2910     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2911   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2912   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2913     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2914   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2915   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2916     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2917   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2918   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2919     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2920   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2921   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2922     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2923   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2924   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2925     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2926   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2927   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2928     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2929   default:
2930     llvm_unreachable("unhandled atomic opcode");
2931   }
2932 }
2933 
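// Rewrite a raw/struct buffer atomic intrinsic into the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo with a single fixed operand order, splitting
// the offset into immediate and register portions along the way.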
2934 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2935                                                MachineIRBuilder &B,
2936                                                Intrinsic::ID IID) const {
2937   B.setInstr(MI);
2938 
2939   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2940                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2941 
2942   Register Dst = MI.getOperand(0).getReg();
2943   Register VData = MI.getOperand(2).getReg();
2944 
2945   Register CmpVal;
2946   int OpOffset = 0;
2947 
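  // The compare-and-swap variants carry an extra compare value, which shifts
  // the indices of all following operands by one.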
2948   if (IsCmpSwap) {
2949     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2950     ++OpOffset;
2951   }
2952 
2953   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2954   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2955 
2956   // The struct intrinsic variants add one additional operand over raw.
2957   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2958   Register VIndex;
2959   if (HasVIndex) {
2960     VIndex = MI.getOperand(4 + OpOffset).getReg();
2961     ++OpOffset;
2962   }
2963 
2964   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2965   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2966   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2967 
2968   MachineMemOperand *MMO = *MI.memoperands_begin();
2969 
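  // Split the offset into the portion that fits in the instruction's immediate
  // offset field and the portion that must stay in the voffset register; any
  // constant offset that was peeled off is also applied to the memory operand.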
2970   unsigned ImmOffset;
2971   unsigned TotalOffset;
2972   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2973   if (TotalOffset != 0)
2974     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2975 
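  // The pseudo always takes a vindex operand; use a zero constant for the raw
  // (non-indexed) forms.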
2976   if (!VIndex)
2977     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2978 
2979   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2980     .addDef(Dst)
2981     .addUse(VData); // vdata
2982 
2983   if (IsCmpSwap)
    MIB.addReg(CmpVal); // cmp
2985 
2986   MIB.addUse(RSrc)               // rsrc
2987      .addUse(VIndex)             // vindex
2988      .addUse(VOffset)            // voffset
2989      .addUse(SOffset)            // soffset
2990      .addImm(ImmOffset)          // offset(imm)
2991      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2992      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2993      .addMemOperand(MMO);
2994 
2995   MI.eraseFromParent();
2996   return true;
2997 }
2998 
/// Produce a vector of s16 elements from s32 pieces.
3000 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3001                              ArrayRef<Register> UnmergeParts) {
3002   const LLT S16 = LLT::scalar(16);
3003 
3004   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3005   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3006     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3007 
3008   B.buildBuildVector(DstReg, RemergeParts);
3009 }
3010 
3011 /// Convert a set of s32 registers to a result vector with s16 elements.
3012 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3013                                ArrayRef<Register> UnmergeParts) {
3014   MachineRegisterInfo &MRI = *B.getMRI();
3015   const LLT V2S16 = LLT::vector(2, 16);
3016   LLT TargetTy = MRI.getType(DstReg);
3017   int NumElts = UnmergeParts.size();
3018 
3019   if (NumElts == 1) {
3020     assert(TargetTy == V2S16);
3021     B.buildBitcast(DstReg, UnmergeParts[0]);
3022     return;
3023   }
3024 
3025   SmallVector<Register, 4> RemergeParts(NumElts);
3026   for (int I = 0; I != NumElts; ++I)
3027     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3028 
3029   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3030     B.buildConcatVectors(DstReg, RemergeParts);
3031     return;
3032   }
3033 
3034   const LLT V3S16 = LLT::vector(3, 16);
3035   const LLT V6S16 = LLT::vector(6, 16);
3036 
3037   // Widen to v6s16 and unpack v3 parts.
3038   assert(TargetTy == V3S16);
3039 
3040   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3041   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3042   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3043 }
3044 
// Repack a load result that was widened to s32 elements back into the
// original vector of s16 elements.
// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
3047 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3048                                   Register WideDstReg) {
3049   const LLT S32 = LLT::scalar(32);
3050   const LLT S16 = LLT::scalar(16);
3051 
3052   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3053 
3054   int NumOps = Unmerge->getNumOperands() - 1;
3055   SmallVector<Register, 4> RemergeParts(NumOps);
3056   for (int I = 0; I != NumOps; ++I)
3057     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3058 
3059   B.buildBuildVector(DstReg, RemergeParts);
3060 }
3061 
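// Fix up image intrinsics whose d16 results use the unpacked register layout,
// and repack TFE results, which occupy one extra dword beyond the loaded data.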
3062 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3063     MachineInstr &MI, MachineIRBuilder &B,
3064     GISelChangeObserver &Observer,
3065     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3066   bool IsTFE = MI.getNumExplicitDefs() == 2;
3067 
  // We only need to intervene for d16 image operations on subtargets that use
  // the unpacked register layout, and for operations that need their TFE
  // result repacked.
3070 
3071   // TODO: Need to handle a16 images too
3072   // TODO: Do we need to guard against already legalized intrinsics?
3073   if (!IsTFE && !ST.hasUnpackedD16VMem())
3074     return true;
3075 
3076   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3077     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3078 
3079   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3080     return true;
3081 
3082   B.setInstr(MI);
3083 
3084   MachineRegisterInfo *MRI = B.getMRI();
3085   const LLT S32 = LLT::scalar(32);
3086   const LLT S16 = LLT::scalar(16);
3087 
3088   if (BaseOpcode->Store) { // No TFE for stores?
3089     Register VData = MI.getOperand(1).getReg();
3090     LLT Ty = MRI->getType(VData);
3091     if (!Ty.isVector() || Ty.getElementType() != S16)
3092       return true;
3093 
3094     B.setInstr(MI);
3095 
3096     Observer.changingInstr(MI);
3097     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3098     Observer.changedInstr(MI);
3099     return true;
3100   }
3101 
3102   Register DstReg = MI.getOperand(0).getReg();
3103   LLT Ty = MRI->getType(DstReg);
3104   const LLT EltTy = Ty.getScalarType();
3105   const bool IsD16 = Ty.getScalarType() == S16;
3106   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3107 
3108   if (IsTFE) {
    // In the IR, TFE is supposed to be used with a two-element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
3113     Register Dst1Reg = MI.getOperand(1).getReg();
3114     if (MRI->getType(Dst1Reg) != S32)
3115       return false;
3116 
3117     // TODO: Make sure the TFE operand bit is set.
3118 
    // The raw, dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3122     LLT RoundedTy;
3123     LLT TFETy;
3124 
3125     if (IsD16 && ST.hasUnpackedD16VMem()) {
3126       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3127       TFETy = LLT::vector(NumElts + 1, 32);
3128     } else {
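      // Round the data size up to a whole number of dwords, and add one more
      // dword for the TFE status value.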
3129       unsigned EltSize = Ty.getScalarSizeInBits();
3130       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3131       unsigned RoundedSize = 32 * RoundedElts;
3132       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3133       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3134     }
3135 
3136     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3137     Observer.changingInstr(MI);
3138 
3139     MI.getOperand(0).setReg(TFEReg);
3140     MI.RemoveOperand(1);
3141 
3142     Observer.changedInstr(MI);
3143 
3144     // Insert after the instruction.
3145     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3146 
3147     // Now figure out how to copy the new result register back into the old
3148     // result.
3149 
3150     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3151     int NumDataElts = TFETy.getNumElements() - 1;
3152 
3153     if (!Ty.isVector()) {
3154       // Simplest case is a trivial unmerge (plus a truncate for d16).
3155       UnmergeResults[0] = Ty == S32 ?
3156         DstReg : MRI->createGenericVirtualRegister(S32);
3157 
3158       B.buildUnmerge(UnmergeResults, TFEReg);
3159       if (Ty != S32)
3160         B.buildTrunc(DstReg, UnmergeResults[0]);
3161       return true;
3162     }
3163 
3164     // We have to repack into a new vector of some kind.
3165     for (int I = 0; I != NumDataElts; ++I)
3166       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3167     B.buildUnmerge(UnmergeResults, TFEReg);
3168 
3169     // Drop the final TFE element.
3170     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3171 
3172     if (EltTy == S32)
3173       B.buildBuildVector(DstReg, DataPart);
3174     else if (ST.hasUnpackedD16VMem())
3175       truncToS16Vector(B, DstReg, DataPart);
3176     else
3177       bitcastToS16Vector(B, DstReg, DataPart);
3178 
3179     return true;
3180   }
3181 
  // Must be an image load. There is nothing to do unless it returns a vector
  // of d16 values that needs repacking.
3183   if (!Ty.isVector() || Ty.getElementType() != S16)
3184     return true;
3185 
3186   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3187 
3188   LLT WidenedTy = Ty.changeElementType(S32);
3189   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3190 
3191   Observer.changingInstr(MI);
3192   MI.getOperand(0).setReg(WideDstReg);
3193   Observer.changedInstr(MI);
3194 
3195   repackUnpackedD16Load(B, DstReg, WideDstReg);
3196   return true;
3197 }
3198 
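// Turn llvm.amdgcn.s.buffer.load into G_AMDGPU_S_BUFFER_LOAD, attach a memory
// operand, and widen non-power-of-2 sized results to the next power of two.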
3199 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3200   MachineInstr &MI, MachineIRBuilder &B,
3201   GISelChangeObserver &Observer) const {
3202   Register Dst = MI.getOperand(0).getReg();
3203   LLT Ty = B.getMRI()->getType(Dst);
3204   unsigned Size = Ty.getSizeInBits();
3205   MachineFunction &MF = B.getMF();
3206 
3207   Observer.changingInstr(MI);
3208 
3209   // FIXME: We don't really need this intermediate instruction. The intrinsic
3210   // should be fixed to have a memory operand. Since it's readnone, we're not
3211   // allowed to add one.
3212   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3213   MI.RemoveOperand(1); // Remove intrinsic ID
3214 
3215   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3216   // TODO: Should this use datalayout alignment?
3217   const unsigned MemSize = (Size + 7) / 8;
3218   const unsigned MemAlign = 4;
3219   MachineMemOperand *MMO = MF.getMachineMemOperand(
3220     MachinePointerInfo(),
3221     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3222     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3223   MI.addMemOperand(MF, MMO);
3224 
3225   // There are no 96-bit result scalar loads, but widening to 128-bit should
3226   // always be legal. We may need to restore this to a 96-bit result if it turns
3227   // out this needs to be converted to a vector load during RegBankSelect.
3228   if (!isPowerOf2_32(Size)) {
3229     LegalizerHelper Helper(MF, *this, Observer, B);
3230     B.setInstr(MI);
3231 
3232     if (Ty.isVector())
3233       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3234     else
3235       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3236   }
3237 
3238   Observer.changedInstr(MI);
3239   return true;
3240 }
3241 
3242 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3243                                             MachineIRBuilder &B,
3244                                             GISelChangeObserver &Observer) const {
3245   MachineRegisterInfo &MRI = *B.getMRI();
3246 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
3248   auto IntrID = MI.getIntrinsicID();
3249   switch (IntrID) {
3250   case Intrinsic::amdgcn_if:
3251   case Intrinsic::amdgcn_else: {
3252     MachineInstr *Br = nullptr;
3253     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3254       const SIRegisterInfo *TRI
3255         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3256 
3257       B.setInstr(*BrCond);
3258       Register Def = MI.getOperand(1).getReg();
3259       Register Use = MI.getOperand(3).getReg();
3260 
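      // If an unconditional G_BR follows the G_BRCOND, the new pseudo branches
      // to that block instead, and the G_BR is retargeted below so that the
      // two destinations are effectively swapped.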
3261       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3262       if (Br)
3263         BrTarget = Br->getOperand(0).getMBB();
3264 
3265       if (IntrID == Intrinsic::amdgcn_if) {
3266         B.buildInstr(AMDGPU::SI_IF)
3267           .addDef(Def)
3268           .addUse(Use)
3269           .addMBB(BrTarget);
3270       } else {
3271         B.buildInstr(AMDGPU::SI_ELSE)
3272           .addDef(Def)
3273           .addUse(Use)
3274           .addMBB(BrTarget)
3275           .addImm(0);
3276       }
3277 
3278       if (Br)
3279         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3280 
3281       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3282       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3283       MI.eraseFromParent();
3284       BrCond->eraseFromParent();
3285       return true;
3286     }
3287 
3288     return false;
3289   }
3290   case Intrinsic::amdgcn_loop: {
3291     MachineInstr *Br = nullptr;
3292     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3293       const SIRegisterInfo *TRI
3294         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3295 
3296       B.setInstr(*BrCond);
3297 
3298       // FIXME: Need to adjust branch targets based on unconditional branch.
3299       Register Reg = MI.getOperand(2).getReg();
3300       B.buildInstr(AMDGPU::SI_LOOP)
3301         .addUse(Reg)
3302         .addMBB(BrCond->getOperand(1).getMBB());
3303       MI.eraseFromParent();
3304       BrCond->eraseFromParent();
3305       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3306       return true;
3307     }
3308 
3309     return false;
3310   }
3311   case Intrinsic::amdgcn_kernarg_segment_ptr:
3312     return legalizePreloadedArgIntrin(
3313       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3314   case Intrinsic::amdgcn_implicitarg_ptr:
3315     return legalizeImplicitArgPtr(MI, MRI, B);
3316   case Intrinsic::amdgcn_workitem_id_x:
3317     return legalizePreloadedArgIntrin(MI, MRI, B,
3318                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3319   case Intrinsic::amdgcn_workitem_id_y:
3320     return legalizePreloadedArgIntrin(MI, MRI, B,
3321                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3322   case Intrinsic::amdgcn_workitem_id_z:
3323     return legalizePreloadedArgIntrin(MI, MRI, B,
3324                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3325   case Intrinsic::amdgcn_workgroup_id_x:
3326     return legalizePreloadedArgIntrin(MI, MRI, B,
3327                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3328   case Intrinsic::amdgcn_workgroup_id_y:
3329     return legalizePreloadedArgIntrin(MI, MRI, B,
3330                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3331   case Intrinsic::amdgcn_workgroup_id_z:
3332     return legalizePreloadedArgIntrin(MI, MRI, B,
3333                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3334   case Intrinsic::amdgcn_dispatch_ptr:
3335     return legalizePreloadedArgIntrin(MI, MRI, B,
3336                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3337   case Intrinsic::amdgcn_queue_ptr:
3338     return legalizePreloadedArgIntrin(MI, MRI, B,
3339                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3340   case Intrinsic::amdgcn_implicit_buffer_ptr:
3341     return legalizePreloadedArgIntrin(
3342       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3343   case Intrinsic::amdgcn_dispatch_id:
3344     return legalizePreloadedArgIntrin(MI, MRI, B,
3345                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3346   case Intrinsic::amdgcn_fdiv_fast:
3347     return legalizeFDIVFastIntrin(MI, MRI, B);
3348   case Intrinsic::amdgcn_is_shared:
3349     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3350   case Intrinsic::amdgcn_is_private:
3351     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3352   case Intrinsic::amdgcn_wavefrontsize: {
3353     B.setInstr(MI);
3354     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3355     MI.eraseFromParent();
3356     return true;
3357   }
3358   case Intrinsic::amdgcn_s_buffer_load:
3359     return legalizeSBufferLoad(MI, B, Observer);
3360   case Intrinsic::amdgcn_raw_buffer_store:
3361   case Intrinsic::amdgcn_struct_buffer_store:
3362     return legalizeBufferStore(MI, MRI, B, false, false);
3363   case Intrinsic::amdgcn_raw_buffer_store_format:
3364   case Intrinsic::amdgcn_struct_buffer_store_format:
3365     return legalizeBufferStore(MI, MRI, B, false, true);
3366   case Intrinsic::amdgcn_raw_tbuffer_store:
3367   case Intrinsic::amdgcn_struct_tbuffer_store:
3368     return legalizeBufferStore(MI, MRI, B, true, true);
3369   case Intrinsic::amdgcn_raw_buffer_load:
3370   case Intrinsic::amdgcn_struct_buffer_load:
3371     return legalizeBufferLoad(MI, MRI, B, false, false);
3372   case Intrinsic::amdgcn_raw_buffer_load_format:
3373   case Intrinsic::amdgcn_struct_buffer_load_format:
3374     return legalizeBufferLoad(MI, MRI, B, true, false);
3375   case Intrinsic::amdgcn_raw_tbuffer_load:
3376   case Intrinsic::amdgcn_struct_tbuffer_load:
3377     return legalizeBufferLoad(MI, MRI, B, true, true);
3378   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3379   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3380   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3381   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3382   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3383   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3384   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3385   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3386   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3387   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3388   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3389   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3390   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3391   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3392   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3393   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3394   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3395   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3396   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3397   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3398   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3399   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3400   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3401   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3402   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3403   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3404     return legalizeBufferAtomic(MI, B, IntrID);
3405   case Intrinsic::amdgcn_atomic_inc:
3406     return legalizeAtomicIncDec(MI, B, true);
3407   case Intrinsic::amdgcn_atomic_dec:
3408     return legalizeAtomicIncDec(MI, B, false);
3409   default: {
3410     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3411             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3412       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3413     return true;
3414   }
3415   }
3416 
3417   return true;
3418 }
3419