1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of elements to the next power of two.
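// e.g. <3 x s16> becomes <4 x s16>; the element type is unchanged.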
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
// Round the scalar size to the next power-of-two number of bits.
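// e.g. s48 becomes s64 and s65 becomes s128.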
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
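// True for vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16> (48 bits) or <5 x s8> (40 bits).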
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
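// Compute the piece type used to break a wide vector into roughly 64-bit
// chunks, e.g. <4 x s32> (128 bits) maps to <2 x s32>.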
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
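// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits), and <3 x s16>
// (48 bits) to <4 x s16> (64 bits).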
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
  : ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.hasVOP3PInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16, V2S16})
299       .clampScalar(0, S16, S32)
300       .clampMaxNumElements(0, S16, 2)
301       .scalarize(0)
302       .widenScalarToNextPow2(0, 32);
303   } else if (ST.has16BitInsts()) {
304     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
305       .legalFor({S32, S16})
306       .clampScalar(0, S16, S32)
307       .scalarize(0)
308       .widenScalarToNextPow2(0, 32);
309   } else {
310     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
311       .legalFor({S32})
312       .clampScalar(0, S32, S32)
313       .scalarize(0);
314   }
315 
316   // FIXME: Not really legal. Placeholder for custom lowering.
317   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
318     .customFor({S32, S64})
319     .clampScalar(0, S32, S64)
320     .widenScalarToNextPow2(0, 32)
321     .scalarize(0);
322 
323   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
324     .legalFor({S32})
325     .clampScalar(0, S32, S32)
326     .scalarize(0);
327 
328   // Report legal for any types we can handle anywhere. For the cases only legal
329   // on the SALU, RegBankSelect will be able to re-legalize.
330   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
331     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
332     .clampScalar(0, S32, S64)
333     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
334     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
335     .widenScalarToNextPow2(0)
336     .scalarize(0);
337 
338   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
339                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
340     .legalFor({{S32, S1}, {S32, S32}})
341     .minScalar(0, S32)
342     // TODO: .scalarize(0)
343     .lower();
344 
345   getActionDefinitionsBuilder(G_BITCAST)
346     // Don't worry about the size constraint.
347     .legalIf(all(isRegisterType(0), isRegisterType(1)))
348     .lower();
349 
350 
351   getActionDefinitionsBuilder(G_CONSTANT)
352     .legalFor({S1, S32, S64, S16, GlobalPtr,
353                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
354     .clampScalar(0, S32, S64)
355     .widenScalarToNextPow2(0)
356     .legalIf(isPointer(0));
357 
358   getActionDefinitionsBuilder(G_FCONSTANT)
359     .legalFor({S32, S64, S16})
360     .clampScalar(0, S16, S64);
361 
362   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
363     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
364                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
365     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366     .clampScalarOrElt(0, S32, S1024)
367     .legalIf(isMultiple32(0))
368     .widenScalarToNextPow2(0, 32)
369     .clampMaxNumElements(0, S32, 16);
370 
371   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
372   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
373     .unsupportedFor({PrivatePtr})
374     .custom();
375   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
376 
377   auto &FPOpActions = getActionDefinitionsBuilder(
378     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
379     .legalFor({S32, S64});
380   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
381     .customFor({S32, S64});
382   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
383     .customFor({S32, S64});
384 
385   if (ST.has16BitInsts()) {
386     if (ST.hasVOP3PInsts())
387       FPOpActions.legalFor({S16, V2S16});
388     else
389       FPOpActions.legalFor({S16});
390 
391     TrigActions.customFor({S16});
392     FDIVActions.customFor({S16});
393   }
394 
395   auto &MinNumMaxNum = getActionDefinitionsBuilder({
396       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
397 
398   if (ST.hasVOP3PInsts()) {
399     MinNumMaxNum.customFor(FPTypesPK16)
400       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
401       .clampMaxNumElements(0, S16, 2)
402       .clampScalar(0, S16, S64)
403       .scalarize(0);
404   } else if (ST.has16BitInsts()) {
405     MinNumMaxNum.customFor(FPTypes16)
406       .clampScalar(0, S16, S64)
407       .scalarize(0);
408   } else {
409     MinNumMaxNum.customFor(FPTypesBase)
410       .clampScalar(0, S32, S64)
411       .scalarize(0);
412   }
413 
414   if (ST.hasVOP3PInsts())
415     FPOpActions.clampMaxNumElements(0, S16, 2);
416 
417   FPOpActions
418     .scalarize(0)
419     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420 
421   TrigActions
422     .scalarize(0)
423     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
424 
425   FDIVActions
426     .scalarize(0)
427     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
428 
429   getActionDefinitionsBuilder({G_FNEG, G_FABS})
430     .legalFor(FPTypesPK16)
431     .clampMaxNumElements(0, S16, 2)
432     .scalarize(0)
433     .clampScalar(0, S16, S64);
434 
435   if (ST.has16BitInsts()) {
436     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
437       .legalFor({S32, S64, S16})
438       .scalarize(0)
439       .clampScalar(0, S16, S64);
440   } else {
441     getActionDefinitionsBuilder(G_FSQRT)
442       .legalFor({S32, S64})
443       .scalarize(0)
444       .clampScalar(0, S32, S64);
445 
446     if (ST.hasFractBug()) {
447       getActionDefinitionsBuilder(G_FFLOOR)
448         .customFor({S64})
449         .legalFor({S32, S64})
450         .scalarize(0)
451         .clampScalar(0, S32, S64);
452     } else {
453       getActionDefinitionsBuilder(G_FFLOOR)
454         .legalFor({S32, S64})
455         .scalarize(0)
456         .clampScalar(0, S32, S64);
457     }
458   }
459 
460   getActionDefinitionsBuilder(G_FPTRUNC)
461     .legalFor({{S32, S64}, {S16, S32}})
462     .scalarize(0)
463     .lower();
464 
465   getActionDefinitionsBuilder(G_FPEXT)
466     .legalFor({{S64, S32}, {S32, S16}})
467     .lowerFor({{S64, S16}}) // FIXME: Implement
468     .scalarize(0);
469 
470   getActionDefinitionsBuilder(G_FSUB)
471       // Use actual fsub instruction
472       .legalFor({S32})
473       // Must use fadd + fneg
474       .lowerFor({S64, S16, V2S16})
475       .scalarize(0)
476       .clampScalar(0, S32, S64);
477 
478   // Whether this is legal depends on the floating point mode for the function.
479   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
480   if (ST.hasMadF16())
481     FMad.customFor({S32, S16});
482   else
483     FMad.customFor({S32});
484   FMad.scalarize(0)
485       .lower();
486 
487   getActionDefinitionsBuilder(G_TRUNC)
488     .alwaysLegal();
489 
490   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
491     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
492                {S32, S1}, {S64, S1}, {S16, S1}})
493     .scalarize(0)
494     .clampScalar(0, S32, S64)
495     .widenScalarToNextPow2(1, 32);
496 
497   // TODO: Split s1->s64 during regbankselect for VALU.
498   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
499     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
500     .lowerFor({{S32, S64}})
501     .lowerIf(typeIs(1, S1))
502     .customFor({{S64, S64}});
503   if (ST.has16BitInsts())
504     IToFP.legalFor({{S16, S16}});
505   IToFP.clampScalar(1, S32, S64)
506        .scalarize(0)
507        .widenScalarToNextPow2(1);
508 
509   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
510     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
511     .customFor({{S64, S64}});
512   if (ST.has16BitInsts())
513     FPToI.legalFor({{S16, S16}});
514   else
515     FPToI.minScalar(1, S32);
516 
517   FPToI.minScalar(0, S32)
518        .scalarize(0)
519        .lower();
520 
521   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
522     .scalarize(0)
523     .lower();
524 
525   if (ST.has16BitInsts()) {
526     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
527       .legalFor({S16, S32, S64})
528       .clampScalar(0, S16, S64)
529       .scalarize(0);
530   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
531     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
532       .legalFor({S32, S64})
533       .clampScalar(0, S32, S64)
534       .scalarize(0);
535   } else {
536     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
537       .legalFor({S32})
538       .customFor({S64})
539       .clampScalar(0, S32, S64)
540       .scalarize(0);
541   }
542 
543   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
544     .scalarize(0)
545     .alwaysLegal();
546 
547   auto &CmpBuilder =
548     getActionDefinitionsBuilder(G_ICMP)
549     // The compare output type differs based on the register bank of the output,
550     // so make both s1 and s32 legal.
551     //
552     // Scalar compares producing output in scc will be promoted to s32, as that
553     // is the allocatable register type that will be needed for the copy from
554     // scc. This will be promoted during RegBankSelect, and we assume something
555     // before that won't try to use s32 result types.
556     //
557     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
558     // bank.
559     .legalForCartesianProduct(
560       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
561     .legalForCartesianProduct(
562       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
563   if (ST.has16BitInsts()) {
564     CmpBuilder.legalFor({{S1, S16}});
565   }
566 
567   CmpBuilder
568     .widenScalarToNextPow2(1)
569     .clampScalar(1, S32, S64)
570     .scalarize(0)
571     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
572 
573   getActionDefinitionsBuilder(G_FCMP)
574     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
575     .widenScalarToNextPow2(1)
576     .clampScalar(1, S32, S64)
577     .scalarize(0);
578 
579   // FIXME: fpow has a selection pattern that should move to custom lowering.
580   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
581   if (ST.has16BitInsts())
582     Exp2Ops.legalFor({S32, S16});
583   else
584     Exp2Ops.legalFor({S32});
585   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
586   Exp2Ops.scalarize(0);
587 
588   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
589   if (ST.has16BitInsts())
590     ExpOps.customFor({{S32}, {S16}});
591   else
592     ExpOps.customFor({S32});
593   ExpOps.clampScalar(0, MinScalarFPTy, S32)
594         .scalarize(0);
595 
596   // The 64-bit versions produce 32-bit results, but only on the SALU.
597   getActionDefinitionsBuilder(G_CTPOP)
598     .legalFor({{S32, S32}, {S32, S64}})
599     .clampScalar(0, S32, S32)
600     .clampScalar(1, S32, S64)
601     .scalarize(0)
602     .widenScalarToNextPow2(0, 32)
603     .widenScalarToNextPow2(1, 32);
604 
605   // The hardware instructions return a different result on 0 than the generic
606   // instructions expect. The hardware produces -1, but these produce the
607   // bitwidth.
608   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
609     .scalarize(0)
610     .clampScalar(0, S32, S32)
611     .clampScalar(1, S32, S64)
612     .widenScalarToNextPow2(0, 32)
613     .widenScalarToNextPow2(1, 32)
614     .lower();
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   getActionDefinitionsBuilder(G_BITREVERSE)
626     .legalFor({S32})
627     .clampScalar(0, S32, S32)
628     .scalarize(0);
629 
630   if (ST.has16BitInsts()) {
631     getActionDefinitionsBuilder(G_BSWAP)
632       .legalFor({S16, S32, V2S16})
633       .clampMaxNumElements(0, S16, 2)
634       // FIXME: Fixing non-power-of-2 before clamp is workaround for
635       // narrowScalar limitation.
636       .widenScalarToNextPow2(0)
637       .clampScalar(0, S16, S32)
638       .scalarize(0);
639 
640     if (ST.hasVOP3PInsts()) {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16, V2S16})
643         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644         .clampMaxNumElements(0, S16, 2)
645         .minScalar(0, S16)
646         .widenScalarToNextPow2(0)
647         .scalarize(0)
648         .lower();
649     } else {
650       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
651         .legalFor({S32, S16})
652         .widenScalarToNextPow2(0)
653         .minScalar(0, S16)
654         .scalarize(0)
655         .lower();
656     }
657   } else {
658     // TODO: Should have same legality without v_perm_b32
659     getActionDefinitionsBuilder(G_BSWAP)
660       .legalFor({S32})
661       .lowerIf(narrowerThan(0, 32))
662       // FIXME: Fixing non-power-of-2 before clamp is workaround for
663       // narrowScalar limitation.
664       .widenScalarToNextPow2(0)
665       .maxScalar(0, S32)
666       .scalarize(0)
667       .lower();
668 
669     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670       .legalFor({S32})
671       .minScalar(0, S32)
672       .widenScalarToNextPow2(0)
673       .scalarize(0)
674       .lower();
675   }
676 
677   getActionDefinitionsBuilder(G_INTTOPTR)
678     // List the common cases
679     .legalForCartesianProduct(AddrSpaces64, {S64})
680     .legalForCartesianProduct(AddrSpaces32, {S32})
681     .scalarize(0)
682     // Accept any address space as long as the size matches
683     .legalIf(sameSize(0, 1))
684     .widenScalarIf(smallerThan(1, 0),
685       [](const LegalityQuery &Query) {
686         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
687       })
688     .narrowScalarIf(greaterThan(1, 0),
689       [](const LegalityQuery &Query) {
690         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
691       });
692 
693   getActionDefinitionsBuilder(G_PTRTOINT)
694     // List the common cases
695     .legalForCartesianProduct(AddrSpaces64, {S64})
696     .legalForCartesianProduct(AddrSpaces32, {S32})
697     .scalarize(0)
698     // Accept any address space as long as the size matches
699     .legalIf(sameSize(0, 1))
700     .widenScalarIf(smallerThan(0, 1),
701       [](const LegalityQuery &Query) {
702         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
703       })
704     .narrowScalarIf(
705       greaterThan(0, 1),
706       [](const LegalityQuery &Query) {
707         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
708       });
709 
710   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
711     .scalarize(0)
712     .custom();
713 
714   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
715   // handle some operations by just promoting the register during
716   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
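
  // Maximum size (in bits) a single load or store may access in the given
  // address space; wider accesses are split up by the rules that follow.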
717   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
718     switch (AS) {
719     // FIXME: Private element size.
720     case AMDGPUAS::PRIVATE_ADDRESS:
721       return 32;
722     // FIXME: Check subtarget
723     case AMDGPUAS::LOCAL_ADDRESS:
724       return ST.useDS128() ? 128 : 64;
725 
726     // Treat constant and global as identical. SMRD loads are sometimes usable
727     // for global loads (ideally constant address space should be eliminated)
728     // depending on the context. Legality cannot be context dependent, but
729     // RegBankSelect can split the load as necessary depending on the pointer
730     // register bank/uniformity and if the memory is invariant or not written in
731     // a kernel.
732     case AMDGPUAS::CONSTANT_ADDRESS:
733     case AMDGPUAS::GLOBAL_ADDRESS:
734       return IsLoad ? 512 : 128;
735     default:
736       return 128;
737     }
738   };
739 
740   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
741                                     bool IsLoad) -> bool {
742     const LLT DstTy = Query.Types[0];
743 
744     // Split vector extloads.
745     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
746     unsigned Align = Query.MMODescrs[0].AlignInBits;
747 
748     if (MemSize < DstTy.getSizeInBits())
749       MemSize = std::max(MemSize, Align);
750 
751     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
752       return true;
753 
754     const LLT PtrTy = Query.Types[1];
755     unsigned AS = PtrTy.getAddressSpace();
756     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
757       return true;
758 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
760     // TODO: May be able to widen depending on alignment etc.
761     unsigned NumRegs = (MemSize + 31) / 32;
762     if (NumRegs == 3) {
763       if (!ST.hasDwordx3LoadStores())
764         return true;
765     } else {
766       // If the alignment allows, these should have been widened.
767       if (!isPowerOf2_32(NumRegs))
768         return true;
769     }
770 
771     if (Align < MemSize) {
772       const SITargetLowering *TLI = ST.getTargetLowering();
773       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
774     }
775 
776     return false;
777   };
778 
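  // Decide whether a non-power-of-2 sized load result should be widened to the
  // next power of 2, which is only done when the alignment covers the widened
  // size and the original size is below the address space's maximum access
  // size (96-bit loads are kept as-is when dwordx3 load/store is available).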
779   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
780     unsigned Size = Query.Types[0].getSizeInBits();
781     if (isPowerOf2_32(Size))
782       return false;
783 
784     if (Size == 96 && ST.hasDwordx3LoadStores())
785       return false;
786 
787     unsigned AddrSpace = Query.Types[1].getAddressSpace();
788     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
789       return false;
790 
791     unsigned Align = Query.MMODescrs[0].AlignInBits;
792     unsigned RoundedSize = NextPowerOf2(Size);
793     return (Align >= RoundedSize);
794   };
795 
796   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
797   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
798   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
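  // An alignment requirement of 0 in the memory descriptors below means no
  // minimum alignment is enforced for that case.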
799 
800   // TODO: Refine based on subtargets which support unaligned access or 128-bit
801   // LDS
802   // TODO: Unsupported flat for SI.
803 
804   for (unsigned Op : {G_LOAD, G_STORE}) {
805     const bool IsStore = Op == G_STORE;
806 
807     auto &Actions = getActionDefinitionsBuilder(Op);
808     // Whitelist the common cases.
809     // TODO: Loads to s16 on gfx9
810     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
811                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
812                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
813                                       {S128, GlobalPtr, 128, GlobalAlign32},
814                                       {S64, GlobalPtr, 64, GlobalAlign32},
815                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
816                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
817                                       {S32, GlobalPtr, 8, GlobalAlign8},
818                                       {S32, GlobalPtr, 16, GlobalAlign16},
819 
820                                       {S32, LocalPtr, 32, 32},
821                                       {S64, LocalPtr, 64, 32},
822                                       {V2S32, LocalPtr, 64, 32},
823                                       {S32, LocalPtr, 8, 8},
824                                       {S32, LocalPtr, 16, 16},
825                                       {V2S16, LocalPtr, 32, 32},
826 
827                                       {S32, PrivatePtr, 32, 32},
828                                       {S32, PrivatePtr, 8, 8},
829                                       {S32, PrivatePtr, 16, 16},
830                                       {V2S16, PrivatePtr, 32, 32},
831 
832                                       {S32, FlatPtr, 32, GlobalAlign32},
833                                       {S32, FlatPtr, 16, GlobalAlign16},
834                                       {S32, FlatPtr, 8, GlobalAlign8},
835                                       {V2S16, FlatPtr, 32, GlobalAlign32},
836 
837                                       {S32, ConstantPtr, 32, GlobalAlign32},
838                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
839                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
840                                       {S64, ConstantPtr, 64, GlobalAlign32},
841                                       {S128, ConstantPtr, 128, GlobalAlign32},
842                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
843     Actions
844         .customIf(typeIs(1, Constant32Ptr))
845         // Widen suitably aligned loads by loading extra elements.
846         .moreElementsIf([=](const LegalityQuery &Query) {
847             const LLT Ty = Query.Types[0];
848             return Op == G_LOAD && Ty.isVector() &&
849                    shouldWidenLoadResult(Query);
850           }, moreElementsToNextPow2(0))
851         .widenScalarIf([=](const LegalityQuery &Query) {
852             const LLT Ty = Query.Types[0];
853             return Op == G_LOAD && !Ty.isVector() &&
854                    shouldWidenLoadResult(Query);
855           }, widenScalarOrEltToNextPow2(0))
856         .narrowScalarIf(
857             [=](const LegalityQuery &Query) -> bool {
858               return !Query.Types[0].isVector() &&
859                      needToSplitMemOp(Query, Op == G_LOAD);
860             },
861             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
862               const LLT DstTy = Query.Types[0];
863               const LLT PtrTy = Query.Types[1];
864 
865               const unsigned DstSize = DstTy.getSizeInBits();
866               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
867 
868               // Split extloads.
869               if (DstSize > MemSize)
870                 return std::make_pair(0, LLT::scalar(MemSize));
871 
872               if (!isPowerOf2_32(DstSize)) {
873                 // We're probably decomposing an odd sized store. Try to split
874                 // to the widest type. TODO: Account for alignment. As-is it
875                 // should be OK, since the new parts will be further legalized.
876                 unsigned FloorSize = PowerOf2Floor(DstSize);
877                 return std::make_pair(0, LLT::scalar(FloorSize));
878               }
879 
880               if (DstSize > 32 && (DstSize % 32 != 0)) {
881                 // FIXME: Need a way to specify non-extload of larger size if
882                 // suitably aligned.
883                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
884               }
885 
886               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
887                                                      Op == G_LOAD);
888               if (MemSize > MaxSize)
889                 return std::make_pair(0, LLT::scalar(MaxSize));
890 
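              // Split under-aligned accesses into pieces of the known
              // alignment (in bits).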
891               unsigned Align = Query.MMODescrs[0].AlignInBits;
892               return std::make_pair(0, LLT::scalar(Align));
893             })
894         .fewerElementsIf(
895             [=](const LegalityQuery &Query) -> bool {
896               return Query.Types[0].isVector() &&
897                      needToSplitMemOp(Query, Op == G_LOAD);
898             },
899             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
900               const LLT DstTy = Query.Types[0];
901               const LLT PtrTy = Query.Types[1];
902 
903               LLT EltTy = DstTy.getElementType();
904               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905                                                      Op == G_LOAD);
906 
907               // FIXME: Handle widened to power of 2 results better. This ends
908               // up scalarizing.
909               // FIXME: 3 element stores scalarized on SI
910 
911               // Split if it's too large for the address space.
912               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
913                 unsigned NumElts = DstTy.getNumElements();
914                 unsigned EltSize = EltTy.getSizeInBits();
915 
916                 if (MaxSize % EltSize == 0) {
917                   return std::make_pair(
918                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
919                 }
920 
921                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
922 
923                 // FIXME: Refine when odd breakdowns handled
924                 // The scalars will need to be re-legalized.
925                 if (NumPieces == 1 || NumPieces >= NumElts ||
926                     NumElts % NumPieces != 0)
927                   return std::make_pair(0, EltTy);
928 
929                 return std::make_pair(0,
930                                       LLT::vector(NumElts / NumPieces, EltTy));
931               }
932 
933               // FIXME: We could probably handle weird extending loads better.
934               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
935               if (DstTy.getSizeInBits() > MemSize)
936                 return std::make_pair(0, EltTy);
937 
938               unsigned EltSize = EltTy.getSizeInBits();
939               unsigned DstSize = DstTy.getSizeInBits();
940               if (!isPowerOf2_32(DstSize)) {
941                 // We're probably decomposing an odd sized store. Try to split
942                 // to the widest type. TODO: Account for alignment. As-is it
943                 // should be OK, since the new parts will be further legalized.
944                 unsigned FloorSize = PowerOf2Floor(DstSize);
945                 return std::make_pair(
946                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
947               }
948 
949               // Need to split because of alignment.
950               unsigned Align = Query.MMODescrs[0].AlignInBits;
951               if (EltSize > Align &&
952                   (EltSize / Align < DstTy.getNumElements())) {
953                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
954               }
955 
956               // May need relegalization for the scalars.
957               return std::make_pair(0, EltTy);
958             })
959         .minScalar(0, S32);
960 
961     if (IsStore)
962       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
963 
964     // TODO: Need a bitcast lower option?
965     Actions
966         .legalIf([=](const LegalityQuery &Query) {
967           const LLT Ty0 = Query.Types[0];
968           unsigned Size = Ty0.getSizeInBits();
969           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
970           unsigned Align = Query.MMODescrs[0].AlignInBits;
971 
972           // FIXME: Widening store from alignment not valid.
973           if (MemSize < Size)
974             MemSize = std::max(MemSize, Align);
975 
976           // No extending vector loads.
977           if (Size > MemSize && Ty0.isVector())
978             return false;
979 
980           switch (MemSize) {
981           case 8:
982           case 16:
983             return Size == 32;
984           case 32:
985           case 64:
986           case 128:
987             return true;
988           case 96:
989             return ST.hasDwordx3LoadStores();
990           case 256:
991           case 512:
992             return true;
993           default:
994             return false;
995           }
996         })
997         .widenScalarToNextPow2(0)
998         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
999   }
1000 
1001   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1002                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1003                                                   {S32, GlobalPtr, 16, 2 * 8},
1004                                                   {S32, LocalPtr, 8, 8},
1005                                                   {S32, LocalPtr, 16, 16},
1006                                                   {S32, PrivatePtr, 8, 8},
1007                                                   {S32, PrivatePtr, 16, 16},
1008                                                   {S32, ConstantPtr, 8, 8},
1009                                                   {S32, ConstantPtr, 16, 2 * 8}});
1010   if (ST.hasFlatAddressSpace()) {
1011     ExtLoads.legalForTypesWithMemDesc(
1012         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1013   }
1014 
1015   ExtLoads.clampScalar(0, S32, S32)
1016           .widenScalarToNextPow2(0)
1017           .unsupportedIfMemSizeNotPow2()
1018           .lower();
1019 
1020   auto &Atomics = getActionDefinitionsBuilder(
1021     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1022      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1023      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1024      G_ATOMICRMW_UMIN})
1025     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1026                {S64, GlobalPtr}, {S64, LocalPtr}});
1027   if (ST.hasFlatAddressSpace()) {
1028     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1029   }
1030 
1031   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1032     .legalFor({{S32, LocalPtr}});
1033 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1036   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1037     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1038                 {S32, FlatPtr}, {S64, FlatPtr}})
1039     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1040                {S32, RegionPtr}, {S64, RegionPtr}});
1041   // TODO: Pointer types, any 32-bit or 64-bit vector
1042 
1043   // Condition should be s32 for scalar, s1 for vector.
1044   getActionDefinitionsBuilder(G_SELECT)
1045     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1046           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1047           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1048     .clampScalar(0, S16, S64)
1049     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1050     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1051     .scalarize(1)
1052     .clampMaxNumElements(0, S32, 2)
1053     .clampMaxNumElements(0, LocalPtr, 2)
1054     .clampMaxNumElements(0, PrivatePtr, 2)
1055     .scalarize(0)
1056     .widenScalarToNextPow2(0)
1057     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1058 
1059   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1060   // be more flexible with the shift amount type.
1061   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1062     .legalFor({{S32, S32}, {S64, S32}});
1063   if (ST.has16BitInsts()) {
1064     if (ST.hasVOP3PInsts()) {
1065       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1066             .clampMaxNumElements(0, S16, 2);
1067     } else
1068       Shifts.legalFor({{S16, S32}, {S16, S16}});
1069 
1070     // TODO: Support 16-bit shift amounts
1071     Shifts.clampScalar(1, S32, S32);
1072     Shifts.clampScalar(0, S16, S64);
1073     Shifts.widenScalarToNextPow2(0, 16);
1074   } else {
1075     // Make sure we legalize the shift amount type first, as the general
1076     // expansion for the shifted type will produce much worse code if it hasn't
1077     // been truncated already.
1078     Shifts.clampScalar(1, S32, S32);
1079     Shifts.clampScalar(0, S32, S64);
1080     Shifts.widenScalarToNextPow2(0, 32);
1081   }
1082   Shifts.scalarize(0);
1083 
1084   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1085     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1086     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1087     unsigned IdxTypeIdx = 2;
1088 
1089     getActionDefinitionsBuilder(Op)
1090       .customIf([=](const LegalityQuery &Query) {
1091           const LLT EltTy = Query.Types[EltTypeIdx];
1092           const LLT VecTy = Query.Types[VecTypeIdx];
1093           const LLT IdxTy = Query.Types[IdxTypeIdx];
1094           return (EltTy.getSizeInBits() == 16 ||
1095                   EltTy.getSizeInBits() % 32 == 0) &&
1096                  VecTy.getSizeInBits() % 32 == 0 &&
1097                  VecTy.getSizeInBits() <= 1024 &&
1098                  IdxTy.getSizeInBits() == 32;
1099         })
1100       .clampScalar(EltTypeIdx, S32, S64)
1101       .clampScalar(VecTypeIdx, S32, S64)
1102       .clampScalar(IdxTypeIdx, S32, S32);
1103   }
1104 
1105   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1106     .unsupportedIf([=](const LegalityQuery &Query) {
1107         const LLT &EltTy = Query.Types[1].getElementType();
1108         return Query.Types[0] != EltTy;
1109       });
1110 
1111   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1112     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1113     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1114 
1115     // FIXME: Doesn't handle extract of illegal sizes.
1116     getActionDefinitionsBuilder(Op)
1117       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1118       // FIXME: Multiples of 16 should not be legal.
1119       .legalIf([=](const LegalityQuery &Query) {
1120           const LLT BigTy = Query.Types[BigTyIdx];
1121           const LLT LitTy = Query.Types[LitTyIdx];
1122           return (BigTy.getSizeInBits() % 32 == 0) &&
1123                  (LitTy.getSizeInBits() % 16 == 0);
1124         })
1125       .widenScalarIf(
1126         [=](const LegalityQuery &Query) {
1127           const LLT BigTy = Query.Types[BigTyIdx];
1128           return (BigTy.getScalarSizeInBits() < 16);
1129         },
1130         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1131       .widenScalarIf(
1132         [=](const LegalityQuery &Query) {
1133           const LLT LitTy = Query.Types[LitTyIdx];
1134           return (LitTy.getScalarSizeInBits() < 16);
1135         },
1136         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1137       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1138       .widenScalarToNextPow2(BigTyIdx, 32);
1139 
1140   }
1141 
1142   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1143     .legalForCartesianProduct(AllS32Vectors, {S32})
1144     .legalForCartesianProduct(AllS64Vectors, {S64})
1145     .clampNumElements(0, V16S32, V32S32)
1146     .clampNumElements(0, V2S64, V16S64)
1147     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1148 
1149   if (ST.hasScalarPackInsts()) {
1150     BuildVector
1151       // FIXME: Should probably widen s1 vectors straight to s32
1152       .minScalarOrElt(0, S16)
1153       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1154       .minScalar(1, S32);
1155 
1156     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1157       .legalFor({V2S16, S32})
1158       .lower();
1159     BuildVector.minScalarOrElt(0, S32);
1160   } else {
1161     BuildVector.customFor({V2S16, S16});
1162     BuildVector.minScalarOrElt(0, S32);
1163 
1164     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1165       .customFor({V2S16, S32})
1166       .lower();
1167   }
1168 
1169   BuildVector.legalIf(isRegisterType(0));
1170 
1171   // FIXME: Clamp maximum size
1172   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1173     .legalIf(isRegisterType(0));
1174 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1176   // pre-legalize.
1177   if (ST.hasVOP3PInsts()) {
1178     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1179       .customFor({V2S16, V2S16})
1180       .lower();
1181   } else
1182     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1183 
1184   // Merge/Unmerge
1185   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1186     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1187     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1188 
1189     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1190       const LLT &Ty = Query.Types[TypeIdx];
1191       if (Ty.isVector()) {
1192         const LLT &EltTy = Ty.getElementType();
1193         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1194           return true;
1195         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1196           return true;
1197       }
1198       return false;
1199     };
1200 
1201     auto &Builder = getActionDefinitionsBuilder(Op)
1202       // Try to widen to s16 first for small types.
1203       // TODO: Only do this on targets with legal s16 shifts
1204       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1205 
1206       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1207       .lowerFor({{S16, V2S16}})
1208       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1209       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1210                            elementTypeIs(1, S16)),
1211                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1215       .clampScalar(LitTyIdx, S32, S256)
1216       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1217       // Break up vectors with weird elements into scalars
1218       .fewerElementsIf(
1219         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1220         scalarize(0))
1221       .fewerElementsIf(
1222         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1223         scalarize(1))
1224       .clampScalar(BigTyIdx, S32, S1024);
1225 
1226     if (Op == G_MERGE_VALUES) {
1227       Builder.widenScalarIf(
1228         // TODO: Use 16-bit shifts if legal for 8-bit values?
1229         [=](const LegalityQuery &Query) {
1230           const LLT Ty = Query.Types[LitTyIdx];
1231           return Ty.getSizeInBits() < 32;
1232         },
1233         changeTo(LitTyIdx, S32));
1234     }
1235 
1236     Builder.widenScalarIf(
1237       [=](const LegalityQuery &Query) {
1238         const LLT Ty = Query.Types[BigTyIdx];
1239         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1240           Ty.getSizeInBits() % 16 != 0;
1241       },
1242       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1245         const LLT &Ty = Query.Types[BigTyIdx];
1246         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1247         if (NewSizeInBits >= 256) {
1248           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1249           if (RoundedTo < NewSizeInBits)
1250             NewSizeInBits = RoundedTo;
1251         }
1252         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1253       })
1254       .legalIf([=](const LegalityQuery &Query) {
1255           const LLT &BigTy = Query.Types[BigTyIdx];
1256           const LLT &LitTy = Query.Types[LitTyIdx];
1257 
1258           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1259             return false;
1260           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1261             return false;
1262 
1263           return BigTy.getSizeInBits() % 16 == 0 &&
1264                  LitTy.getSizeInBits() % 16 == 0 &&
1265                  BigTy.getSizeInBits() <= 1024;
1266         })
1267       // Any vectors left are the wrong size. Scalarize them.
1268       .scalarize(0)
1269       .scalarize(1);
1270   }
1271 
1272   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1273   // RegBankSelect.
1274   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1275     .legalFor({{S32}, {S64}});
1276 
1277   if (ST.hasVOP3PInsts()) {
1278     SextInReg.lowerFor({{V2S16}})
1279       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1280       // get more vector shift opportunities, since we'll get those when
1281       // expanded.
1282       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1283   } else if (ST.has16BitInsts()) {
1284     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1285   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
1288     SextInReg.lowerFor({{S32}, {S64}});
1289   }
1290 
1291   SextInReg
1292     .scalarize(0)
1293     .clampScalar(0, S32, S64)
1294     .lower();
1295 
1296   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1297     .legalFor({S64});
1298 
1299   getActionDefinitionsBuilder({
1300       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1301       G_FCOPYSIGN,
1302 
1303       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1304       G_READ_REGISTER,
1305       G_WRITE_REGISTER,
1306 
1307       G_SADDO, G_SSUBO,
1308 
1309        // TODO: Implement
1310       G_FMINIMUM, G_FMAXIMUM
1311     }).lower();
1312 
1313   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1314         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1315         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1316     .unsupported();
1317 
1318   computeTables();
1319   verify(*ST.getInstrInfo());
1320 }
1321 
1322 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1323                                          MachineRegisterInfo &MRI,
1324                                          MachineIRBuilder &B,
1325                                          GISelChangeObserver &Observer) const {
1326   switch (MI.getOpcode()) {
1327   case TargetOpcode::G_ADDRSPACE_CAST:
1328     return legalizeAddrSpaceCast(MI, MRI, B);
1329   case TargetOpcode::G_FRINT:
1330     return legalizeFrint(MI, MRI, B);
1331   case TargetOpcode::G_FCEIL:
1332     return legalizeFceil(MI, MRI, B);
1333   case TargetOpcode::G_INTRINSIC_TRUNC:
1334     return legalizeIntrinsicTrunc(MI, MRI, B);
1335   case TargetOpcode::G_SITOFP:
1336     return legalizeITOFP(MI, MRI, B, true);
1337   case TargetOpcode::G_UITOFP:
1338     return legalizeITOFP(MI, MRI, B, false);
1339   case TargetOpcode::G_FPTOSI:
1340     return legalizeFPTOI(MI, MRI, B, true);
1341   case TargetOpcode::G_FPTOUI:
1342     return legalizeFPTOI(MI, MRI, B, false);
1343   case TargetOpcode::G_FMINNUM:
1344   case TargetOpcode::G_FMAXNUM:
1345   case TargetOpcode::G_FMINNUM_IEEE:
1346   case TargetOpcode::G_FMAXNUM_IEEE:
1347     return legalizeMinNumMaxNum(MI, MRI, B);
1348   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1349     return legalizeExtractVectorElt(MI, MRI, B);
1350   case TargetOpcode::G_INSERT_VECTOR_ELT:
1351     return legalizeInsertVectorElt(MI, MRI, B);
1352   case TargetOpcode::G_SHUFFLE_VECTOR:
1353     return legalizeShuffleVector(MI, MRI, B);
1354   case TargetOpcode::G_FSIN:
1355   case TargetOpcode::G_FCOS:
1356     return legalizeSinCos(MI, MRI, B);
1357   case TargetOpcode::G_GLOBAL_VALUE:
1358     return legalizeGlobalValue(MI, MRI, B);
1359   case TargetOpcode::G_LOAD:
1360     return legalizeLoad(MI, MRI, B, Observer);
1361   case TargetOpcode::G_FMAD:
1362     return legalizeFMad(MI, MRI, B);
1363   case TargetOpcode::G_FDIV:
1364     return legalizeFDIV(MI, MRI, B);
1365   case TargetOpcode::G_UDIV:
1366   case TargetOpcode::G_UREM:
1367     return legalizeUDIV_UREM(MI, MRI, B);
1368   case TargetOpcode::G_SDIV:
1369   case TargetOpcode::G_SREM:
1370     return legalizeSDIV_SREM(MI, MRI, B);
1371   case TargetOpcode::G_ATOMIC_CMPXCHG:
1372     return legalizeAtomicCmpXChg(MI, MRI, B);
1373   case TargetOpcode::G_FLOG:
1374     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1375   case TargetOpcode::G_FLOG10:
1376     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1377   case TargetOpcode::G_FEXP:
1378     return legalizeFExp(MI, B);
1379   case TargetOpcode::G_FPOW:
1380     return legalizeFPow(MI, B);
1381   case TargetOpcode::G_FFLOOR:
1382     return legalizeFFloor(MI, MRI, B);
1383   case TargetOpcode::G_BUILD_VECTOR:
1384     return legalizeBuildVector(MI, MRI, B);
1385   default:
1386     return false;
1387   }
1388 
1389   llvm_unreachable("expected switch to return");
1390 }
1391 
1392 Register AMDGPULegalizerInfo::getSegmentAperture(
1393   unsigned AS,
1394   MachineRegisterInfo &MRI,
1395   MachineIRBuilder &B) const {
1396   MachineFunction &MF = B.getMF();
1397   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1398   const LLT S32 = LLT::scalar(32);
1399 
1400   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1401 
1402   if (ST.hasApertureRegs()) {
1403     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1404     // getreg.
1405     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1406         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1407         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1408     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1409         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1410         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
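    // Pack the hwreg(id, offset, width - 1) fields into the immediate operand
    // expected by S_GETREG_B32.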
1411     unsigned Encoding =
1412         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1413         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1414         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1415 
1416     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1417 
1418     B.buildInstr(AMDGPU::S_GETREG_B32)
1419       .addDef(GetReg)
1420       .addImm(Encoding);
1421     MRI.setType(GetReg, S32);
1422 
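    // getreg returns the aperture base in a (WidthM1 + 1)-bit field; shift it
    // into position to form the high 32 bits of the 64-bit aperture base
    // address, which is what this function returns.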
1423     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1424     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1425   }
1426 
1427   Register QueuePtr = MRI.createGenericVirtualRegister(
1428     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1429 
1430   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1432     return Register();
1433 
1434   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1435   // private_segment_aperture_base_hi.
1436   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1437 
1438   // TODO: can we be smarter about machine pointer info?
1439   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1440   MachineMemOperand *MMO = MF.getMachineMemOperand(
1441     PtrInfo,
1442     MachineMemOperand::MOLoad |
1443     MachineMemOperand::MODereferenceable |
1444     MachineMemOperand::MOInvariant,
1445     4,
1446     MinAlign(64, StructOffset));
1447 
1448   Register LoadAddr;
1449 
1450   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1451   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1452 }
1453 
1454 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1455   MachineInstr &MI, MachineRegisterInfo &MRI,
1456   MachineIRBuilder &B) const {
1457   MachineFunction &MF = B.getMF();
1458 
1459   B.setInstr(MI);
1460 
1461   const LLT S32 = LLT::scalar(32);
1462   Register Dst = MI.getOperand(0).getReg();
1463   Register Src = MI.getOperand(1).getReg();
1464 
1465   LLT DstTy = MRI.getType(Dst);
1466   LLT SrcTy = MRI.getType(Src);
1467   unsigned DestAS = DstTy.getAddressSpace();
1468   unsigned SrcAS = SrcTy.getAddressSpace();
1469 
1470   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1471   // vector element.
1472   assert(!DstTy.isVector());
1473 
1474   const AMDGPUTargetMachine &TM
1475     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1476 
1477   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1478   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1479     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1480     return true;
1481   }
1482 
1483   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1484     // Truncate.
1485     B.buildExtract(Dst, Src, 0);
1486     MI.eraseFromParent();
1487     return true;
1488   }
1489 
1490   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1491     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1492     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1493 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1497     auto HighAddr = B.buildConstant(
1498       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1499     B.buildMerge(Dst, {Src, HighAddr});
1500     MI.eraseFromParent();
1501     return true;
1502   }
1503 
1504   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1505     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1506            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1507     unsigned NullVal = TM.getNullPointerValue(DestAS);
1508 
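    // Lowering a flat -> LDS/private cast: the segment pointer is just the low
    // 32 bits of the flat pointer, except that a null flat pointer must map to
    // the segment's null value, hence the compare-and-select below.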
1509     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1510     auto FlatNull = B.buildConstant(SrcTy, 0);
1511 
1512     // Extract low 32-bits of the pointer.
1513     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1514 
1515     auto CmpRes =
1516         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1517     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1518 
1519     MI.eraseFromParent();
1520     return true;
1521   }
1522 
1523   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1524     return false;
1525 
1526   if (!ST.hasFlatAddressSpace())
1527     return false;
1528 
1529   auto SegmentNull =
1530       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1531   auto FlatNull =
1532       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1533 
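  // Lowering an LDS/private -> flat cast: build the 64-bit flat pointer from
  // the 32-bit segment offset (low half) and the aperture base (high half),
  // mapping the segment's null value to the flat null pointer.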
1534   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1535   if (!ApertureReg.isValid())
1536     return false;
1537 
1538   auto CmpRes =
1539       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1540 
1541   // Coerce the type of the low half of the result so we can use merge_values.
1542   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1543 
1544   // TODO: Should we allow mismatched types but matching sizes in merges to
1545   // avoid the ptrtoint?
1546   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1547   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1548 
1549   MI.eraseFromParent();
1550   return true;
1551 }
1552 
1553 bool AMDGPULegalizerInfo::legalizeFrint(
1554   MachineInstr &MI, MachineRegisterInfo &MRI,
1555   MachineIRBuilder &B) const {
1556   B.setInstr(MI);
1557 
1558   Register Src = MI.getOperand(1).getReg();
1559   LLT Ty = MRI.getType(Src);
1560   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1561 
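  // Use the round-to-nearest trick: adding and then subtracting 2^52 (with the
  // sign of the input) forces the fraction bits out of a double. C1 is 2^52
  // and C2 is the largest double strictly less than 2^52; inputs with
  // |x| > C2 are already integral and are returned unchanged.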
1562   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1563   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1564 
1565   auto C1 = B.buildFConstant(Ty, C1Val);
1566   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1567 
1568   // TODO: Should this propagate fast-math-flags?
1569   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1570   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1571 
1572   auto C2 = B.buildFConstant(Ty, C2Val);
1573   auto Fabs = B.buildFAbs(Ty, Src);
1574 
1575   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1576   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1577   return true;
1578 }
1579 
1580 bool AMDGPULegalizerInfo::legalizeFceil(
1581   MachineInstr &MI, MachineRegisterInfo &MRI,
1582   MachineIRBuilder &B) const {
1583   B.setInstr(MI);
1584 
1585   const LLT S1 = LLT::scalar(1);
1586   const LLT S64 = LLT::scalar(64);
1587 
1588   Register Src = MI.getOperand(1).getReg();
1589   assert(MRI.getType(Src) == S64);
1590 
1591   // result = trunc(src)
1592   // if (src > 0.0 && src != result)
1593   //   result += 1.0
1594 
1595   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1596 
1597   const auto Zero = B.buildFConstant(S64, 0.0);
1598   const auto One = B.buildFConstant(S64, 1.0);
1599   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1600   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1601   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1602   auto Add = B.buildSelect(S64, And, One, Zero);
1603 
1604   // TODO: Should this propagate fast-math-flags?
1605   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1606   return true;
1607 }
1608 
1609 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1610                                               MachineIRBuilder &B) {
1611   const unsigned FractBits = 52;
1612   const unsigned ExpBits = 11;
1613   LLT S32 = LLT::scalar(32);
1614 
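  // The exponent field of an IEEE double occupies bits [62:52]; within the
  // high 32-bit word that is an 11-bit field starting at bit 52 - 32 = 20.
  // Extract it with ubfe and remove the exponent bias (1023).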
1615   auto Const0 = B.buildConstant(S32, FractBits - 32);
1616   auto Const1 = B.buildConstant(S32, ExpBits);
1617 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1621 
1622   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1623 }
1624 
1625 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1626   MachineInstr &MI, MachineRegisterInfo &MRI,
1627   MachineIRBuilder &B) const {
1628   B.setInstr(MI);
1629 
1630   const LLT S1 = LLT::scalar(1);
1631   const LLT S32 = LLT::scalar(32);
1632   const LLT S64 = LLT::scalar(64);
1633 
1634   Register Src = MI.getOperand(1).getReg();
1635   assert(MRI.getType(Src) == S64);
1636 
1637   // TODO: Should this use extract since the low half is unused?
1638   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1639   Register Hi = Unmerge.getReg(1);
1640 
1641   // Extract the upper half, since this is where we will find the sign and
1642   // exponent.
1643   auto Exp = extractF64Exponent(Hi, B);
1644 
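  // trunc(x) is implemented by masking off the fraction bits that lie below
  // the binary point: if the (unbiased) exponent is negative the result is
  // just the sign (+/-0), and if it is greater than 51 there is no fraction
  // left and x is returned unchanged.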
1645   const unsigned FractBits = 52;
1646 
1647   // Extract the sign bit.
1648   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1649   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1650 
1651   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1652 
1653   const auto Zero32 = B.buildConstant(S32, 0);
1654 
1655   // Extend back to 64-bits.
1656   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1657 
1658   auto Shr = B.buildAShr(S64, FractMask, Exp);
1659   auto Not = B.buildNot(S64, Shr);
1660   auto Tmp0 = B.buildAnd(S64, Src, Not);
1661   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1662 
1663   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1664   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1665 
1666   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1667   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1668   return true;
1669 }
1670 
1671 bool AMDGPULegalizerInfo::legalizeITOFP(
1672   MachineInstr &MI, MachineRegisterInfo &MRI,
1673   MachineIRBuilder &B, bool Signed) const {
1674   B.setInstr(MI);
1675 
1676   Register Dst = MI.getOperand(0).getReg();
1677   Register Src = MI.getOperand(1).getReg();
1678 
1679   const LLT S64 = LLT::scalar(64);
1680   const LLT S32 = LLT::scalar(32);
1681 
1682   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1683 
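  // Convert the i64 in two 32-bit halves: the high half is converted (signed
  // or unsigned) and scaled by 2^32 with ldexp, then the unsigned low half is
  // added in.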
1684   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1685 
1686   auto CvtHi = Signed ?
1687     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1688     B.buildUITOFP(S64, Unmerge.getReg(1));
1689 
1690   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1691 
1692   auto ThirtyTwo = B.buildConstant(S32, 32);
1693   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1694     .addUse(CvtHi.getReg(0))
1695     .addUse(ThirtyTwo.getReg(0));
1696 
1697   // TODO: Should this propagate fast-math-flags?
1698   B.buildFAdd(Dst, LdExp, CvtLo);
1699   MI.eraseFromParent();
1700   return true;
1701 }
1702 
1703 // TODO: Copied from DAG implementation. Verify logic and document how this
1704 // actually works.
1705 bool AMDGPULegalizerInfo::legalizeFPTOI(
1706   MachineInstr &MI, MachineRegisterInfo &MRI,
1707   MachineIRBuilder &B, bool Signed) const {
1708   B.setInstr(MI);
1709 
1710   Register Dst = MI.getOperand(0).getReg();
1711   Register Src = MI.getOperand(1).getReg();
1712 
1713   const LLT S64 = LLT::scalar(64);
1714   const LLT S32 = LLT::scalar(32);
1715 
1716   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1717 
1718   unsigned Flags = MI.getFlags();
1719 
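  // Split the f64 -> i64 conversion into two 32-bit halves:
  //   Hi = (s|u)int(floor(trunc(x) * 2^-32))
  //   Lo = uint(trunc(x) - Hi * 2^32), computed with an fma
  // K0 is 2^-32 and K1 is -2^32 as f64 bit patterns; the two halves are then
  // merged into the 64-bit result.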
1720   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1721   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1722   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1723 
1724   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1725   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1726   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1727 
1728   auto Hi = Signed ?
1729     B.buildFPTOSI(S32, FloorMul) :
1730     B.buildFPTOUI(S32, FloorMul);
1731   auto Lo = B.buildFPTOUI(S32, Fma);
1732 
1733   B.buildMerge(Dst, { Lo, Hi });
1734   MI.eraseFromParent();
1735 
1736   return true;
1737 }
1738 
1739 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1740   MachineInstr &MI, MachineRegisterInfo &MRI,
1741   MachineIRBuilder &B) const {
1742   MachineFunction &MF = B.getMF();
1743   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1744 
1745   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1746                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1747 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1750   if (!MFI->getMode().IEEE)
1751     return !IsIEEEOp;
1752 
1753   if (IsIEEEOp)
1754     return true;
1755 
1756   MachineIRBuilder HelperBuilder(MI);
1757   GISelObserverWrapper DummyObserver;
1758   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1759   HelperBuilder.setInstr(MI);
1760   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1761 }
1762 
1763 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1764   MachineInstr &MI, MachineRegisterInfo &MRI,
1765   MachineIRBuilder &B) const {
1766   // TODO: Should move some of this into LegalizerHelper.
1767 
1768   // TODO: Promote dynamic indexing of s16 to s32
1769 
1770   // FIXME: Artifact combiner probably should have replaced the truncated
1771   // constant before this, so we shouldn't need
1772   // getConstantVRegValWithLookThrough.
1773   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1774     MI.getOperand(2).getReg(), MRI);
1775   if (!IdxVal) // Dynamic case will be selected to register indexing.
1776     return true;
1777 
1778   Register Dst = MI.getOperand(0).getReg();
1779   Register Vec = MI.getOperand(1).getReg();
1780 
1781   LLT VecTy = MRI.getType(Vec);
1782   LLT EltTy = VecTy.getElementType();
1783   assert(EltTy == MRI.getType(Dst));
1784 
1785   B.setInstr(MI);
1786 
1787   if (IdxVal->Value < VecTy.getNumElements())
1788     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1789   else
1790     B.buildUndef(Dst);
1791 
1792   MI.eraseFromParent();
1793   return true;
1794 }
1795 
1796 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1797   MachineInstr &MI, MachineRegisterInfo &MRI,
1798   MachineIRBuilder &B) const {
1799   // TODO: Should move some of this into LegalizerHelper.
1800 
1801   // TODO: Promote dynamic indexing of s16 to s32
1802 
1803   // FIXME: Artifact combiner probably should have replaced the truncated
1804   // constant before this, so we shouldn't need
1805   // getConstantVRegValWithLookThrough.
1806   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1807     MI.getOperand(3).getReg(), MRI);
1808   if (!IdxVal) // Dynamic case will be selected to register indexing.
1809     return true;
1810 
1811   Register Dst = MI.getOperand(0).getReg();
1812   Register Vec = MI.getOperand(1).getReg();
1813   Register Ins = MI.getOperand(2).getReg();
1814 
1815   LLT VecTy = MRI.getType(Vec);
1816   LLT EltTy = VecTy.getElementType();
1817   assert(EltTy == MRI.getType(Ins));
1818 
1819   B.setInstr(MI);
1820 
1821   if (IdxVal->Value < VecTy.getNumElements())
1822     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1823   else
1824     B.buildUndef(Dst);
1825 
1826   MI.eraseFromParent();
1827   return true;
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   const LLT V2S16 = LLT::vector(2, 16);
1834 
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Src0 = MI.getOperand(1).getReg();
1837   LLT DstTy = MRI.getType(Dst);
1838   LLT SrcTy = MRI.getType(Src0);
1839 
1840   if (SrcTy == V2S16 && DstTy == V2S16 &&
1841       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1842     return true;
1843 
1844   MachineIRBuilder HelperBuilder(MI);
1845   GISelObserverWrapper DummyObserver;
1846   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1847   HelperBuilder.setInstr(MI);
1848   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1849 }
1850 
1851 bool AMDGPULegalizerInfo::legalizeSinCos(
1852   MachineInstr &MI, MachineRegisterInfo &MRI,
1853   MachineIRBuilder &B) const {
1854   B.setInstr(MI);
1855 
1856   Register DstReg = MI.getOperand(0).getReg();
1857   Register SrcReg = MI.getOperand(1).getReg();
1858   LLT Ty = MRI.getType(DstReg);
1859   unsigned Flags = MI.getFlags();
1860 
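  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi); on
  // subtargets with a reduced trig input range, the scaled value is also
  // range-reduced with fract before feeding the intrinsic.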
1861   Register TrigVal;
1862   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1863   if (ST.hasTrigReducedRange()) {
1864     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1865     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1866       .addUse(MulVal.getReg(0))
1867       .setMIFlags(Flags).getReg(0);
1868   } else
1869     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1870 
1871   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1872     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1873   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1874     .addUse(TrigVal)
1875     .setMIFlags(Flags);
1876   MI.eraseFromParent();
1877   return true;
1878 }
1879 
1880 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1881   Register DstReg, LLT PtrTy,
1882   MachineIRBuilder &B, const GlobalValue *GV,
1883   unsigned Offset, unsigned GAFlags) const {
1884   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1885   // to the following code sequence:
1886   //
1887   // For constant address space:
1888   //   s_getpc_b64 s[0:1]
1889   //   s_add_u32 s0, s0, $symbol
1890   //   s_addc_u32 s1, s1, 0
1891   //
1892   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1893   //   a fixup or relocation is emitted to replace $symbol with a literal
1894   //   constant, which is a pc-relative offset from the encoding of the $symbol
1895   //   operand to the global variable.
1896   //
1897   // For global address space:
1898   //   s_getpc_b64 s[0:1]
1899   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1900   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1901   //
1902   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1903   //   fixups or relocations are emitted to replace $symbol@*@lo and
1904   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1905   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1906   //   operand to the global variable.
1907   //
1908   // What we want here is an offset from the value returned by s_getpc
1909   // (which is the address of the s_add_u32 instruction) to the global
1910   // variable, but since the encoding of $symbol starts 4 bytes after the start
1911   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1912   // small. This requires us to add 4 to the global variable offset in order to
1913   // compute the correct address.
1914 
1915   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1916 
1917   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1918     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1919 
1920   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1921     .addDef(PCReg);
1922 
1923   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1924   if (GAFlags == SIInstrInfo::MO_NONE)
1925     MIB.addImm(0);
1926   else
1927     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1928 
1929   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1930 
1931   if (PtrTy.getSizeInBits() == 32)
1932     B.buildExtract(DstReg, PCReg, 0);
1933   return true;
}
1935 
1936 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B) const {
1939   Register DstReg = MI.getOperand(0).getReg();
1940   LLT Ty = MRI.getType(DstReg);
1941   unsigned AS = Ty.getAddressSpace();
1942 
1943   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1944   MachineFunction &MF = B.getMF();
1945   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1946   B.setInstr(MI);
1947 
1948   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1949     if (!MFI->isEntryFunction()) {
1950       const Function &Fn = MF.getFunction();
1951       DiagnosticInfoUnsupported BadLDSDecl(
1952         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1953       Fn.getContext().diagnose(BadLDSDecl);
1954     }
1955 
1956     // TODO: We could emit code to handle the initialization somewhere.
1957     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1958       const SITargetLowering *TLI = ST.getTargetLowering();
1959       if (!TLI->shouldUseLDSConstAddress(GV)) {
1960         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1961         return true; // Leave in place;
1962       }
1963 
1964       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1965       MI.eraseFromParent();
1966       return true;
1967     }
1968 
1969     const Function &Fn = MF.getFunction();
1970     DiagnosticInfoUnsupported BadInit(
1971       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1972     Fn.getContext().diagnose(BadInit);
1973     return true;
1974   }
1975 
1976   const SITargetLowering *TLI = ST.getTargetLowering();
1977 
1978   if (TLI->shouldEmitFixup(GV)) {
1979     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1980     MI.eraseFromParent();
1981     return true;
1982   }
1983 
1984   if (TLI->shouldEmitPCReloc(GV)) {
1985     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1986     MI.eraseFromParent();
1987     return true;
1988   }
1989 
1990   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1991   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1992 
1993   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1994     MachinePointerInfo::getGOT(MF),
1995     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1996     MachineMemOperand::MOInvariant,
1997     8 /*Size*/, 8 /*Align*/);
1998 
1999   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2000 
2001   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2003     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2004     B.buildExtract(DstReg, Load, 0);
2005   } else
2006     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2007 
2008   MI.eraseFromParent();
2009   return true;
2010 }
2011 
2012 bool AMDGPULegalizerInfo::legalizeLoad(
2013   MachineInstr &MI, MachineRegisterInfo &MRI,
2014   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2015   B.setInstr(MI);
2016   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2017   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2018   Observer.changingInstr(MI);
2019   MI.getOperand(1).setReg(Cast.getReg(0));
2020   Observer.changedInstr(MI);
2021   return true;
2022 }
2023 
2024 bool AMDGPULegalizerInfo::legalizeFMad(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B) const {
2027   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2028   assert(Ty.isScalar());
2029 
2030   MachineFunction &MF = B.getMF();
2031   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032 
2033   // TODO: Always legal with future ftz flag.
2034   // FIXME: Do we need just output?
2035   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2036     return true;
2037   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2038     return true;
2039 
2040   MachineIRBuilder HelperBuilder(MI);
2041   GISelObserverWrapper DummyObserver;
2042   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2043   HelperBuilder.setMBB(*MI.getParent());
2044   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2045 }
2046 
2047 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2048   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2049   Register DstReg = MI.getOperand(0).getReg();
2050   Register PtrReg = MI.getOperand(1).getReg();
2051   Register CmpVal = MI.getOperand(2).getReg();
2052   Register NewVal = MI.getOperand(3).getReg();
2053 
2054   assert(SITargetLowering::isFlatGlobalAddrSpace(
2055            MRI.getType(PtrReg).getAddressSpace()) &&
2056          "this should not have been custom lowered");
2057 
2058   LLT ValTy = MRI.getType(CmpVal);
2059   LLT VecTy = LLT::vector(2, ValTy);
2060 
2061   B.setInstr(MI);
2062   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2063 
2064   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2065     .addDef(DstReg)
2066     .addUse(PtrReg)
2067     .addUse(PackedVal)
2068     .setMemRefs(MI.memoperands());
2069 
2070   MI.eraseFromParent();
2071   return true;
2072 }
2073 
2074 bool AMDGPULegalizerInfo::legalizeFlog(
2075   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2076   Register Dst = MI.getOperand(0).getReg();
2077   Register Src = MI.getOperand(1).getReg();
2078   LLT Ty = B.getMRI()->getType(Dst);
2079   unsigned Flags = MI.getFlags();
2080   B.setInstr(MI);
2081 
2082   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2083   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2084 
2085   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2086   MI.eraseFromParent();
2087   return true;
2088 }
2089 
2090 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2091                                        MachineIRBuilder &B) const {
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Src = MI.getOperand(1).getReg();
2094   unsigned Flags = MI.getFlags();
2095   LLT Ty = B.getMRI()->getType(Dst);
2096   B.setInstr(MI);
2097 
2098   auto K = B.buildFConstant(Ty, numbers::log2e);
2099   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2100   B.buildFExp2(Dst, Mul, Flags);
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2106                                        MachineIRBuilder &B) const {
2107   Register Dst = MI.getOperand(0).getReg();
2108   Register Src0 = MI.getOperand(1).getReg();
2109   Register Src1 = MI.getOperand(2).getReg();
2110   unsigned Flags = MI.getFlags();
2111   LLT Ty = B.getMRI()->getType(Dst);
2112   B.setInstr(MI);
2113   const LLT S16 = LLT::scalar(16);
2114   const LLT S32 = LLT::scalar(32);
2115 
2116   if (Ty == S32) {
2117     auto Log = B.buildFLog2(S32, Src0, Flags);
2118     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2119       .addUse(Log.getReg(0))
2120       .addUse(Src1)
2121       .setMIFlags(Flags);
2122     B.buildFExp2(Dst, Mul, Flags);
2123   } else if (Ty == S16) {
2124     // There's no f16 fmul_legacy, so we need to convert for it.
2125     auto Log = B.buildFLog2(S16, Src0, Flags);
2126     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2127     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2128     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2129       .addUse(Ext0.getReg(0))
2130       .addUse(Ext1.getReg(0))
2131       .setMIFlags(Flags);
2132 
2133     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2134   } else
2135     return false;
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 // Find a source register, ignoring any possible source modifiers.
2142 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2143   Register ModSrc = OrigSrc;
2144   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2145     ModSrc = SrcFNeg->getOperand(1).getReg();
2146     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2147       ModSrc = SrcFAbs->getOperand(1).getReg();
2148   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2149     ModSrc = SrcFAbs->getOperand(1).getReg();
2150   return ModSrc;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2154                                          MachineRegisterInfo &MRI,
2155                                          MachineIRBuilder &B) const {
2156   B.setInstr(MI);
2157 
2158   const LLT S1 = LLT::scalar(1);
2159   const LLT S64 = LLT::scalar(64);
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register OrigSrc = MI.getOperand(1).getReg();
2162   unsigned Flags = MI.getFlags();
2163   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2164          "this should not have been custom lowered");
2165 
2166   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2167   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2168   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2169   // V_FRACT bug is:
2170   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2171   //
2172   // Convert floor(x) to (x - fract(x))
2173 
2174   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2175     .addUse(OrigSrc)
2176     .setMIFlags(Flags);
2177 
2178   // Give source modifier matching some assistance before obscuring a foldable
2179   // pattern.
2180 
2181   // TODO: We can avoid the neg on the fract? The input sign to fract
2182   // shouldn't matter?
2183   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2184 
2185   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2186 
2187   Register Min = MRI.createGenericVirtualRegister(S64);
2188 
2189   // We don't need to concern ourselves with the snan handling difference, so
2190   // use the one which will directly select.
2191   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2192   if (MFI->getMode().IEEE)
2193     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2194   else
2195     B.buildFMinNum(Min, Fract, Const, Flags);
2196 
2197   Register CorrectedFract = Min;
2198   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2200     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2201   }
2202 
2203   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2204   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 // Turn an illegal packed v2s16 build vector into bit operations.
2211 // TODO: This should probably be a bitcast action in LegalizerHelper.
2212 bool AMDGPULegalizerInfo::legalizeBuildVector(
2213   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2214   Register Dst = MI.getOperand(0).getReg();
2215   LLT DstTy = MRI.getType(Dst);
2216   const LLT S32 = LLT::scalar(32);
2217   const LLT V2S16 = LLT::vector(2, 16);
2218   (void)DstTy;
2219   (void)V2S16;
2220   assert(DstTy == V2S16);
2221 
2222   Register Src0 = MI.getOperand(1).getReg();
2223   Register Src1 = MI.getOperand(2).getReg();
2224   assert(MRI.getType(Src0) == LLT::scalar(16));
2225 
2226   B.setInstr(MI);
2227   auto Merge = B.buildMerge(S32, {Src0, Src1});
2228   B.buildBitcast(Dst, Merge);
2229 
2230   MI.eraseFromParent();
2231   return true;
2232 }
2233 
2234 // Return the use branch instruction, otherwise null if the usage is invalid.
2235 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2236                                        MachineRegisterInfo &MRI,
2237                                        MachineInstr *&Br) {
2238   Register CondDef = MI.getOperand(0).getReg();
2239   if (!MRI.hasOneNonDBGUse(CondDef))
2240     return nullptr;
2241 
2242   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2243   if (UseMI.getParent() != MI.getParent() ||
2244       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2245     return nullptr;
2246 
2247   // Make sure the cond br is followed by a G_BR
2248   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2249   if (Next != MI.getParent()->end()) {
2250     if (Next->getOpcode() != AMDGPU::G_BR)
2251       return nullptr;
2252     Br = &*Next;
2253   }
2254 
2255   return &UseMI;
2256 }
2257 
2258 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2259                                                MachineRegisterInfo &MRI,
2260                                                Register LiveIn,
2261                                                Register PhyReg) const {
2262   assert(PhyReg.isPhysical() && "Physical register expected");
2263 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2266   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2267   if (!MRI.getVRegDef(LiveIn)) {
2268     // FIXME: Should have scoped insert pt
2269     MachineBasicBlock &OrigInsBB = B.getMBB();
2270     auto OrigInsPt = B.getInsertPt();
2271 
2272     MachineBasicBlock &EntryMBB = B.getMF().front();
2273     EntryMBB.addLiveIn(PhyReg);
2274     B.setInsertPt(EntryMBB, EntryMBB.begin());
2275     B.buildCopy(LiveIn, PhyReg);
2276 
2277     B.setInsertPt(OrigInsBB, OrigInsPt);
2278   }
2279 
2280   return LiveIn;
2281 }
2282 
2283 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2284                                                 MachineRegisterInfo &MRI,
2285                                                 Register PhyReg, LLT Ty,
2286                                                 bool InsertLiveInCopy) const {
2287   assert(PhyReg.isPhysical() && "Physical register expected");
2288 
  // Get or create the virtual live-in register.
2290   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2291   if (!LiveIn) {
2292     LiveIn = MRI.createGenericVirtualRegister(Ty);
2293     MRI.addLiveIn(PhyReg, LiveIn);
2294   }
2295 
  // When the copy that is actually required goes from a virtual register to a
  // physical register (to be inserted later), there is no need to insert a
  // live-in copy from the physical register to a virtual register here.
2299   if (!InsertLiveInCopy)
2300     return LiveIn;
2301 
2302   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2303 }
2304 
2305 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2306     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2307   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2308   const ArgDescriptor *Arg;
2309   const TargetRegisterClass *RC;
2310   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2311   if (!Arg) {
2312     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2313     return nullptr;
2314   }
2315   return Arg;
2316 }
2317 
2318 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2319                                          const ArgDescriptor *Arg) const {
2320   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2321     return false; // TODO: Handle these
2322 
2323   Register SrcReg = Arg->getRegister();
2324   assert(SrcReg.isPhysical() && "Physical register expected");
2325   assert(DstReg.isVirtual() && "Virtual register expected");
2326 
2327   MachineRegisterInfo &MRI = *B.getMRI();
2328 
2329   LLT Ty = MRI.getType(DstReg);
2330   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2331 
2332   if (Arg->isMasked()) {
2333     // TODO: Should we try to emit this once in the entry block?
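    // Masked arguments are bitfields packed together in a single preloaded
    // register (for example the packed work-item IDs); extract the field with
    // a shift and mask.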
2334     const LLT S32 = LLT::scalar(32);
2335     const unsigned Mask = Arg->getMask();
2336     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2337 
2338     Register AndMaskSrc = LiveIn;
2339 
2340     if (Shift != 0) {
2341       auto ShiftAmt = B.buildConstant(S32, Shift);
2342       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2343     }
2344 
2345     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2346   } else {
2347     B.buildCopy(DstReg, LiveIn);
2348   }
2349 
2350   return true;
2351 }
2352 
2353 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2354     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2355     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2356   B.setInstr(MI);
2357 
2358   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2359   if (!Arg)
2360     return false;
2361 
2362   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2363     return false;
2364 
2365   MI.eraseFromParent();
2366   return true;
2367 }
2368 
2369 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2370                                        MachineRegisterInfo &MRI,
2371                                        MachineIRBuilder &B) const {
2372   B.setInstr(MI);
2373   Register Dst = MI.getOperand(0).getReg();
2374   LLT DstTy = MRI.getType(Dst);
2375   LLT S16 = LLT::scalar(16);
2376   LLT S32 = LLT::scalar(32);
2377   LLT S64 = LLT::scalar(64);
2378 
2379   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2380     return true;
2381 
2382   if (DstTy == S16)
2383     return legalizeFDIV16(MI, MRI, B);
2384   if (DstTy == S32)
2385     return legalizeFDIV32(MI, MRI, B);
2386   if (DstTy == S64)
2387     return legalizeFDIV64(MI, MRI, B);
2388 
2389   return false;
2390 }
2391 
2392 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2393   const LLT S32 = LLT::scalar(32);
2394 
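  // Approximate 2^32 / Src in integer form: convert to f32, take the
  // reciprocal, scale by 2^32 (0x4f800000 is 2^32 as an f32 bit pattern), and
  // convert back to an unsigned integer.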
2395   auto Cvt0 = B.buildUITOFP(S32, Src);
2396   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2397   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2398   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2399   return B.buildFPTOUI(S32, Mul).getReg(0);
2400 }
2401 
2402 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2403                                                   Register DstReg,
2404                                                   Register Num,
2405                                                   Register Den,
2406                                                   bool IsRem) const {
2407   const LLT S1 = LLT::scalar(1);
2408   const LLT S32 = LLT::scalar(32);
2409 
2410   // RCP =  URECIP(Den) = 2^32 / Den + e
2411   // e is rounding error.
2412   auto RCP = buildDivRCP(B, Den);
2413 
2414   // RCP_LO = mul(RCP, Den)
2415   auto RCP_LO = B.buildMul(S32, RCP, Den);
2416 
  // RCP_HI = mulhu(RCP, Den)
2418   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2419 
2420   // NEG_RCP_LO = -RCP_LO
2421   auto Zero = B.buildConstant(S32, 0);
2422   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2423 
2424   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2425   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2426   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2427 
2428   // Calculate the rounding error from the URECIP instruction
2429   // E = mulhu(ABS_RCP_LO, RCP)
2430   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2431 
2432   // RCP_A_E = RCP + E
2433   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2434 
2435   // RCP_S_E = RCP - E
2436   auto RCP_S_E = B.buildSub(S32, RCP, E);
2437 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2439   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2440 
  // Quotient = mulhu(Tmp0, Num)
2442   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2443 
2444   // Num_S_Remainder = Quotient * Den
2445   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2446 
2447   // Remainder = Num - Num_S_Remainder
2448   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2449 
2450   // Remainder_GE_Den = Remainder >= Den
2451   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2452 
2453   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2454   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2455                                        Num, Num_S_Remainder);
2456 
2457   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2458   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2459 
2460   // Calculate Division result:
2461 
2462   // Quotient_A_One = Quotient + 1
2463   auto One = B.buildConstant(S32, 1);
2464   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2465 
2466   // Quotient_S_One = Quotient - 1
2467   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2468 
2469   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2470   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2471 
2472   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2473   if (IsRem) {
2474     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2475 
2476     // Calculate Rem result:
2477     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2478 
2479     // Remainder_A_Den = Remainder + Den
2480     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2481 
2482     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2483     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2484 
2485     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2486     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2487   } else {
2488     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2489   }
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2493                                               MachineRegisterInfo &MRI,
2494                                               MachineIRBuilder &B) const {
2495   B.setInstr(MI);
2496   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2497   Register DstReg = MI.getOperand(0).getReg();
2498   Register Num = MI.getOperand(1).getReg();
2499   Register Den = MI.getOperand(2).getReg();
2500   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2501   MI.eraseFromParent();
2502   return true;
2503 }
2504 
2505 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2506                                             MachineRegisterInfo &MRI,
2507                                             MachineIRBuilder &B) const {
2508   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2509     return legalizeUDIV_UREM32(MI, MRI, B);
2510   return false;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2514                                               MachineRegisterInfo &MRI,
2515                                               MachineIRBuilder &B) const {
2516   B.setInstr(MI);
2517   const LLT S32 = LLT::scalar(32);
2518 
2519   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2520   Register DstReg = MI.getOperand(0).getReg();
2521   Register LHS = MI.getOperand(1).getReg();
2522   Register RHS = MI.getOperand(2).getReg();
2523 
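  // Compute |LHS| and |RHS| using the identity abs(x) = (x + sign) ^ sign,
  // where sign = x >> 31, do an unsigned divide/remainder, and then fix up the
  // sign of the result afterwards.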
2524   auto ThirtyOne = B.buildConstant(S32, 31);
2525   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2527 
2528   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2529   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2530 
2531   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2532   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2533 
2534   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2535   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2536 
2537   if (IsRem) {
2538     auto RSign = LHSign; // Remainder sign is the same as LHS
2539     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2540     B.buildSub(DstReg, UDivRem, RSign);
2541   } else {
2542     auto DSign = B.buildXor(S32, LHSign, RHSign);
2543     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2544     B.buildSub(DstReg, UDivRem, DSign);
2545   }
2546 
2547   MI.eraseFromParent();
2548   return true;
2549 }
2550 
2551 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2552                                             MachineRegisterInfo &MRI,
2553                                             MachineIRBuilder &B) const {
2554   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2555     return legalizeSDIV_SREM32(MI, MRI, B);
2556   return false;
2557 }
2558 
2559 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2560                                                  MachineRegisterInfo &MRI,
2561                                                  MachineIRBuilder &B) const {
2562   Register Res = MI.getOperand(0).getReg();
2563   Register LHS = MI.getOperand(1).getReg();
2564   Register RHS = MI.getOperand(2).getReg();
2565 
2566   uint16_t Flags = MI.getFlags();
2567 
2568   LLT ResTy = MRI.getType(Res);
2569   LLT S32 = LLT::scalar(32);
2570   LLT S64 = LLT::scalar(64);
2571 
2572   const MachineFunction &MF = B.getMF();
2573   bool Unsafe =
2574     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2575 
2576   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2577     return false;
2578 
2579   if (!Unsafe && ResTy == S32 &&
2580       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2581     return false;
2582 
2583   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2584     // 1 / x -> RCP(x)
2585     if (CLHS->isExactlyValue(1.0)) {
2586       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2587         .addUse(RHS)
2588         .setMIFlags(Flags);
2589 
2590       MI.eraseFromParent();
2591       return true;
2592     }
2593 
2594     // -1 / x -> RCP( FNEG(x) )
2595     if (CLHS->isExactlyValue(-1.0)) {
2596       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2597       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2598         .addUse(FNeg.getReg(0))
2599         .setMIFlags(Flags);
2600 
2601       MI.eraseFromParent();
2602       return true;
2603     }
2604   }
2605 
2606   // x / y -> x * (1.0 / y)
2607   if (Unsafe) {
2608     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2609       .addUse(RHS)
2610       .setMIFlags(Flags);
2611     B.buildFMul(Res, LHS, RCP, Flags);
2612 
2613     MI.eraseFromParent();
2614     return true;
2615   }
2616 
2617   return false;
2618 }
2619 
2620 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2621                                          MachineRegisterInfo &MRI,
2622                                          MachineIRBuilder &B) const {
2623   B.setInstr(MI);
2624   Register Res = MI.getOperand(0).getReg();
2625   Register LHS = MI.getOperand(1).getReg();
2626   Register RHS = MI.getOperand(2).getReg();
2627 
2628   uint16_t Flags = MI.getFlags();
2629 
2630   LLT S16 = LLT::scalar(16);
2631   LLT S32 = LLT::scalar(32);
2632 
2633   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2634   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2635 
2636   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2637     .addUse(RHSExt.getReg(0))
2638     .setMIFlags(Flags);
2639 
2640   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2641   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2642 
2643   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2644     .addUse(RDst.getReg(0))
2645     .addUse(RHS)
2646     .addUse(LHS)
2647     .setMIFlags(Flags);
2648 
2649   MI.eraseFromParent();
2650   return true;
2651 }
2652 
2653 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2654 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2655 static void toggleSPDenormMode(bool Enable,
2656                                MachineIRBuilder &B,
2657                                const GCNSubtarget &ST,
2658                                AMDGPU::SIModeRegisterDefaults Mode) {
2659   // Set SP denorm mode to this value.
2660   unsigned SPDenormMode =
2661     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2662 
2663   if (ST.hasDenormModeInst()) {
2664     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2665     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2666 
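    // The immediate packs the FP32 denorm controls in bits [1:0] and the
    // FP64/FP16 controls in bits [3:2].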
2667     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2668     B.buildInstr(AMDGPU::S_DENORM_MODE)
2669       .addImm(NewDenormModeValue);
2670 
2671   } else {
2672     // Select FP32 bit field in mode register.
2673     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2674                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2675                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2676 
2677     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2678       .addImm(SPDenormMode)
2679       .addImm(SPDenormModeBitField);
2680   }
2681 }
2682 
2683 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2684                                          MachineRegisterInfo &MRI,
2685                                          MachineIRBuilder &B) const {
2686   B.setInstr(MI);
2687   Register Res = MI.getOperand(0).getReg();
2688   Register LHS = MI.getOperand(1).getReg();
2689   Register RHS = MI.getOperand(2).getReg();
2690   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2691   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2692 
2693   uint16_t Flags = MI.getFlags();
2694 
2695   LLT S32 = LLT::scalar(32);
2696   LLT S1 = LLT::scalar(1);
2697 
2698   auto One = B.buildFConstant(S32, 1.0f);
2699 
2700   auto DenominatorScaled =
2701     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2702       .addUse(RHS)
2703       .addUse(LHS)
2704       .addImm(1)
2705       .setMIFlags(Flags);
2706   auto NumeratorScaled =
2707     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2708       .addUse(LHS)
2709       .addUse(RHS)
2710       .addImm(0)
2711       .setMIFlags(Flags);
2712 
2713   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2714     .addUse(DenominatorScaled.getReg(0))
2715     .setMIFlags(Flags);
2716   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2717 
2718   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2719   // aren't modeled as reading it.
2720   if (!Mode.allFP32Denormals())
2721     toggleSPDenormMode(true, B, ST, Mode);
2722 
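  // Refine the reciprocal estimate with Newton-Raphson style FMA steps, form
  // the scaled quotient, and refine it again before handing the pieces to
  // div_fmas / div_fixup below.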
2723   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2724   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2725   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2726   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2727   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2728   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2729 
2730   if (!Mode.allFP32Denormals())
2731     toggleSPDenormMode(false, B, ST, Mode);
2732 
2733   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2734     .addUse(Fma4.getReg(0))
2735     .addUse(Fma1.getReg(0))
2736     .addUse(Fma3.getReg(0))
2737     .addUse(NumeratorScaled.getReg(1))
2738     .setMIFlags(Flags);
2739 
2740   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2741     .addUse(Fmas.getReg(0))
2742     .addUse(RHS)
2743     .addUse(LHS)
2744     .setMIFlags(Flags);
2745 
2746   MI.eraseFromParent();
2747   return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2751                                          MachineRegisterInfo &MRI,
2752                                          MachineIRBuilder &B) const {
2753   B.setInstr(MI);
2754   Register Res = MI.getOperand(0).getReg();
2755   Register LHS = MI.getOperand(1).getReg();
2756   Register RHS = MI.getOperand(2).getReg();
2757 
2758   uint16_t Flags = MI.getFlags();
2759 
2760   LLT S64 = LLT::scalar(64);
2761   LLT S1 = LLT::scalar(1);
2762 
2763   auto One = B.buildFConstant(S64, 1.0);
2764 
2765   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2766     .addUse(LHS)
2767     .addUse(RHS)
2768     .addImm(1)
2769     .setMIFlags(Flags);
2770 
2771   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2772 
2773   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2774     .addUse(DivScale0.getReg(0))
2775     .setMIFlags(Flags);
2776 
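  // Same reciprocal refinement as the f32 path, but in f64, plus the
  // workaround below for subtargets where the div_scale condition output is
  // unusable.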
2777   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2778   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2779   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2780 
2781   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2782     .addUse(LHS)
2783     .addUse(RHS)
2784     .addImm(0)
2785     .setMIFlags(Flags);
2786 
2787   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2789   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2790 
2791   Register Scale;
2792   if (!ST.hasUsableDivScaleConditionOutput()) {
2793     // Workaround a hardware bug on SI where the condition output from div_scale
2794     // is not usable.
2795 
2796     LLT S32 = LLT::scalar(32);
2797 
2798     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2799     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2800     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2801     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2802 
2803     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2804                               Scale1Unmerge.getReg(1));
2805     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2806                               Scale0Unmerge.getReg(1));
2807     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2808   } else {
2809     Scale = DivScale1.getReg(1);
2810   }
2811 
2812   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2813     .addUse(Fma4.getReg(0))
2814     .addUse(Fma3.getReg(0))
2815     .addUse(Mul.getReg(0))
2816     .addUse(Scale)
2817     .setMIFlags(Flags);
2818 
2819   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2820     .addUse(Fmas.getReg(0))
2821     .addUse(RHS)
2822     .addUse(LHS)
2823     .setMIFlags(Flags);
2824 
2825   MI.eraseFromParent();
2826   return true;
2827 }
2828 
2829 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2830                                                  MachineRegisterInfo &MRI,
2831                                                  MachineIRBuilder &B) const {
2832   B.setInstr(MI);
2833   Register Res = MI.getOperand(0).getReg();
2834   Register LHS = MI.getOperand(2).getReg();
2835   Register RHS = MI.getOperand(3).getReg();
2836   uint16_t Flags = MI.getFlags();
2837 
2838   LLT S32 = LLT::scalar(32);
2839   LLT S1 = LLT::scalar(1);
2840 
2841   auto Abs = B.buildFAbs(S32, RHS, Flags);
2842   const APFloat C0Val(1.0f);
2843 
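  // If |RHS| > 2^96 (C0 = 0x6f800000), pre-scale the denominator by 2^-32
  // (C1 = 0x2f800000) before taking the reciprocal, and multiply the same
  // scale factor back into the result; otherwise the scale is 1.0 (C2).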
2844   auto C0 = B.buildConstant(S32, 0x6f800000);
2845   auto C1 = B.buildConstant(S32, 0x2f800000);
2846   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2847 
2848   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2849   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2850 
2851   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2852 
2853   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2854     .addUse(Mul0.getReg(0))
2855     .setMIFlags(Flags);
2856 
2857   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2858 
2859   B.buildFMul(Res, Sel, Mul1, Flags);
2860 
2861   MI.eraseFromParent();
2862   return true;
2863 }
2864 
2865 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2866                                                  MachineRegisterInfo &MRI,
2867                                                  MachineIRBuilder &B) const {
2868   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2869   if (!MFI->isEntryFunction()) {
2870     return legalizePreloadedArgIntrin(MI, MRI, B,
2871                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2872   }
2873 
2874   B.setInstr(MI);
2875 
2876   uint64_t Offset =
2877     ST.getTargetLowering()->getImplicitParameterOffset(
2878       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2879   Register DstReg = MI.getOperand(0).getReg();
2880   LLT DstTy = MRI.getType(DstReg);
2881   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2882 
2883   const ArgDescriptor *Arg;
2884   const TargetRegisterClass *RC;
2885   std::tie(Arg, RC)
2886     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2887   if (!Arg)
2888     return false;
2889 
2890   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2891   if (!loadInputValue(KernargPtrReg, B, Arg))
2892     return false;
2893 
2894   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2895   MI.eraseFromParent();
2896   return true;
2897 }
2898 
2899 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2900                                               MachineRegisterInfo &MRI,
2901                                               MachineIRBuilder &B,
2902                                               unsigned AddrSpace) const {
2903   B.setInstr(MI);
2904   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2905   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2906   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2907   MI.eraseFromParent();
2908   return true;
2909 }
2910 
2911 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2912 // offset (the offset that is included in bounds checking and swizzling, to be
2913 // split between the instruction's voffset and immoffset fields) and soffset
2914 // (the offset that is excluded from bounds checking and swizzling, to go in
2915 // the instruction's soffset field).  This function takes the first kind of
2916 // offset and figures out how to split it between voffset and immoffset.
2917 std::tuple<Register, unsigned, unsigned>
2918 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2919                                         Register OrigOffset) const {
2920   const unsigned MaxImm = 4095;
2921   Register BaseReg;
2922   unsigned TotalConstOffset;
2923   MachineInstr *OffsetDef;
2924   const LLT S32 = LLT::scalar(32);
2925 
2926   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2927     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2928 
2929   unsigned ImmOffset = TotalConstOffset;
2930 
  // If the immediate value is too big for the immoffset field, keep only the
  // low bits (the offset modulo 4096) in the immoffset field so that the value
  // that is copied/added for the voffset field is a multiple of 4096, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
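  // For example, a constant offset of 8200 splits into ImmOffset = 8 and an
  // Overflow of 8192 that is folded into the voffset value.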
2938   unsigned Overflow = ImmOffset & ~MaxImm;
2939   ImmOffset -= Overflow;
2940   if ((int32_t)Overflow < 0) {
2941     Overflow += ImmOffset;
2942     ImmOffset = 0;
2943   }
2944 
2945   if (Overflow != 0) {
2946     if (!BaseReg) {
2947       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2948     } else {
2949       auto OverflowVal = B.buildConstant(S32, Overflow);
2950       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2951     }
2952   }
2953 
2954   if (!BaseReg)
2955     BaseReg = B.buildConstant(S32, 0).getReg(0);
2956 
2957   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2958 }
2959 
2960 /// Handle register layout difference for f16 images for some subtargets.
2961 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2962                                              MachineRegisterInfo &MRI,
2963                                              Register Reg) const {
2964   if (!ST.hasUnpackedD16VMem())
2965     return Reg;
2966 
2967   const LLT S16 = LLT::scalar(16);
2968   const LLT S32 = LLT::scalar(32);
2969   LLT StoreVT = MRI.getType(Reg);
2970   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2971 
2972   auto Unmerge = B.buildUnmerge(S16, Reg);
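  // With unpacked D16, each 16-bit element occupies the low half of its own
  // 32-bit register, so any-extend every piece and rebuild a vector of s32.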
2973 
2974   SmallVector<Register, 4> WideRegs;
2975   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2976     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2977 
2978   int NumElts = StoreVT.getNumElements();
2979 
2980   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2981 }
2982 
2983 Register AMDGPULegalizerInfo::fixStoreSourceType(
2984   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2985   MachineRegisterInfo *MRI = B.getMRI();
2986   LLT Ty = MRI->getType(VData);
2987 
2988   const LLT S16 = LLT::scalar(16);
2989 
  // Fixup illegal register types for i8 and i16 stores.
2991   if (Ty == LLT::scalar(8) || Ty == S16) {
2992     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2993     return AnyExt;
2994   }
2995 
2996   if (Ty.isVector()) {
2997     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2998       if (IsFormat)
2999         return handleD16VData(B, *MRI, VData);
3000     }
3001   }
3002 
3003   return VData;
3004 }
3005 
3006 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3007                                               MachineRegisterInfo &MRI,
3008                                               MachineIRBuilder &B,
3009                                               bool IsTyped,
3010                                               bool IsFormat) const {
3011   B.setInstr(MI);
3012 
3013   Register VData = MI.getOperand(1).getReg();
3014   LLT Ty = MRI.getType(VData);
3015   LLT EltTy = Ty.getScalarType();
3016   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3017   const LLT S32 = LLT::scalar(32);
3018 
3019   VData = fixStoreSourceType(B, VData, IsFormat);
3020   Register RSrc = MI.getOperand(2).getReg();
3021 
3022   MachineMemOperand *MMO = *MI.memoperands_begin();
3023   const int MemSize = MMO->getSize();
3024 
3025   unsigned ImmOffset;
3026   unsigned TotalOffset;
3027 
3028   // The typed intrinsics add an immediate after the registers.
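  // Operand layout after the intrinsic ID:
  //   vdata, rsrc, [vindex,] voffset, soffset, [format,] aux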
3029   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3030 
3031   // The struct intrinsic variants add one additional operand over raw.
3032   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3033   Register VIndex;
3034   int OpOffset = 0;
3035   if (HasVIndex) {
3036     VIndex = MI.getOperand(3).getReg();
3037     OpOffset = 1;
3038   }
3039 
3040   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3041   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3042 
3043   unsigned Format = 0;
3044   if (IsTyped) {
3045     Format = MI.getOperand(5 + OpOffset).getImm();
3046     ++OpOffset;
3047   }
3048 
3049   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3050 
3051   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3052   if (TotalOffset != 0)
3053     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3054 
3055   unsigned Opc;
3056   if (IsTyped) {
3057     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3058                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3059   } else if (IsFormat) {
3060     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3061                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3062   } else {
3063     switch (MemSize) {
3064     case 1:
3065       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3066       break;
3067     case 2:
3068       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3069       break;
3070     default:
3071       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3072       break;
3073     }
3074   }
3075 
3076   if (!VIndex)
3077     VIndex = B.buildConstant(S32, 0).getReg(0);
3078 
3079   auto MIB = B.buildInstr(Opc)
3080     .addUse(VData)              // vdata
3081     .addUse(RSrc)               // rsrc
3082     .addUse(VIndex)             // vindex
3083     .addUse(VOffset)            // voffset
3084     .addUse(SOffset)            // soffset
3085     .addImm(ImmOffset);         // offset(imm)
3086 
3087   if (IsTyped)
3088     MIB.addImm(Format);
3089 
3090   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3091      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3092      .addMemOperand(MMO);
3093 
3094   MI.eraseFromParent();
3095   return true;
3096 }
3097 
3098 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3099                                              MachineRegisterInfo &MRI,
3100                                              MachineIRBuilder &B,
3101                                              bool IsFormat,
3102                                              bool IsTyped) const {
3103   B.setInstr(MI);
3104 
3105   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3106   MachineMemOperand *MMO = *MI.memoperands_begin();
3107   const int MemSize = MMO->getSize();
3108   const LLT S32 = LLT::scalar(32);
3109 
3110   Register Dst = MI.getOperand(0).getReg();
3111   Register RSrc = MI.getOperand(2).getReg();
3112 
3113   // The typed intrinsics add an immediate after the registers.
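  // Operand layout: dst, ID, rsrc, [vindex,] voffset, soffset, [format,] aux.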
3114   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3115 
3116   // The struct intrinsic variants add one additional operand over raw.
3117   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3118   Register VIndex;
3119   int OpOffset = 0;
3120   if (HasVIndex) {
3121     VIndex = MI.getOperand(3).getReg();
3122     OpOffset = 1;
3123   }
3124 
3125   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3126   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3127 
3128   unsigned Format = 0;
3129   if (IsTyped) {
3130     Format = MI.getOperand(5 + OpOffset).getImm();
3131     ++OpOffset;
3132   }
3133 
3134   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3135   unsigned ImmOffset;
3136   unsigned TotalOffset;
3137 
3138   LLT Ty = MRI.getType(Dst);
3139   LLT EltTy = Ty.getScalarType();
3140   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3141   const bool Unpacked = ST.hasUnpackedD16VMem();
3142 
3143   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3144   if (TotalOffset != 0)
3145     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3146 
3147   unsigned Opc;
3148 
3149   if (IsTyped) {
3150     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3151                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3152   } else if (IsFormat) {
3153     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3154                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3155   } else {
3156     switch (MemSize) {
3157     case 1:
3158       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3159       break;
3160     case 2:
3161       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3162       break;
3163     default:
3164       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3165       break;
3166     }
3167   }
3168 
3169   Register LoadDstReg;
3170 
3171   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3172   LLT UnpackedTy = Ty.changeElementSize(32);
3173 
3174   if (IsExtLoad)
3175     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3176   else if (Unpacked && IsD16 && Ty.isVector())
3177     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3178   else
3179     LoadDstReg = Dst;
3180 
3181   if (!VIndex)
3182     VIndex = B.buildConstant(S32, 0).getReg(0);
3183 
3184   auto MIB = B.buildInstr(Opc)
3185     .addDef(LoadDstReg)         // vdata
3186     .addUse(RSrc)               // rsrc
3187     .addUse(VIndex)             // vindex
3188     .addUse(VOffset)            // voffset
3189     .addUse(SOffset)            // soffset
3190     .addImm(ImmOffset);         // offset(imm)
3191 
3192   if (IsTyped)
3193     MIB.addImm(Format);
3194 
3195   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3196      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3197      .addMemOperand(MMO);
3198 
3199   if (LoadDstReg != Dst) {
3200     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3201 
    // The result for extending loads was widened to 32 bits; truncate back to
    // the original type.
3203     if (IsExtLoad)
3204       B.buildTrunc(Dst, LoadDstReg);
3205     else {
3206       // Repack to original 16-bit vector result
3207       // FIXME: G_TRUNC should work, but legalization currently fails
3208       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3209       SmallVector<Register, 4> Repack;
3210       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3211         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3212       B.buildMerge(Dst, Repack);
3213     }
3214   }
3215 
3216   MI.eraseFromParent();
3217   return true;
3218 }
3219 
3220 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3221                                                MachineIRBuilder &B,
3222                                                bool IsInc) const {
3223   B.setInstr(MI);
3224   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3225                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
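  // Operand 1 is the intrinsic ID; the pointer (operand 2) and the data value
  // (operand 3) are forwarded to the target pseudo along with the memory
  // operand.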
3226   B.buildInstr(Opc)
3227     .addDef(MI.getOperand(0).getReg())
3228     .addUse(MI.getOperand(2).getReg())
3229     .addUse(MI.getOperand(3).getReg())
3230     .cloneMemRefs(MI);
3231   MI.eraseFromParent();
3232   return true;
3233 }
3234 
3235 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3236   switch (IntrID) {
3237   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3238   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3239     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3240   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3241   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3242     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3243   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3244   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3245     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3246   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3247   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3248     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3249   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3250   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3251     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3252   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3253   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3254     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3255   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3256   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3257     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3258   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3259   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3260     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3261   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3262   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3263     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3264   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3265   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3266     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3267   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3268   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3269     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3270   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3271   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3272     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3273   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3274   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3275     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3276   default:
3277     llvm_unreachable("unhandled atomic opcode");
3278   }
3279 }
3280 
3281 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3282                                                MachineIRBuilder &B,
3283                                                Intrinsic::ID IID) const {
3284   B.setInstr(MI);
3285 
3286   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3287                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
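  // Operand layout: dst, ID, vdata, [cmp,] rsrc, [vindex,] voffset, soffset,
  // aux.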
3288 
3289   Register Dst = MI.getOperand(0).getReg();
3290   Register VData = MI.getOperand(2).getReg();
3291 
3292   Register CmpVal;
3293   int OpOffset = 0;
3294 
3295   if (IsCmpSwap) {
3296     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3297     ++OpOffset;
3298   }
3299 
3300   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3301   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3302 
3303   // The struct intrinsic variants add one additional operand over raw.
3304   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3305   Register VIndex;
3306   if (HasVIndex) {
3307     VIndex = MI.getOperand(4 + OpOffset).getReg();
3308     ++OpOffset;
3309   }
3310 
3311   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3312   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3313   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3314 
3315   MachineMemOperand *MMO = *MI.memoperands_begin();
3316 
3317   unsigned ImmOffset;
3318   unsigned TotalOffset;
3319   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3320   if (TotalOffset != 0)
3321     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3322 
3323   if (!VIndex)
3324     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3325 
3326   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3327     .addDef(Dst)
3328     .addUse(VData); // vdata
3329 
3330   if (IsCmpSwap)
3331     MIB.addReg(CmpVal);
3332 
3333   MIB.addUse(RSrc)               // rsrc
3334      .addUse(VIndex)             // vindex
3335      .addUse(VOffset)            // voffset
3336      .addUse(SOffset)            // soffset
3337      .addImm(ImmOffset)          // offset(imm)
3338      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3339      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3340      .addMemOperand(MMO);
3341 
3342   MI.eraseFromParent();
3343   return true;
3344 }
3345 
3346 // Produce a vector of s16 elements from s32 pieces.
3347 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3348                              ArrayRef<Register> UnmergeParts) {
3349   const LLT S16 = LLT::scalar(16);
3350 
3351   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3352   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3353     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3354 
3355   B.buildBuildVector(DstReg, RemergeParts);
3356 }
3357 
3358 /// Convert a set of s32 registers to a result vector with s16 elements.
3359 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3360                                ArrayRef<Register> UnmergeParts) {
3361   MachineRegisterInfo &MRI = *B.getMRI();
3362   const LLT V2S16 = LLT::vector(2, 16);
3363   LLT TargetTy = MRI.getType(DstReg);
3364   int NumElts = UnmergeParts.size();
3365 
3366   if (NumElts == 1) {
3367     assert(TargetTy == V2S16);
3368     B.buildBitcast(DstReg, UnmergeParts[0]);
3369     return;
3370   }
3371 
3372   SmallVector<Register, 4> RemergeParts(NumElts);
3373   for (int I = 0; I != NumElts; ++I)
3374     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3375 
3376   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3377     B.buildConcatVectors(DstReg, RemergeParts);
3378     return;
3379   }
3380 
3381   const LLT V3S16 = LLT::vector(3, 16);
3382   const LLT V6S16 = LLT::vector(6, 16);
3383 
3384   // Widen to v6s16 and unpack v3 parts.
3385   assert(TargetTy == V3S16);
3386 
3387   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
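  // The v2s16 pieces plus an undef pad concatenate into a v6s16, which is then
  // unmerged into the v3s16 result and a dead v3s16 half.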
3388   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3389   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3390 }
3391 
// FIXME: A plain vector trunc should be sufficient, but its legalization is
// currently broken.
3394 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3395                                   Register WideDstReg) {
3396   const LLT S32 = LLT::scalar(32);
3397   const LLT S16 = LLT::scalar(16);
3398 
3399   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3400 
3401   int NumOps = Unmerge->getNumOperands() - 1;
3402   SmallVector<Register, 4> RemergeParts(NumOps);
3403   for (int I = 0; I != NumOps; ++I)
3404     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3405 
3406   B.buildBuildVector(DstReg, RemergeParts);
3407 }
3408 
3409 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3410     MachineInstr &MI, MachineIRBuilder &B,
3411     GISelChangeObserver &Observer,
3412     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3413   bool IsTFE = MI.getNumExplicitDefs() == 2;
3414 
  // We only need to touch the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or when the TFE result needs to be
  // repacked.
3417 
3418   // TODO: Need to handle a16 images too
3419   // TODO: Do we need to guard against already legalized intrinsics?
3420   if (!IsTFE && !ST.hasUnpackedD16VMem())
3421     return true;
3422 
3423   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3424     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3425 
3426   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3427     return true;
3428 
3429   B.setInstr(MI);
3430 
3431   MachineRegisterInfo *MRI = B.getMRI();
3432   const LLT S32 = LLT::scalar(32);
3433   const LLT S16 = LLT::scalar(16);
3434 
3435   if (BaseOpcode->Store) { // No TFE for stores?
3436     Register VData = MI.getOperand(1).getReg();
3437     LLT Ty = MRI->getType(VData);
3438     if (!Ty.isVector() || Ty.getElementType() != S16)
3439       return true;
3440 
3443     Observer.changingInstr(MI);
3444     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3445     Observer.changedInstr(MI);
3446     return true;
3447   }
3448 
3449   Register DstReg = MI.getOperand(0).getReg();
3450   LLT Ty = MRI->getType(DstReg);
3451   const LLT EltTy = Ty.getScalarType();
3452   const bool IsD16 = Ty.getScalarType() == S16;
3453   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3454 
3455   if (IsTFE) {
3456     // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
3458     // register, with one additional dword beyond the loaded data. Rewrite the
3459     // return type to use a single register result.
3460     Register Dst1Reg = MI.getOperand(1).getReg();
3461     if (MRI->getType(Dst1Reg) != S32)
3462       return false;
3463 
3464     // TODO: Make sure the TFE operand bit is set.
3465 
3466     // The raw dword aligned data component of the load. The only legal cases
3467     // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3469     LLT RoundedTy;
3470     LLT TFETy;
3471 
3472     if (IsD16 && ST.hasUnpackedD16VMem()) {
3473       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3474       TFETy = LLT::vector(NumElts + 1, 32);
3475     } else {
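      // e.g. a packed <3 x s16> d16 result rounds to RoundedTy = <4 x s16>
      // with TFETy = <3 x s32> (two data dwords plus the TFE dword).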
3476       unsigned EltSize = Ty.getScalarSizeInBits();
3477       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3478       unsigned RoundedSize = 32 * RoundedElts;
3479       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3480       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3481     }
3482 
3483     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3484     Observer.changingInstr(MI);
3485 
3486     MI.getOperand(0).setReg(TFEReg);
3487     MI.RemoveOperand(1);
3488 
3489     Observer.changedInstr(MI);
3490 
3491     // Insert after the instruction.
3492     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3493 
3494     // Now figure out how to copy the new result register back into the old
3495     // result.
3496 
3497     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3498     int NumDataElts = TFETy.getNumElements() - 1;
3499 
3500     if (!Ty.isVector()) {
3501       // Simplest case is a trivial unmerge (plus a truncate for d16).
3502       UnmergeResults[0] = Ty == S32 ?
3503         DstReg : MRI->createGenericVirtualRegister(S32);
3504 
3505       B.buildUnmerge(UnmergeResults, TFEReg);
3506       if (Ty != S32)
3507         B.buildTrunc(DstReg, UnmergeResults[0]);
3508       return true;
3509     }
3510 
3511     // We have to repack into a new vector of some kind.
3512     for (int I = 0; I != NumDataElts; ++I)
3513       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3514     B.buildUnmerge(UnmergeResults, TFEReg);
3515 
3516     // Drop the final TFE element.
3517     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3518 
3519     if (EltTy == S32)
3520       B.buildBuildVector(DstReg, DataPart);
3521     else if (ST.hasUnpackedD16VMem())
3522       truncToS16Vector(B, DstReg, DataPart);
3523     else
3524       bitcastToS16Vector(B, DstReg, DataPart);
3525 
3526     return true;
3527   }
3528 
3529   // Must be an image load.
3530   if (!Ty.isVector() || Ty.getElementType() != S16)
3531     return true;
3532 
3533   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3534 
3535   LLT WidenedTy = Ty.changeElementType(S32);
3536   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3537 
3538   Observer.changingInstr(MI);
3539   MI.getOperand(0).setReg(WideDstReg);
3540   Observer.changedInstr(MI);
3541 
3542   repackUnpackedD16Load(B, DstReg, WideDstReg);
3543   return true;
3544 }
3545 
3546 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3547   MachineInstr &MI, MachineIRBuilder &B,
3548   GISelChangeObserver &Observer) const {
3549   Register Dst = MI.getOperand(0).getReg();
3550   LLT Ty = B.getMRI()->getType(Dst);
3551   unsigned Size = Ty.getSizeInBits();
3552   MachineFunction &MF = B.getMF();
3553 
3554   Observer.changingInstr(MI);
3555 
3556   // FIXME: We don't really need this intermediate instruction. The intrinsic
3557   // should be fixed to have a memory operand. Since it's readnone, we're not
3558   // allowed to add one.
3559   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3560   MI.RemoveOperand(1); // Remove intrinsic ID
3561 
3562   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3563   // TODO: Should this use datalayout alignment?
3564   const unsigned MemSize = (Size + 7) / 8;
3565   const unsigned MemAlign = 4;
3566   MachineMemOperand *MMO = MF.getMachineMemOperand(
3567     MachinePointerInfo(),
3568     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3569     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3570   MI.addMemOperand(MF, MMO);
3571 
3572   // There are no 96-bit result scalar loads, but widening to 128-bit should
3573   // always be legal. We may need to restore this to a 96-bit result if it turns
3574   // out this needs to be converted to a vector load during RegBankSelect.
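  // For example, a 96-bit <3 x s32> result is widened here to <4 x s32>.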
3575   if (!isPowerOf2_32(Size)) {
3576     LegalizerHelper Helper(MF, *this, Observer, B);
3577     B.setInstr(MI);
3578 
3579     if (Ty.isVector())
3580       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3581     else
3582       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3583   }
3584 
3585   Observer.changedInstr(MI);
3586   return true;
3587 }
3588 
3589 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
3590                                                 MachineRegisterInfo &MRI,
3591                                                 MachineIRBuilder &B) const {
3592   B.setInstr(MI);
3593 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
3595   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3596       !ST.isTrapHandlerEnabled()) {
3597     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3598   } else {
    // Pass the queue pointer to the trap handler as input, and insert the
    // trap instruction.
3600     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3601     const ArgDescriptor *Arg =
3602         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3603     if (!Arg)
3604       return false;
3605     MachineRegisterInfo &MRI = *B.getMRI();
3606     Register SGPR01(AMDGPU::SGPR0_SGPR1);
3607     Register LiveIn = getLiveInRegister(
3608         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3609         /*InsertLiveInCopy=*/false);
3610     if (!loadInputValue(LiveIn, B, Arg))
3611       return false;
3612     B.buildCopy(SGPR01, LiveIn);
3613     B.buildInstr(AMDGPU::S_TRAP)
3614         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3615         .addReg(SGPR01, RegState::Implicit);
3616   }
3617 
3618   MI.eraseFromParent();
3619   return true;
3620 }
3621 
3622 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3623     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3624   B.setInstr(MI);
3625 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
3628   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3629       !ST.isTrapHandlerEnabled()) {
3630     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3631                                      "debugtrap handler not supported",
3632                                      MI.getDebugLoc(), DS_Warning);
3633     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3634     Ctx.diagnose(NoTrap);
3635   } else {
3636     // Insert debug-trap instruction
3637     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3638   }
3639 
3640   MI.eraseFromParent();
3641   return true;
3642 }
3643 
3644 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3645                                             MachineIRBuilder &B,
3646                                             GISelChangeObserver &Observer) const {
3647   MachineRegisterInfo &MRI = *B.getMRI();
3648 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
3650   auto IntrID = MI.getIntrinsicID();
3651   switch (IntrID) {
3652   case Intrinsic::amdgcn_if:
3653   case Intrinsic::amdgcn_else: {
3654     MachineInstr *Br = nullptr;
3655     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3656       const SIRegisterInfo *TRI
3657         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3658 
3659       B.setInstr(*BrCond);
3660       Register Def = MI.getOperand(1).getReg();
3661       Register Use = MI.getOperand(3).getReg();
3662 
3663       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3664       if (Br)
3665         BrTarget = Br->getOperand(0).getMBB();
3666 
3667       if (IntrID == Intrinsic::amdgcn_if) {
3668         B.buildInstr(AMDGPU::SI_IF)
3669           .addDef(Def)
3670           .addUse(Use)
3671           .addMBB(BrTarget);
3672       } else {
3673         B.buildInstr(AMDGPU::SI_ELSE)
3674           .addDef(Def)
3675           .addUse(Use)
3676           .addMBB(BrTarget)
3677           .addImm(0);
3678       }
3679 
3680       if (Br)
3681         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3682 
3683       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3684       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3685       MI.eraseFromParent();
3686       BrCond->eraseFromParent();
3687       return true;
3688     }
3689 
3690     return false;
3691   }
3692   case Intrinsic::amdgcn_loop: {
3693     MachineInstr *Br = nullptr;
3694     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3695       const SIRegisterInfo *TRI
3696         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3697 
3698       B.setInstr(*BrCond);
3699 
3700       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3701       if (Br)
3702         BrTarget = Br->getOperand(0).getMBB();
3703 
3704       Register Reg = MI.getOperand(2).getReg();
3705       B.buildInstr(AMDGPU::SI_LOOP)
3706         .addUse(Reg)
3707         .addMBB(BrTarget);
3708 
3709       if (Br)
3710         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3711 
3712       MI.eraseFromParent();
3713       BrCond->eraseFromParent();
3714       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3715       return true;
3716     }
3717 
3718     return false;
3719   }
3720   case Intrinsic::amdgcn_kernarg_segment_ptr:
3721     return legalizePreloadedArgIntrin(
3722       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3723   case Intrinsic::amdgcn_implicitarg_ptr:
3724     return legalizeImplicitArgPtr(MI, MRI, B);
3725   case Intrinsic::amdgcn_workitem_id_x:
3726     return legalizePreloadedArgIntrin(MI, MRI, B,
3727                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3728   case Intrinsic::amdgcn_workitem_id_y:
3729     return legalizePreloadedArgIntrin(MI, MRI, B,
3730                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3731   case Intrinsic::amdgcn_workitem_id_z:
3732     return legalizePreloadedArgIntrin(MI, MRI, B,
3733                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3734   case Intrinsic::amdgcn_workgroup_id_x:
3735     return legalizePreloadedArgIntrin(MI, MRI, B,
3736                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3737   case Intrinsic::amdgcn_workgroup_id_y:
3738     return legalizePreloadedArgIntrin(MI, MRI, B,
3739                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3740   case Intrinsic::amdgcn_workgroup_id_z:
3741     return legalizePreloadedArgIntrin(MI, MRI, B,
3742                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3743   case Intrinsic::amdgcn_dispatch_ptr:
3744     return legalizePreloadedArgIntrin(MI, MRI, B,
3745                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3746   case Intrinsic::amdgcn_queue_ptr:
3747     return legalizePreloadedArgIntrin(MI, MRI, B,
3748                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3749   case Intrinsic::amdgcn_implicit_buffer_ptr:
3750     return legalizePreloadedArgIntrin(
3751       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3752   case Intrinsic::amdgcn_dispatch_id:
3753     return legalizePreloadedArgIntrin(MI, MRI, B,
3754                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3755   case Intrinsic::amdgcn_fdiv_fast:
3756     return legalizeFDIVFastIntrin(MI, MRI, B);
3757   case Intrinsic::amdgcn_is_shared:
3758     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3759   case Intrinsic::amdgcn_is_private:
3760     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3761   case Intrinsic::amdgcn_wavefrontsize: {
3762     B.setInstr(MI);
3763     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3764     MI.eraseFromParent();
3765     return true;
3766   }
3767   case Intrinsic::amdgcn_s_buffer_load:
3768     return legalizeSBufferLoad(MI, B, Observer);
3769   case Intrinsic::amdgcn_raw_buffer_store:
3770   case Intrinsic::amdgcn_struct_buffer_store:
3771     return legalizeBufferStore(MI, MRI, B, false, false);
3772   case Intrinsic::amdgcn_raw_buffer_store_format:
3773   case Intrinsic::amdgcn_struct_buffer_store_format:
3774     return legalizeBufferStore(MI, MRI, B, false, true);
3775   case Intrinsic::amdgcn_raw_tbuffer_store:
3776   case Intrinsic::amdgcn_struct_tbuffer_store:
3777     return legalizeBufferStore(MI, MRI, B, true, true);
3778   case Intrinsic::amdgcn_raw_buffer_load:
3779   case Intrinsic::amdgcn_struct_buffer_load:
3780     return legalizeBufferLoad(MI, MRI, B, false, false);
3781   case Intrinsic::amdgcn_raw_buffer_load_format:
3782   case Intrinsic::amdgcn_struct_buffer_load_format:
3783     return legalizeBufferLoad(MI, MRI, B, true, false);
3784   case Intrinsic::amdgcn_raw_tbuffer_load:
3785   case Intrinsic::amdgcn_struct_tbuffer_load:
3786     return legalizeBufferLoad(MI, MRI, B, true, true);
3787   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3788   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3789   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3790   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3791   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3792   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3793   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3794   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3795   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3796   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3797   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3798   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3799   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3800   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3801   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3802   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3803   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3804   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3805   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3806   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3807   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3808   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3809   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3810   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3811   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3812   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3813     return legalizeBufferAtomic(MI, B, IntrID);
3814   case Intrinsic::amdgcn_atomic_inc:
3815     return legalizeAtomicIncDec(MI, B, true);
3816   case Intrinsic::amdgcn_atomic_dec:
3817     return legalizeAtomicIncDec(MI, B, false);
3818   case Intrinsic::trap:
3819     return legalizeTrapIntrinsic(MI, MRI, B);
3820   case Intrinsic::debugtrap:
3821     return legalizeDebugTrapIntrinsic(MI, MRI, B);
3822   default: {
3823     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3824             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3825       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3826     return true;
3827   }
3828   }
3829 
3830   return true;
3831 }
3832