1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of vector elements up to the next power of two.
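// (e.g. <3 x s32> becomes <4 x s32>, <5 x s16> becomes <8 x s16>).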
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar width up to the next power of two bits.
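// (e.g. s24 becomes s32, s65 becomes s128).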
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
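// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. <3 x s16> (48 bits) or <5 x s8> (40 bits).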
69 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
70   return [=](const LegalityQuery &Query) {
71     const LLT Ty = Query.Types[TypeIdx];
72     return Ty.isVector() &&
73            Ty.getNumElements() % 2 != 0 &&
74            Ty.getElementType().getSizeInBits() < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
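// Match vectors of 16-bit elements with more than two elements, i.e. anything
// wider than v2s16.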
79 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     const LLT EltTy = Ty.getScalarType();
83     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
84   };
85 }
86 
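// Grow the vector type by a single element, e.g. <3 x s16> -> <4 x s16>.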
87 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getElementType();
91     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
92   };
93 }
94 
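// Reduce the element count so that each resulting piece fits in at most 64
// bits, e.g. <4 x s32> (128 bits) is split into <2 x s32> pieces.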
95 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     unsigned Size = Ty.getSizeInBits();
100     unsigned Pieces = (Size + 63) / 64;
101     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
102     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
103   };
104 }
105 
106 // Increase the number of vector elements so the total size reaches the next
107 // multiple of 32 bits.
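// (e.g. <3 x s8> (24 bits) becomes <4 x s8>, <5 x s16> (80 bits) becomes
// <6 x s16>).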
108 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
109   return [=](const LegalityQuery &Query) {
110     const LLT Ty = Query.Types[TypeIdx];
111 
112     const LLT EltTy = Ty.getElementType();
113     const int Size = Ty.getSizeInBits();
114     const int EltSize = EltTy.getSizeInBits();
115     const int NextMul32 = (Size + 31) / 32;
116 
117     assert(EltSize < 32);
118 
119     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
120     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
121   };
122 }
123 
124 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
125   return [=](const LegalityQuery &Query) {
126     const LLT QueryTy = Query.Types[TypeIdx];
127     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
128   };
129 }
130 
131 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
132   return [=](const LegalityQuery &Query) {
133     const LLT QueryTy = Query.Types[TypeIdx];
134     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
135   };
136 }
137 
138 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT QueryTy = Query.Types[TypeIdx];
141     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
142   };
143 }
144 
145 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
146 // v2s16.
147 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     if (Ty.isVector()) {
151       const int EltSize = Ty.getElementType().getSizeInBits();
152       return EltSize == 32 || EltSize == 64 ||
153             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
154              EltSize == 128 || EltSize == 256;
155     }
156 
157     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
158   };
159 }
160 
161 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getElementType() == Type;
165   };
166 }
167 
168 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     if (!QueryTy.isVector())
172       return false;
173     const LLT EltTy = QueryTy.getElementType();
174     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
175   };
176 }
177 
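// Match a truncating store of a scalar wider than 32 bits, i.e. the memory
// size is smaller than the width of the stored value.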
178 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT Ty = Query.Types[TypeIdx];
181     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
182            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
183   };
184 }
185 
186 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
187   return [=](const LegalityQuery &Query) {
188     return Query.Types[TypeIdx0].getSizeInBits() <
189            Query.Types[TypeIdx1].getSizeInBits();
190   };
191 }
192 
193 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
194   return [=](const LegalityQuery &Query) {
195     return Query.Types[TypeIdx0].getSizeInBits() >
196            Query.Types[TypeIdx1].getSizeInBits();
197   };
198 }
199 
200 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
201                                          const GCNTargetMachine &TM)
202   : ST(ST_) {
203   using namespace TargetOpcode;
204 
205   auto GetAddrSpacePtr = [&TM](unsigned AS) {
206     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
207   };
208 
209   const LLT S1 = LLT::scalar(1);
210   const LLT S16 = LLT::scalar(16);
211   const LLT S32 = LLT::scalar(32);
212   const LLT S64 = LLT::scalar(64);
213   const LLT S128 = LLT::scalar(128);
214   const LLT S256 = LLT::scalar(256);
215   const LLT S512 = LLT::scalar(512);
216   const LLT S1024 = LLT::scalar(1024);
217 
218   const LLT V2S16 = LLT::vector(2, 16);
219   const LLT V4S16 = LLT::vector(4, 16);
220 
221   const LLT V2S32 = LLT::vector(2, 32);
222   const LLT V3S32 = LLT::vector(3, 32);
223   const LLT V4S32 = LLT::vector(4, 32);
224   const LLT V5S32 = LLT::vector(5, 32);
225   const LLT V6S32 = LLT::vector(6, 32);
226   const LLT V7S32 = LLT::vector(7, 32);
227   const LLT V8S32 = LLT::vector(8, 32);
228   const LLT V9S32 = LLT::vector(9, 32);
229   const LLT V10S32 = LLT::vector(10, 32);
230   const LLT V11S32 = LLT::vector(11, 32);
231   const LLT V12S32 = LLT::vector(12, 32);
232   const LLT V13S32 = LLT::vector(13, 32);
233   const LLT V14S32 = LLT::vector(14, 32);
234   const LLT V15S32 = LLT::vector(15, 32);
235   const LLT V16S32 = LLT::vector(16, 32);
236   const LLT V32S32 = LLT::vector(32, 32);
237 
238   const LLT V2S64 = LLT::vector(2, 64);
239   const LLT V3S64 = LLT::vector(3, 64);
240   const LLT V4S64 = LLT::vector(4, 64);
241   const LLT V5S64 = LLT::vector(5, 64);
242   const LLT V6S64 = LLT::vector(6, 64);
243   const LLT V7S64 = LLT::vector(7, 64);
244   const LLT V8S64 = LLT::vector(8, 64);
245   const LLT V16S64 = LLT::vector(16, 64);
246 
247   std::initializer_list<LLT> AllS32Vectors =
248     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
249      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
250   std::initializer_list<LLT> AllS64Vectors =
251     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
252 
253   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
254   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
255   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
256   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
257   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
258   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
259   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
260 
261   const LLT CodePtr = FlatPtr;
262 
263   const std::initializer_list<LLT> AddrSpaces64 = {
264     GlobalPtr, ConstantPtr, FlatPtr
265   };
266 
267   const std::initializer_list<LLT> AddrSpaces32 = {
268     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
269   };
270 
271   const std::initializer_list<LLT> FPTypesBase = {
272     S32, S64
273   };
274 
275   const std::initializer_list<LLT> FPTypes16 = {
276     S32, S64, S16
277   };
278 
279   const std::initializer_list<LLT> FPTypesPK16 = {
280     S32, S64, S16, V2S16
281   };
282 
283   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
284 
285   setAction({G_BRCOND, S1}, Legal); // VCC branches
286   setAction({G_BRCOND, S32}, Legal); // SCC branches
287 
288   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
289   // elements for v3s16
290   getActionDefinitionsBuilder(G_PHI)
291     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
292     .legalFor(AllS32Vectors)
293     .legalFor(AllS64Vectors)
294     .legalFor(AddrSpaces64)
295     .legalFor(AddrSpaces32)
296     .clampScalar(0, S32, S256)
297     .widenScalarToNextPow2(0, 32)
298     .clampMaxNumElements(0, S32, 16)
299     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
300     .legalIf(isPointer(0));
301 
302   if (ST.hasVOP3PInsts()) {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32, S16, V2S16})
305       .clampScalar(0, S16, S32)
306       .clampMaxNumElements(0, S16, 2)
307       .scalarize(0)
308       .widenScalarToNextPow2(0, 32);
309   } else if (ST.has16BitInsts()) {
310     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
311       .legalFor({S32, S16})
312       .clampScalar(0, S16, S32)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32})
318       .clampScalar(0, S32, S32)
319       .scalarize(0);
320   }
321 
322   // FIXME: Not really legal. Placeholder for custom lowering.
323   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
324     .customFor({S32, S64})
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0, 32)
327     .scalarize(0);
328 
329   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
330     .legalFor({S32})
331     .clampScalar(0, S32, S32)
332     .scalarize(0);
333 
334   // Report legal for any types we can handle anywhere. For the cases only legal
335   // on the SALU, RegBankSelect will be able to re-legalize.
336   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
337     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
338     .clampScalar(0, S32, S64)
339     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
340     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
341     .widenScalarToNextPow2(0)
342     .scalarize(0);
343 
344   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
345                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
346     .legalFor({{S32, S1}, {S32, S32}})
347     .minScalar(0, S32)
348     // TODO: .scalarize(0)
349     .lower();
350 
351   getActionDefinitionsBuilder(G_BITCAST)
352     // Don't worry about the size constraint.
353     .legalIf(all(isRegisterType(0), isRegisterType(1)))
354     .lower();
355 
356 
357   getActionDefinitionsBuilder(G_CONSTANT)
358     .legalFor({S1, S32, S64, S16, GlobalPtr,
359                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
360     .clampScalar(0, S32, S64)
361     .widenScalarToNextPow2(0)
362     .legalIf(isPointer(0));
363 
364   getActionDefinitionsBuilder(G_FCONSTANT)
365     .legalFor({S32, S64, S16})
366     .clampScalar(0, S16, S64);
367 
368   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
369       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
370                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
371       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
372       .clampScalarOrElt(0, S32, S1024)
373       .legalIf(isMultiple32(0))
374       .widenScalarToNextPow2(0, 32)
375       .clampMaxNumElements(0, S32, 16);
376 
377   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
378   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
379     .unsupportedFor({PrivatePtr})
380     .custom();
381   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
382 
383   auto &FPOpActions = getActionDefinitionsBuilder(
384     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
385     .legalFor({S32, S64});
386   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
387     .customFor({S32, S64});
388   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
389     .customFor({S32, S64});
390 
391   if (ST.has16BitInsts()) {
392     if (ST.hasVOP3PInsts())
393       FPOpActions.legalFor({S16, V2S16});
394     else
395       FPOpActions.legalFor({S16});
396 
397     TrigActions.customFor({S16});
398     FDIVActions.customFor({S16});
399   }
400 
401   auto &MinNumMaxNum = getActionDefinitionsBuilder({
402       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
403 
404   if (ST.hasVOP3PInsts()) {
405     MinNumMaxNum.customFor(FPTypesPK16)
406       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
407       .clampMaxNumElements(0, S16, 2)
408       .clampScalar(0, S16, S64)
409       .scalarize(0);
410   } else if (ST.has16BitInsts()) {
411     MinNumMaxNum.customFor(FPTypes16)
412       .clampScalar(0, S16, S64)
413       .scalarize(0);
414   } else {
415     MinNumMaxNum.customFor(FPTypesBase)
416       .clampScalar(0, S32, S64)
417       .scalarize(0);
418   }
419 
420   if (ST.hasVOP3PInsts())
421     FPOpActions.clampMaxNumElements(0, S16, 2);
422 
423   FPOpActions
424     .scalarize(0)
425     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
426 
427   TrigActions
428     .scalarize(0)
429     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
430 
431   FDIVActions
432     .scalarize(0)
433     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
434 
435   getActionDefinitionsBuilder({G_FNEG, G_FABS})
436     .legalFor(FPTypesPK16)
437     .clampMaxNumElements(0, S16, 2)
438     .scalarize(0)
439     .clampScalar(0, S16, S64);
440 
441   if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
443       .legalFor({S32, S64, S16})
444       .scalarize(0)
445       .clampScalar(0, S16, S64);
446   } else {
447     getActionDefinitionsBuilder(G_FSQRT)
448       .legalFor({S32, S64})
449       .scalarize(0)
450       .clampScalar(0, S32, S64);
451 
452     if (ST.hasFractBug()) {
453       getActionDefinitionsBuilder(G_FFLOOR)
454         .customFor({S64})
455         .legalFor({S32, S64})
456         .scalarize(0)
457         .clampScalar(0, S32, S64);
458     } else {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .legalFor({S32, S64})
461         .scalarize(0)
462         .clampScalar(0, S32, S64);
463     }
464   }
465 
466   getActionDefinitionsBuilder(G_FPTRUNC)
467     .legalFor({{S32, S64}, {S16, S32}})
468     .scalarize(0)
469     .lower();
470 
471   getActionDefinitionsBuilder(G_FPEXT)
472     .legalFor({{S64, S32}, {S32, S16}})
473     .lowerFor({{S64, S16}}) // FIXME: Implement
474     .scalarize(0);
475 
476   getActionDefinitionsBuilder(G_FSUB)
477       // Use actual fsub instruction
478       .legalFor({S32})
479       // Must use fadd + fneg
480       .lowerFor({S64, S16, V2S16})
481       .scalarize(0)
482       .clampScalar(0, S32, S64);
483 
484   // Whether this is legal depends on the floating point mode for the function.
485   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
486   if (ST.hasMadF16())
487     FMad.customFor({S32, S16});
488   else
489     FMad.customFor({S32});
490   FMad.scalarize(0)
491       .lower();
492 
493   // TODO: Do we need to clamp maximum bitwidth?
494   getActionDefinitionsBuilder(G_TRUNC)
495     .legalIf(isScalar(0))
496     .legalFor({{V2S16, V2S32}})
497     .clampMaxNumElements(0, S16, 2)
498     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
499     // situations (like an invalid implicit use), we don't want to infinite loop
500     // in the legalizer.
501     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
502     .alwaysLegal();
503 
504   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
505     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
506                {S32, S1}, {S64, S1}, {S16, S1}})
507     .scalarize(0)
508     .clampScalar(0, S32, S64)
509     .widenScalarToNextPow2(1, 32);
510 
511   // TODO: Split s1->s64 during regbankselect for VALU.
512   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
513     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
514     .lowerFor({{S32, S64}})
515     .lowerIf(typeIs(1, S1))
516     .customFor({{S64, S64}});
517   if (ST.has16BitInsts())
518     IToFP.legalFor({{S16, S16}});
519   IToFP.clampScalar(1, S32, S64)
520        .scalarize(0)
521        .widenScalarToNextPow2(1);
522 
523   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
524     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
525     .customFor({{S64, S64}});
526   if (ST.has16BitInsts())
527     FPToI.legalFor({{S16, S16}});
528   else
529     FPToI.minScalar(1, S32);
530 
531   FPToI.minScalar(0, S32)
532        .scalarize(0)
533        .lower();
534 
535   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
536     .scalarize(0)
537     .lower();
538 
539   if (ST.has16BitInsts()) {
540     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
541       .legalFor({S16, S32, S64})
542       .clampScalar(0, S16, S64)
543       .scalarize(0);
544   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
545     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
546       .legalFor({S32, S64})
547       .clampScalar(0, S32, S64)
548       .scalarize(0);
549   } else {
550     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
551       .legalFor({S32})
552       .customFor({S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   }
556 
557   // FIXME: Clamp offset operand.
558   getActionDefinitionsBuilder(G_PTR_ADD)
559     .legalIf(isPointer(0))
560     .scalarize(0);
561 
562   getActionDefinitionsBuilder(G_PTRMASK)
563     .legalIf(typeInSet(1, {S64, S32}))
564     .minScalar(1, S32)
565     .maxScalarIf(sizeIs(0, 32), 1, S32)
566     .maxScalarIf(sizeIs(0, 64), 1, S64)
567     .scalarize(0);
568 
569   auto &CmpBuilder =
570     getActionDefinitionsBuilder(G_ICMP)
571     // The compare output type differs based on the register bank of the output,
572     // so make both s1 and s32 legal.
573     //
574     // Scalar compares producing output in scc will be promoted to s32, as that
575     // is the allocatable register type that will be needed for the copy from
576     // scc. This will be promoted during RegBankSelect, and we assume something
577     // before that won't try to use s32 result types.
578     //
579     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
580     // bank.
581     .legalForCartesianProduct(
582       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
583     .legalForCartesianProduct(
584       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
585   if (ST.has16BitInsts()) {
586     CmpBuilder.legalFor({{S1, S16}});
587   }
588 
589   CmpBuilder
590     .widenScalarToNextPow2(1)
591     .clampScalar(1, S32, S64)
592     .scalarize(0)
593     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
594 
595   getActionDefinitionsBuilder(G_FCMP)
596     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
597     .widenScalarToNextPow2(1)
598     .clampScalar(1, S32, S64)
599     .scalarize(0);
600 
601   // FIXME: fpow has a selection pattern that should move to custom lowering.
602   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
603   if (ST.has16BitInsts())
604     Exp2Ops.legalFor({S32, S16});
605   else
606     Exp2Ops.legalFor({S32});
607   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
608   Exp2Ops.scalarize(0);
609 
610   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
611   if (ST.has16BitInsts())
612     ExpOps.customFor({{S32}, {S16}});
613   else
614     ExpOps.customFor({S32});
615   ExpOps.clampScalar(0, MinScalarFPTy, S32)
616         .scalarize(0);
617 
618   // The 64-bit versions produce 32-bit results, but only on the SALU.
619   getActionDefinitionsBuilder(G_CTPOP)
620     .legalFor({{S32, S32}, {S32, S64}})
621     .clampScalar(0, S32, S32)
622     .clampScalar(1, S32, S64)
623     .scalarize(0)
624     .widenScalarToNextPow2(0, 32)
625     .widenScalarToNextPow2(1, 32);
626 
627   // The hardware instructions return a different result on 0 than the generic
628   // instructions expect. The hardware produces -1, but these produce the
629   // bitwidth.
630   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
631     .scalarize(0)
632     .clampScalar(0, S32, S32)
633     .clampScalar(1, S32, S64)
634     .widenScalarToNextPow2(0, 32)
635     .widenScalarToNextPow2(1, 32)
636     .lower();
637 
638   // The 64-bit versions produce 32-bit results, but only on the SALU.
639   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
640     .legalFor({{S32, S32}, {S32, S64}})
641     .clampScalar(0, S32, S32)
642     .clampScalar(1, S32, S64)
643     .scalarize(0)
644     .widenScalarToNextPow2(0, 32)
645     .widenScalarToNextPow2(1, 32);
646 
647   getActionDefinitionsBuilder(G_BITREVERSE)
648     .legalFor({S32})
649     .clampScalar(0, S32, S32)
650     .scalarize(0);
651 
652   if (ST.has16BitInsts()) {
653     getActionDefinitionsBuilder(G_BSWAP)
654       .legalFor({S16, S32, V2S16})
655       .clampMaxNumElements(0, S16, 2)
656       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
657       // narrowScalar limitation.
658       .widenScalarToNextPow2(0)
659       .clampScalar(0, S16, S32)
660       .scalarize(0);
661 
662     if (ST.hasVOP3PInsts()) {
663       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
664         .legalFor({S32, S16, V2S16})
665         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
666         .clampMaxNumElements(0, S16, 2)
667         .minScalar(0, S16)
668         .widenScalarToNextPow2(0)
669         .scalarize(0)
670         .lower();
671     } else {
672       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
673         .legalFor({S32, S16})
674         .widenScalarToNextPow2(0)
675         .minScalar(0, S16)
676         .scalarize(0)
677         .lower();
678     }
679   } else {
680     // TODO: Should have same legality without v_perm_b32
681     getActionDefinitionsBuilder(G_BSWAP)
682       .legalFor({S32})
683       .lowerIf(narrowerThan(0, 32))
684       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
685       // narrowScalar limitation.
686       .widenScalarToNextPow2(0)
687       .maxScalar(0, S32)
688       .scalarize(0)
689       .lower();
690 
691     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
692       .legalFor({S32})
693       .minScalar(0, S32)
694       .widenScalarToNextPow2(0)
695       .scalarize(0)
696       .lower();
697   }
698 
699   getActionDefinitionsBuilder(G_INTTOPTR)
700     // List the common cases
701     .legalForCartesianProduct(AddrSpaces64, {S64})
702     .legalForCartesianProduct(AddrSpaces32, {S32})
703     .scalarize(0)
704     // Accept any address space as long as the size matches
705     .legalIf(sameSize(0, 1))
706     .widenScalarIf(smallerThan(1, 0),
707       [](const LegalityQuery &Query) {
708         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
709       })
710     .narrowScalarIf(greaterThan(1, 0),
711       [](const LegalityQuery &Query) {
712         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
713       });
714 
715   getActionDefinitionsBuilder(G_PTRTOINT)
716     // List the common cases
717     .legalForCartesianProduct(AddrSpaces64, {S64})
718     .legalForCartesianProduct(AddrSpaces32, {S32})
719     .scalarize(0)
720     // Accept any address space as long as the size matches
721     .legalIf(sameSize(0, 1))
722     .widenScalarIf(smallerThan(0, 1),
723       [](const LegalityQuery &Query) {
724         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
725       })
726     .narrowScalarIf(
727       greaterThan(0, 1),
728       [](const LegalityQuery &Query) {
729         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
730       });
731 
732   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
733     .scalarize(0)
734     .custom();
735 
736   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
737   // handle some operations by just promoting the register during
738   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
739   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
740     switch (AS) {
741     // FIXME: Private element size.
742     case AMDGPUAS::PRIVATE_ADDRESS:
743       return 32;
744     // FIXME: Check subtarget
745     case AMDGPUAS::LOCAL_ADDRESS:
746       return ST.useDS128() ? 128 : 64;
747 
748     // Treat constant and global as identical. SMRD loads are sometimes usable
749     // for global loads (ideally constant address space should be eliminated)
750     // depending on the context. Legality cannot be context dependent, but
751     // RegBankSelect can split the load as necessary depending on the pointer
752     // register bank/uniformity and if the memory is invariant or not written in
753     // a kernel.
754     case AMDGPUAS::CONSTANT_ADDRESS:
755     case AMDGPUAS::GLOBAL_ADDRESS:
756       return IsLoad ? 512 : 128;
757     default:
758       return 128;
759     }
760   };
761 
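  // Return true if the memory access must be split into smaller pieces: vector
  // extloads, accesses exceeding the address-space limit, odd register counts
  // that cannot be widened, and under-aligned accesses the target rejects.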
762   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
763                                     bool IsLoad) -> bool {
764     const LLT DstTy = Query.Types[0];
765 
766     // Split vector extloads.
767     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
768     unsigned Align = Query.MMODescrs[0].AlignInBits;
769 
770     if (MemSize < DstTy.getSizeInBits())
771       MemSize = std::max(MemSize, Align);
772 
773     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
774       return true;
775 
776     const LLT PtrTy = Query.Types[1];
777     unsigned AS = PtrTy.getAddressSpace();
778     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
779       return true;
780 
781     // Catch weird sized loads that don't evenly divide into the access sizes
782     // TODO: May be able to widen depending on alignment etc.
783     unsigned NumRegs = (MemSize + 31) / 32;
784     if (NumRegs == 3) {
785       if (!ST.hasDwordx3LoadStores())
786         return true;
787     } else {
788       // If the alignment allows, these should have been widened.
789       if (!isPowerOf2_32(NumRegs))
790         return true;
791     }
792 
793     if (Align < MemSize) {
794       const SITargetLowering *TLI = ST.getTargetLowering();
795       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
796     }
797 
798     return false;
799   };
800 
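  // Widen a non-power-of-2 sized load to the next power of 2 (rather than
  // splitting it) only if the known alignment already covers the rounded-up
  // size and the result still fits the address-space limit.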
801   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
802     unsigned Size = Query.Types[0].getSizeInBits();
803     if (isPowerOf2_32(Size))
804       return false;
805 
806     if (Size == 96 && ST.hasDwordx3LoadStores())
807       return false;
808 
809     unsigned AddrSpace = Query.Types[1].getAddressSpace();
810     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
811       return false;
812 
813     unsigned Align = Query.MMODescrs[0].AlignInBits;
814     unsigned RoundedSize = NextPowerOf2(Size);
815     return (Align >= RoundedSize);
816   };
817 
818   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
819   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
820   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
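  // A required alignment of 0 effectively places no restriction on these
  // accesses, so subtargets with unaligned buffer access accept any alignment.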
821 
822   // TODO: Refine based on subtargets which support unaligned access or 128-bit
823   // LDS
824   // TODO: Unsupported flat for SI.
825 
826   for (unsigned Op : {G_LOAD, G_STORE}) {
827     const bool IsStore = Op == G_STORE;
828 
829     auto &Actions = getActionDefinitionsBuilder(Op);
830     // Whitelist the common cases.
831     // TODO: Loads to s16 on gfx9
832     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
833                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
834                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
835                                       {S128, GlobalPtr, 128, GlobalAlign32},
836                                       {S64, GlobalPtr, 64, GlobalAlign32},
837                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
838                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
839                                       {S32, GlobalPtr, 8, GlobalAlign8},
840                                       {S32, GlobalPtr, 16, GlobalAlign16},
841 
842                                       {S32, LocalPtr, 32, 32},
843                                       {S64, LocalPtr, 64, 32},
844                                       {V2S32, LocalPtr, 64, 32},
845                                       {S32, LocalPtr, 8, 8},
846                                       {S32, LocalPtr, 16, 16},
847                                       {V2S16, LocalPtr, 32, 32},
848 
849                                       {S32, PrivatePtr, 32, 32},
850                                       {S32, PrivatePtr, 8, 8},
851                                       {S32, PrivatePtr, 16, 16},
852                                       {V2S16, PrivatePtr, 32, 32},
853 
854                                       {S32, FlatPtr, 32, GlobalAlign32},
855                                       {S32, FlatPtr, 16, GlobalAlign16},
856                                       {S32, FlatPtr, 8, GlobalAlign8},
857                                       {V2S16, FlatPtr, 32, GlobalAlign32},
858 
859                                       {S32, ConstantPtr, 32, GlobalAlign32},
860                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
861                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
862                                       {S64, ConstantPtr, 64, GlobalAlign32},
863                                       {S128, ConstantPtr, 128, GlobalAlign32},
864                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
865     Actions
866         .customIf(typeIs(1, Constant32Ptr))
867         // Widen suitably aligned loads by loading extra elements.
868         .moreElementsIf([=](const LegalityQuery &Query) {
869             const LLT Ty = Query.Types[0];
870             return Op == G_LOAD && Ty.isVector() &&
871                    shouldWidenLoadResult(Query);
872           }, moreElementsToNextPow2(0))
873         .widenScalarIf([=](const LegalityQuery &Query) {
874             const LLT Ty = Query.Types[0];
875             return Op == G_LOAD && !Ty.isVector() &&
876                    shouldWidenLoadResult(Query);
877           }, widenScalarOrEltToNextPow2(0))
878         .narrowScalarIf(
879             [=](const LegalityQuery &Query) -> bool {
880               return !Query.Types[0].isVector() &&
881                      needToSplitMemOp(Query, Op == G_LOAD);
882             },
883             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
884               const LLT DstTy = Query.Types[0];
885               const LLT PtrTy = Query.Types[1];
886 
887               const unsigned DstSize = DstTy.getSizeInBits();
888               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
889 
890               // Split extloads.
891               if (DstSize > MemSize)
892                 return std::make_pair(0, LLT::scalar(MemSize));
893 
894               if (!isPowerOf2_32(DstSize)) {
895                 // We're probably decomposing an odd sized store. Try to split
896                 // to the widest type. TODO: Account for alignment. As-is it
897                 // should be OK, since the new parts will be further legalized.
898                 unsigned FloorSize = PowerOf2Floor(DstSize);
899                 return std::make_pair(0, LLT::scalar(FloorSize));
900               }
901 
902               if (DstSize > 32 && (DstSize % 32 != 0)) {
903                 // FIXME: Need a way to specify non-extload of larger size if
904                 // suitably aligned.
905                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
906               }
907 
908               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
909                                                      Op == G_LOAD);
910               if (MemSize > MaxSize)
911                 return std::make_pair(0, LLT::scalar(MaxSize));
912 
913               unsigned Align = Query.MMODescrs[0].AlignInBits;
914               return std::make_pair(0, LLT::scalar(Align));
915             })
916         .fewerElementsIf(
917             [=](const LegalityQuery &Query) -> bool {
918               return Query.Types[0].isVector() &&
919                      needToSplitMemOp(Query, Op == G_LOAD);
920             },
921             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
922               const LLT DstTy = Query.Types[0];
923               const LLT PtrTy = Query.Types[1];
924 
925               LLT EltTy = DstTy.getElementType();
926               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
927                                                      Op == G_LOAD);
928 
929               // FIXME: Handle widened to power of 2 results better. This ends
930               // up scalarizing.
931               // FIXME: 3 element stores scalarized on SI
932 
933               // Split if it's too large for the address space.
934               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
935                 unsigned NumElts = DstTy.getNumElements();
936                 unsigned EltSize = EltTy.getSizeInBits();
937 
938                 if (MaxSize % EltSize == 0) {
939                   return std::make_pair(
940                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
941                 }
942 
943                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
944 
945                 // FIXME: Refine when odd breakdowns handled
946                 // The scalars will need to be re-legalized.
947                 if (NumPieces == 1 || NumPieces >= NumElts ||
948                     NumElts % NumPieces != 0)
949                   return std::make_pair(0, EltTy);
950 
951                 return std::make_pair(0,
952                                       LLT::vector(NumElts / NumPieces, EltTy));
953               }
954 
955               // FIXME: We could probably handle weird extending loads better.
956               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
957               if (DstTy.getSizeInBits() > MemSize)
958                 return std::make_pair(0, EltTy);
959 
960               unsigned EltSize = EltTy.getSizeInBits();
961               unsigned DstSize = DstTy.getSizeInBits();
962               if (!isPowerOf2_32(DstSize)) {
963                 // We're probably decomposing an odd sized store. Try to split
964                 // to the widest type. TODO: Account for alignment. As-is it
965                 // should be OK, since the new parts will be further legalized.
966                 unsigned FloorSize = PowerOf2Floor(DstSize);
967                 return std::make_pair(
968                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
969               }
970 
971               // Need to split because of alignment.
972               unsigned Align = Query.MMODescrs[0].AlignInBits;
973               if (EltSize > Align &&
974                   (EltSize / Align < DstTy.getNumElements())) {
975                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
976               }
977 
978               // May need relegalization for the scalars.
979               return std::make_pair(0, EltTy);
980             })
981         .minScalar(0, S32);
982 
983     if (IsStore)
984       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
985 
986     // TODO: Need a bitcast lower option?
987     Actions
988         .legalIf([=](const LegalityQuery &Query) {
989           const LLT Ty0 = Query.Types[0];
990           unsigned Size = Ty0.getSizeInBits();
991           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
992           unsigned Align = Query.MMODescrs[0].AlignInBits;
993 
994           // FIXME: Widening store from alignment not valid.
995           if (MemSize < Size)
996             MemSize = std::max(MemSize, Align);
997 
998           // No extending vector loads.
999           if (Size > MemSize && Ty0.isVector())
1000             return false;
1001 
1002           switch (MemSize) {
1003           case 8:
1004           case 16:
1005             return Size == 32;
1006           case 32:
1007           case 64:
1008           case 128:
1009             return true;
1010           case 96:
1011             return ST.hasDwordx3LoadStores();
1012           case 256:
1013           case 512:
1014             return true;
1015           default:
1016             return false;
1017           }
1018         })
1019         .widenScalarToNextPow2(0)
1020         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1021   }
1022 
1023   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1024                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1025                                                   {S32, GlobalPtr, 16, 2 * 8},
1026                                                   {S32, LocalPtr, 8, 8},
1027                                                   {S32, LocalPtr, 16, 16},
1028                                                   {S32, PrivatePtr, 8, 8},
1029                                                   {S32, PrivatePtr, 16, 16},
1030                                                   {S32, ConstantPtr, 8, 8},
1031                                                   {S32, ConstantPtr, 16, 2 * 8}});
1032   if (ST.hasFlatAddressSpace()) {
1033     ExtLoads.legalForTypesWithMemDesc(
1034         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1035   }
1036 
1037   ExtLoads.clampScalar(0, S32, S32)
1038           .widenScalarToNextPow2(0)
1039           .unsupportedIfMemSizeNotPow2()
1040           .lower();
1041 
1042   auto &Atomics = getActionDefinitionsBuilder(
1043     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1044      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1045      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1046      G_ATOMICRMW_UMIN})
1047     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1048                {S64, GlobalPtr}, {S64, LocalPtr}});
1049   if (ST.hasFlatAddressSpace()) {
1050     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1051   }
1052 
1053   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1054     .legalFor({{S32, LocalPtr}});
1055 
1056   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1057   // output demarshalling.
1058   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1059     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1060                 {S32, FlatPtr}, {S64, FlatPtr}})
1061     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1062                {S32, RegionPtr}, {S64, RegionPtr}});
1063   // TODO: Pointer types, any 32-bit or 64-bit vector
1064 
1065   // Condition should be s32 for scalar, s1 for vector.
1066   getActionDefinitionsBuilder(G_SELECT)
1067     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1068           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1069           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1070     .clampScalar(0, S16, S64)
1071     .scalarize(1)
1072     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1073     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1074     .clampMaxNumElements(0, S32, 2)
1075     .clampMaxNumElements(0, LocalPtr, 2)
1076     .clampMaxNumElements(0, PrivatePtr, 2)
1077     .scalarize(0)
1078     .widenScalarToNextPow2(0)
1079     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1080 
1081   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1082   // be more flexible with the shift amount type.
1083   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1084     .legalFor({{S32, S32}, {S64, S32}});
1085   if (ST.has16BitInsts()) {
1086     if (ST.hasVOP3PInsts()) {
1087       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1088             .clampMaxNumElements(0, S16, 2);
1089     } else
1090       Shifts.legalFor({{S16, S16}});
1091 
1092     // TODO: Support 16-bit shift amounts for all types
1093     Shifts.widenScalarIf(
1094       [=](const LegalityQuery &Query) {
1095         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1096         // 32-bit amount.
1097         const LLT ValTy = Query.Types[0];
1098         const LLT AmountTy = Query.Types[1];
1099         return ValTy.getSizeInBits() <= 16 &&
1100                AmountTy.getSizeInBits() < 16;
1101       }, changeTo(1, S16));
1102     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1103     Shifts.clampScalar(1, S32, S32);
1104     Shifts.clampScalar(0, S16, S64);
1105     Shifts.widenScalarToNextPow2(0, 16);
1106   } else {
1107     // Make sure we legalize the shift amount type first, as the general
1108     // expansion for the shifted type will produce much worse code if it hasn't
1109     // been truncated already.
1110     Shifts.clampScalar(1, S32, S32);
1111     Shifts.clampScalar(0, S32, S64);
1112     Shifts.widenScalarToNextPow2(0, 32);
1113   }
1114   Shifts.scalarize(0);
1115 
1116   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1117     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1118     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1119     unsigned IdxTypeIdx = 2;
1120 
1121     getActionDefinitionsBuilder(Op)
1122       .customIf([=](const LegalityQuery &Query) {
1123           const LLT EltTy = Query.Types[EltTypeIdx];
1124           const LLT VecTy = Query.Types[VecTypeIdx];
1125           const LLT IdxTy = Query.Types[IdxTypeIdx];
1126           return (EltTy.getSizeInBits() == 16 ||
1127                   EltTy.getSizeInBits() % 32 == 0) &&
1128                  VecTy.getSizeInBits() % 32 == 0 &&
1129                  VecTy.getSizeInBits() <= 1024 &&
1130                  IdxTy.getSizeInBits() == 32;
1131         })
1132       .clampScalar(EltTypeIdx, S32, S64)
1133       .clampScalar(VecTypeIdx, S32, S64)
1134       .clampScalar(IdxTypeIdx, S32, S32);
1135   }
1136 
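  // The scalar result of G_EXTRACT_VECTOR_ELT must have the same type as the
  // vector's element type.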
1137   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1138     .unsupportedIf([=](const LegalityQuery &Query) {
1139         const LLT &EltTy = Query.Types[1].getElementType();
1140         return Query.Types[0] != EltTy;
1141       });
1142 
1143   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1144     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1145     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1146 
1147     // FIXME: Doesn't handle extract of illegal sizes.
1148     getActionDefinitionsBuilder(Op)
1149       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1150       // FIXME: Multiples of 16 should not be legal.
1151       .legalIf([=](const LegalityQuery &Query) {
1152           const LLT BigTy = Query.Types[BigTyIdx];
1153           const LLT LitTy = Query.Types[LitTyIdx];
1154           return (BigTy.getSizeInBits() % 32 == 0) &&
1155                  (LitTy.getSizeInBits() % 16 == 0);
1156         })
1157       .widenScalarIf(
1158         [=](const LegalityQuery &Query) {
1159           const LLT BigTy = Query.Types[BigTyIdx];
1160           return (BigTy.getScalarSizeInBits() < 16);
1161         },
1162         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1163       .widenScalarIf(
1164         [=](const LegalityQuery &Query) {
1165           const LLT LitTy = Query.Types[LitTyIdx];
1166           return (LitTy.getScalarSizeInBits() < 16);
1167         },
1168         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1169       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1170       .widenScalarToNextPow2(BigTyIdx, 32);
1171 
1172   }
1173 
1174   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1175     .legalForCartesianProduct(AllS32Vectors, {S32})
1176     .legalForCartesianProduct(AllS64Vectors, {S64})
1177     .clampNumElements(0, V16S32, V32S32)
1178     .clampNumElements(0, V2S64, V16S64)
1179     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1180 
1181   if (ST.hasScalarPackInsts()) {
1182     BuildVector
1183       // FIXME: Should probably widen s1 vectors straight to s32
1184       .minScalarOrElt(0, S16)
1185       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1186       .minScalar(1, S32);
1187 
1188     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1189       .legalFor({V2S16, S32})
1190       .lower();
1191     BuildVector.minScalarOrElt(0, S32);
1192   } else {
1193     BuildVector.customFor({V2S16, S16});
1194     BuildVector.minScalarOrElt(0, S32);
1195 
1196     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1197       .customFor({V2S16, S32})
1198       .lower();
1199   }
1200 
1201   BuildVector.legalIf(isRegisterType(0));
1202 
1203   // FIXME: Clamp maximum size
1204   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1205     .legalIf(isRegisterType(0));
1206 
1207   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1208   // pre-legalize.
1209   if (ST.hasVOP3PInsts()) {
1210     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1211       .customFor({V2S16, V2S16})
1212       .lower();
1213   } else
1214     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1215 
1216   // Merge/Unmerge
1217   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1218     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1219     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1220 
1221     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1222       const LLT Ty = Query.Types[TypeIdx];
1223       if (Ty.isVector()) {
1224         const LLT &EltTy = Ty.getElementType();
1225         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1226           return true;
1227         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1228           return true;
1229       }
1230       return false;
1231     };
1232 
1233     auto &Builder = getActionDefinitionsBuilder(Op)
1234       .lowerFor({{S16, V2S16}})
1235       .lowerIf([=](const LegalityQuery &Query) {
1236           const LLT BigTy = Query.Types[BigTyIdx];
1237           return BigTy.getSizeInBits() == 32;
1238         })
1239       // Try to widen to s16 first for small types.
1240       // TODO: Only do this on targets with legal s16 shifts
1241       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1242       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1243       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1244       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1245                            elementTypeIs(1, S16)),
1246                        changeTo(1, V2S16))
1247       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1248       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1249       // valid.
1250       .clampScalar(LitTyIdx, S32, S512)
1251       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1252       // Break up vectors with weird elements into scalars
1253       .fewerElementsIf(
1254         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1255         scalarize(0))
1256       .fewerElementsIf(
1257         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1258         scalarize(1))
1259       .clampScalar(BigTyIdx, S32, S1024);
1260 
1261     if (Op == G_MERGE_VALUES) {
1262       Builder.widenScalarIf(
1263         // TODO: Use 16-bit shifts if legal for 8-bit values?
1264         [=](const LegalityQuery &Query) {
1265           const LLT Ty = Query.Types[LitTyIdx];
1266           return Ty.getSizeInBits() < 32;
1267         },
1268         changeTo(LitTyIdx, S32));
1269     }
1270 
1271     Builder.widenScalarIf(
1272       [=](const LegalityQuery &Query) {
1273         const LLT Ty = Query.Types[BigTyIdx];
1274         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1275           Ty.getSizeInBits() % 16 != 0;
1276       },
1277       [=](const LegalityQuery &Query) {
1278         // Pick the next power of 2, or a multiple of 64 over 128,
1279         // whichever is smaller.
1280         const LLT &Ty = Query.Types[BigTyIdx];
1281         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1282         if (NewSizeInBits >= 256) {
1283           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1284           if (RoundedTo < NewSizeInBits)
1285             NewSizeInBits = RoundedTo;
1286         }
1287         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1288       })
1289       .legalIf([=](const LegalityQuery &Query) {
1290           const LLT &BigTy = Query.Types[BigTyIdx];
1291           const LLT &LitTy = Query.Types[LitTyIdx];
1292 
1293           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1294             return false;
1295           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1296             return false;
1297 
1298           return BigTy.getSizeInBits() % 16 == 0 &&
1299                  LitTy.getSizeInBits() % 16 == 0 &&
1300                  BigTy.getSizeInBits() <= 1024;
1301         })
1302       // Any vectors left are the wrong size. Scalarize them.
1303       .scalarize(0)
1304       .scalarize(1);
1305   }
1306 
1307   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1308   // RegBankSelect.
1309   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1310     .legalFor({{S32}, {S64}});
1311 
1312   if (ST.hasVOP3PInsts()) {
1313     SextInReg.lowerFor({{V2S16}})
1314       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1315       // get more vector shift opportunities, since we'll get those when
1316       // expanded.
1317       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1318   } else if (ST.has16BitInsts()) {
1319     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1320   } else {
1321     // Prefer to promote to s32 before lowering if we don't have 16-bit
1322     // shifts. This avoids a lot of intermediate truncate and extend operations.
1323     SextInReg.lowerFor({{S32}, {S64}});
1324   }
1325 
1326   SextInReg
1327     .scalarize(0)
1328     .clampScalar(0, S32, S64)
1329     .lower();
1330 
1331   getActionDefinitionsBuilder(G_FSHR)
1332     .legalFor({{S32, S32}})
1333     .scalarize(0)
1334     .lower();
1335 
1336   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1337     .legalFor({S64});
1338 
1339   getActionDefinitionsBuilder({
1340       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1341       G_FCOPYSIGN,
1342 
1343       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1344       G_READ_REGISTER,
1345       G_WRITE_REGISTER,
1346 
1347       G_SADDO, G_SSUBO,
1348 
1349        // TODO: Implement
1350       G_FMINIMUM, G_FMAXIMUM,
1351       G_FSHL
1352     }).lower();
1353 
1354   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1355         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1356         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1357     .unsupported();
1358 
1359   computeTables();
1360   verify(*ST.getInstrInfo());
1361 }
1362 
1363 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1364                                          MachineRegisterInfo &MRI,
1365                                          MachineIRBuilder &B,
1366                                          GISelChangeObserver &Observer) const {
1367   switch (MI.getOpcode()) {
1368   case TargetOpcode::G_ADDRSPACE_CAST:
1369     return legalizeAddrSpaceCast(MI, MRI, B);
1370   case TargetOpcode::G_FRINT:
1371     return legalizeFrint(MI, MRI, B);
1372   case TargetOpcode::G_FCEIL:
1373     return legalizeFceil(MI, MRI, B);
1374   case TargetOpcode::G_INTRINSIC_TRUNC:
1375     return legalizeIntrinsicTrunc(MI, MRI, B);
1376   case TargetOpcode::G_SITOFP:
1377     return legalizeITOFP(MI, MRI, B, true);
1378   case TargetOpcode::G_UITOFP:
1379     return legalizeITOFP(MI, MRI, B, false);
1380   case TargetOpcode::G_FPTOSI:
1381     return legalizeFPTOI(MI, MRI, B, true);
1382   case TargetOpcode::G_FPTOUI:
1383     return legalizeFPTOI(MI, MRI, B, false);
1384   case TargetOpcode::G_FMINNUM:
1385   case TargetOpcode::G_FMAXNUM:
1386   case TargetOpcode::G_FMINNUM_IEEE:
1387   case TargetOpcode::G_FMAXNUM_IEEE:
1388     return legalizeMinNumMaxNum(MI, MRI, B);
1389   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1390     return legalizeExtractVectorElt(MI, MRI, B);
1391   case TargetOpcode::G_INSERT_VECTOR_ELT:
1392     return legalizeInsertVectorElt(MI, MRI, B);
1393   case TargetOpcode::G_SHUFFLE_VECTOR:
1394     return legalizeShuffleVector(MI, MRI, B);
1395   case TargetOpcode::G_FSIN:
1396   case TargetOpcode::G_FCOS:
1397     return legalizeSinCos(MI, MRI, B);
1398   case TargetOpcode::G_GLOBAL_VALUE:
1399     return legalizeGlobalValue(MI, MRI, B);
1400   case TargetOpcode::G_LOAD:
1401     return legalizeLoad(MI, MRI, B, Observer);
1402   case TargetOpcode::G_FMAD:
1403     return legalizeFMad(MI, MRI, B);
1404   case TargetOpcode::G_FDIV:
1405     return legalizeFDIV(MI, MRI, B);
1406   case TargetOpcode::G_UDIV:
1407   case TargetOpcode::G_UREM:
1408     return legalizeUDIV_UREM(MI, MRI, B);
1409   case TargetOpcode::G_SDIV:
1410   case TargetOpcode::G_SREM:
1411     return legalizeSDIV_SREM(MI, MRI, B);
1412   case TargetOpcode::G_ATOMIC_CMPXCHG:
1413     return legalizeAtomicCmpXChg(MI, MRI, B);
1414   case TargetOpcode::G_FLOG:
1415     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1416   case TargetOpcode::G_FLOG10:
1417     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1418   case TargetOpcode::G_FEXP:
1419     return legalizeFExp(MI, B);
1420   case TargetOpcode::G_FPOW:
1421     return legalizeFPow(MI, B);
1422   case TargetOpcode::G_FFLOOR:
1423     return legalizeFFloor(MI, MRI, B);
1424   case TargetOpcode::G_BUILD_VECTOR:
1425     return legalizeBuildVector(MI, MRI, B);
1426   default:
1427     return false;
1428   }
1429 
1430   llvm_unreachable("expected switch to return");
1431 }
1432 
1433 Register AMDGPULegalizerInfo::getSegmentAperture(
1434   unsigned AS,
1435   MachineRegisterInfo &MRI,
1436   MachineIRBuilder &B) const {
1437   MachineFunction &MF = B.getMF();
1438   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1439   const LLT S32 = LLT::scalar(32);
1440 
1441   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1442 
1443   if (ST.hasApertureRegs()) {
1444     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1445     // getreg.
1446     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1447         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1448         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1449     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1450         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1451         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1452     unsigned Encoding =
1453         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1454         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1455         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1456 
1457     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1458 
1459     B.buildInstr(AMDGPU::S_GETREG_B32)
1460       .addDef(GetReg)
1461       .addImm(Encoding);
1462     MRI.setType(GetReg, S32);
1463 
1464     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1465     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1466   }
1467 
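  // Without aperture registers, the 32-bit aperture base must be loaded from
  // the amd_queue_t structure addressed by the queue pointer input argument.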
1468   Register QueuePtr = MRI.createGenericVirtualRegister(
1469     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1470 
1471   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1472   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1473     return Register();
1474 
1475   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1476   // private_segment_aperture_base_hi.
1477   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1478 
1479   // TODO: can we be smarter about machine pointer info?
1480   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1481   MachineMemOperand *MMO = MF.getMachineMemOperand(
1482       PtrInfo,
1483       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1484           MachineMemOperand::MOInvariant,
1485       4, commonAlignment(Align(64), StructOffset));
1486 
1487   Register LoadAddr;
1488 
1489   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1490   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1491 }
1492 
1493 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1494   MachineInstr &MI, MachineRegisterInfo &MRI,
1495   MachineIRBuilder &B) const {
1496   MachineFunction &MF = B.getMF();
1497 
1498   B.setInstr(MI);
1499 
1500   const LLT S32 = LLT::scalar(32);
1501   Register Dst = MI.getOperand(0).getReg();
1502   Register Src = MI.getOperand(1).getReg();
1503 
1504   LLT DstTy = MRI.getType(Dst);
1505   LLT SrcTy = MRI.getType(Src);
1506   unsigned DestAS = DstTy.getAddressSpace();
1507   unsigned SrcAS = SrcTy.getAddressSpace();
1508 
1509   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1510   // vector element.
1511   assert(!DstTy.isVector());
1512 
1513   const AMDGPUTargetMachine &TM
1514     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1515 
1516   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1517   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1518     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1519     return true;
1520   }
1521 
1522   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1523     // Truncate.
1524     B.buildExtract(Dst, Src, 0);
1525     MI.eraseFromParent();
1526     return true;
1527   }
1528 
1529   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1530     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1531     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1532 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1536     auto HighAddr = B.buildConstant(
1537       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1538     B.buildMerge(Dst, {Src, HighAddr});
1539     MI.eraseFromParent();
1540     return true;
1541   }
1542 
1543   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1544     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1545            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1546     unsigned NullVal = TM.getNullPointerValue(DestAS);
1547 
1548     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1549     auto FlatNull = B.buildConstant(SrcTy, 0);
1550 
1551     // Extract low 32-bits of the pointer.
1552     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1553 
1554     auto CmpRes =
1555         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1556     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1557 
1558     MI.eraseFromParent();
1559     return true;
1560   }
1561 
1562   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1563     return false;
1564 
1565   if (!ST.hasFlatAddressSpace())
1566     return false;
1567 
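  // Cast from a segment (local/private) pointer to flat: place the aperture
  // base in the high 32 bits above the segment offset, and map the segment
  // null value to the flat null pointer.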
1568   auto SegmentNull =
1569       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1570   auto FlatNull =
1571       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1572 
1573   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1574   if (!ApertureReg.isValid())
1575     return false;
1576 
1577   auto CmpRes =
1578       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1579 
1580   // Coerce the type of the low half of the result so we can use merge_values.
1581   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1582 
1583   // TODO: Should we allow mismatched types but matching sizes in merges to
1584   // avoid the ptrtoint?
1585   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1586   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1587 
1588   MI.eraseFromParent();
1589   return true;
1590 }
1591 
1592 bool AMDGPULegalizerInfo::legalizeFrint(
1593   MachineInstr &MI, MachineRegisterInfo &MRI,
1594   MachineIRBuilder &B) const {
1595   B.setInstr(MI);
1596 
1597   Register Src = MI.getOperand(1).getReg();
1598   LLT Ty = MRI.getType(Src);
1599   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1600 
1601   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1602   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1603 
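  // C1 is 2^52: adding it to Src and subtracting it again rounds Src to an
  // integer, since doubles of that magnitude have no fractional bits. C2 is
  // the largest double below 2^52; if |Src| > C2 the value is already
  // integral and is returned unchanged by the final select.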
1604   auto C1 = B.buildFConstant(Ty, C1Val);
1605   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1606 
1607   // TODO: Should this propagate fast-math-flags?
1608   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1609   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1610 
1611   auto C2 = B.buildFConstant(Ty, C2Val);
1612   auto Fabs = B.buildFAbs(Ty, Src);
1613 
1614   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1615   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1616   return true;
1617 }
1618 
1619 bool AMDGPULegalizerInfo::legalizeFceil(
1620   MachineInstr &MI, MachineRegisterInfo &MRI,
1621   MachineIRBuilder &B) const {
1622   B.setInstr(MI);
1623 
1624   const LLT S1 = LLT::scalar(1);
1625   const LLT S64 = LLT::scalar(64);
1626 
1627   Register Src = MI.getOperand(1).getReg();
1628   assert(MRI.getType(Src) == S64);
1629 
1630   // result = trunc(src)
1631   // if (src > 0.0 && src != result)
1632   //   result += 1.0
1633 
1634   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1635 
1636   const auto Zero = B.buildFConstant(S64, 0.0);
1637   const auto One = B.buildFConstant(S64, 1.0);
1638   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1639   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1640   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1641   auto Add = B.buildSelect(S64, And, One, Zero);
1642 
1643   // TODO: Should this propagate fast-math-flags?
1644   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1645   return true;
1646 }
1647 
1648 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1649                                               MachineIRBuilder &B) {
1650   const unsigned FractBits = 52;
1651   const unsigned ExpBits = 11;
1652   LLT S32 = LLT::scalar(32);
1653 
1654   auto Const0 = B.buildConstant(S32, FractBits - 32);
1655   auto Const1 = B.buildConstant(S32, ExpBits);
1656 
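  // amdgcn.ubfe extracts the 11-bit exponent field, which starts at bit 20 of
  // the high dword (bit 52 of the full f64); subtracting 1023 removes the
  // exponent bias.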
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1660 
1661   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1662 }
1663 
1664 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1665   MachineInstr &MI, MachineRegisterInfo &MRI,
1666   MachineIRBuilder &B) const {
1667   B.setInstr(MI);
1668 
1669   const LLT S1 = LLT::scalar(1);
1670   const LLT S32 = LLT::scalar(32);
1671   const LLT S64 = LLT::scalar(64);
1672 
1673   Register Src = MI.getOperand(1).getReg();
1674   assert(MRI.getType(Src) == S64);
1675 
1676   // TODO: Should this use extract since the low half is unused?
1677   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1678   Register Hi = Unmerge.getReg(1);
1679 
1680   // Extract the upper half, since this is where we will find the sign and
1681   // exponent.
1682   auto Exp = extractF64Exponent(Hi, B);
1683 
1684   const unsigned FractBits = 52;
1685 
1686   // Extract the sign bit.
1687   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1688   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1689 
1690   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1691 
1692   const auto Zero32 = B.buildConstant(S32, 0);
1693 
1694   // Extend back to 64-bits.
1695   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1696 
1697   auto Shr = B.buildAShr(S64, FractMask, Exp);
1698   auto Not = B.buildNot(S64, Shr);
1699   auto Tmp0 = B.buildAnd(S64, Src, Not);
1700   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1701 
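  // If the exponent is negative, |Src| < 1.0 and the truncated result is just
  // the sign (+/-0.0). If the exponent is greater than 51 the value is
  // already integral and Src is returned unchanged. Otherwise clear the
  // fractional bits by masking Src with ~(FractMask >> Exp).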
1702   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1703   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1704 
1705   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1706   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1707   return true;
1708 }
1709 
1710 bool AMDGPULegalizerInfo::legalizeITOFP(
1711   MachineInstr &MI, MachineRegisterInfo &MRI,
1712   MachineIRBuilder &B, bool Signed) const {
1713   B.setInstr(MI);
1714 
1715   Register Dst = MI.getOperand(0).getReg();
1716   Register Src = MI.getOperand(1).getReg();
1717 
1718   const LLT S64 = LLT::scalar(64);
1719   const LLT S32 = LLT::scalar(32);
1720 
1721   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1722 
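  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 via amdgcn.ldexp and added to
  // the unsigned conversion of the low half.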
1723   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1724 
1725   auto CvtHi = Signed ?
1726     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1727     B.buildUITOFP(S64, Unmerge.getReg(1));
1728 
1729   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1730 
1731   auto ThirtyTwo = B.buildConstant(S32, 32);
1732   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1733     .addUse(CvtHi.getReg(0))
1734     .addUse(ThirtyTwo.getReg(0));
1735 
1736   // TODO: Should this propagate fast-math-flags?
1737   B.buildFAdd(Dst, LdExp, CvtLo);
1738   MI.eraseFromParent();
1739   return true;
1740 }
1741 
1742 // TODO: Copied from DAG implementation. Verify logic and document how this
1743 // actually works.
1744 bool AMDGPULegalizerInfo::legalizeFPTOI(
1745   MachineInstr &MI, MachineRegisterInfo &MRI,
1746   MachineIRBuilder &B, bool Signed) const {
1747   B.setInstr(MI);
1748 
1749   Register Dst = MI.getOperand(0).getReg();
1750   Register Src = MI.getOperand(1).getReg();
1751 
1752   const LLT S64 = LLT::scalar(64);
1753   const LLT S32 = LLT::scalar(32);
1754 
1755   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1756 
1757   unsigned Flags = MI.getFlags();
1758 
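  // K0 = 2^-32 and K1 = -(2^32). The high 32 bits come from
  // floor(trunc(Src) * 2^-32); the low 32 bits are the remainder
  // trunc(Src) - Hi * 2^32, computed with the FMA, which is non-negative and
  // fits an unsigned conversion.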
1759   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1760   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1761   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1762 
1763   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1764   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1765   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1766 
1767   auto Hi = Signed ?
1768     B.buildFPTOSI(S32, FloorMul) :
1769     B.buildFPTOUI(S32, FloorMul);
1770   auto Lo = B.buildFPTOUI(S32, Fma);
1771 
1772   B.buildMerge(Dst, { Lo, Hi });
1773   MI.eraseFromParent();
1774 
1775   return true;
1776 }
1777 
1778 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1779   MachineInstr &MI, MachineRegisterInfo &MRI,
1780   MachineIRBuilder &B) const {
1781   MachineFunction &MF = B.getMF();
1782   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1783 
1784   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1785                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1786 
1787   // With ieee_mode disabled, the instructions have the correct behavior
1788   // already for G_FMINNUM/G_FMAXNUM
1789   if (!MFI->getMode().IEEE)
1790     return !IsIEEEOp;
1791 
1792   if (IsIEEEOp)
1793     return true;
1794 
1795   MachineIRBuilder HelperBuilder(MI);
1796   GISelObserverWrapper DummyObserver;
1797   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1798   HelperBuilder.setInstr(MI);
1799   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1800 }
1801 
1802 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1803   MachineInstr &MI, MachineRegisterInfo &MRI,
1804   MachineIRBuilder &B) const {
1805   // TODO: Should move some of this into LegalizerHelper.
1806 
1807   // TODO: Promote dynamic indexing of s16 to s32
1808 
1809   // FIXME: Artifact combiner probably should have replaced the truncated
1810   // constant before this, so we shouldn't need
1811   // getConstantVRegValWithLookThrough.
1812   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1813     MI.getOperand(2).getReg(), MRI);
1814   if (!IdxVal) // Dynamic case will be selected to register indexing.
1815     return true;
1816 
1817   Register Dst = MI.getOperand(0).getReg();
1818   Register Vec = MI.getOperand(1).getReg();
1819 
1820   LLT VecTy = MRI.getType(Vec);
1821   LLT EltTy = VecTy.getElementType();
1822   assert(EltTy == MRI.getType(Dst));
1823 
1824   B.setInstr(MI);
1825 
1826   if (IdxVal->Value < VecTy.getNumElements())
1827     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1828   else
1829     B.buildUndef(Dst);
1830 
1831   MI.eraseFromParent();
1832   return true;
1833 }
1834 
1835 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1836   MachineInstr &MI, MachineRegisterInfo &MRI,
1837   MachineIRBuilder &B) const {
1838   // TODO: Should move some of this into LegalizerHelper.
1839 
1840   // TODO: Promote dynamic indexing of s16 to s32
1841 
1842   // FIXME: Artifact combiner probably should have replaced the truncated
1843   // constant before this, so we shouldn't need
1844   // getConstantVRegValWithLookThrough.
1845   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1846     MI.getOperand(3).getReg(), MRI);
1847   if (!IdxVal) // Dynamic case will be selected to register indexing.
1848     return true;
1849 
1850   Register Dst = MI.getOperand(0).getReg();
1851   Register Vec = MI.getOperand(1).getReg();
1852   Register Ins = MI.getOperand(2).getReg();
1853 
1854   LLT VecTy = MRI.getType(Vec);
1855   LLT EltTy = VecTy.getElementType();
1856   assert(EltTy == MRI.getType(Ins));
1857 
1858   B.setInstr(MI);
1859 
1860   if (IdxVal->Value < VecTy.getNumElements())
1861     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1862   else
1863     B.buildUndef(Dst);
1864 
1865   MI.eraseFromParent();
1866   return true;
1867 }
1868 
1869 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1870   MachineInstr &MI, MachineRegisterInfo &MRI,
1871   MachineIRBuilder &B) const {
1872   const LLT V2S16 = LLT::vector(2, 16);
1873 
1874   Register Dst = MI.getOperand(0).getReg();
1875   Register Src0 = MI.getOperand(1).getReg();
1876   LLT DstTy = MRI.getType(Dst);
1877   LLT SrcTy = MRI.getType(Src0);
1878 
1879   if (SrcTy == V2S16 && DstTy == V2S16 &&
1880       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1881     return true;
1882 
1883   MachineIRBuilder HelperBuilder(MI);
1884   GISelObserverWrapper DummyObserver;
1885   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1886   HelperBuilder.setInstr(MI);
1887   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1888 }
1889 
1890 bool AMDGPULegalizerInfo::legalizeSinCos(
1891   MachineInstr &MI, MachineRegisterInfo &MRI,
1892   MachineIRBuilder &B) const {
1893   B.setInstr(MI);
1894 
1895   Register DstReg = MI.getOperand(0).getReg();
1896   Register SrcReg = MI.getOperand(1).getReg();
1897   LLT Ty = MRI.getType(DstReg);
1898   unsigned Flags = MI.getFlags();
1899 
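  // The hardware sin/cos instructions take their input in units of 2*pi, so
  // scale by 1/(2*pi) first. On subtargets with a reduced trig input range
  // the scaled value is additionally wrapped into [0, 1) with amdgcn.fract.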
1900   Register TrigVal;
1901   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1902   if (ST.hasTrigReducedRange()) {
1903     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1904     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1905       .addUse(MulVal.getReg(0))
1906       .setMIFlags(Flags).getReg(0);
1907   } else
1908     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1909 
1910   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1911     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1912   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1913     .addUse(TrigVal)
1914     .setMIFlags(Flags);
1915   MI.eraseFromParent();
1916   return true;
1917 }
1918 
1919 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1920   Register DstReg, LLT PtrTy,
1921   MachineIRBuilder &B, const GlobalValue *GV,
1922   unsigned Offset, unsigned GAFlags) const {
1923   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1924   // to the following code sequence:
1925   //
1926   // For constant address space:
1927   //   s_getpc_b64 s[0:1]
1928   //   s_add_u32 s0, s0, $symbol
1929   //   s_addc_u32 s1, s1, 0
1930   //
1931   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1932   //   a fixup or relocation is emitted to replace $symbol with a literal
1933   //   constant, which is a pc-relative offset from the encoding of the $symbol
1934   //   operand to the global variable.
1935   //
1936   // For global address space:
1937   //   s_getpc_b64 s[0:1]
1938   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1939   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1940   //
1941   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1942   //   fixups or relocations are emitted to replace $symbol@*@lo and
1943   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1944   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1945   //   operand to the global variable.
1946   //
1947   // What we want here is an offset from the value returned by s_getpc
1948   // (which is the address of the s_add_u32 instruction) to the global
1949   // variable, but since the encoding of $symbol starts 4 bytes after the start
1950   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1951   // small. This requires us to add 4 to the global variable offset in order to
1952   // compute the correct address.
1953 
1954   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1955 
1956   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1957     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1958 
1959   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1960     .addDef(PCReg);
1961 
1962   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1963   if (GAFlags == SIInstrInfo::MO_NONE)
1964     MIB.addImm(0);
1965   else
1966     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1967 
1968   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1969 
1970   if (PtrTy.getSizeInBits() == 32)
1971     B.buildExtract(DstReg, PCReg, 0);
1972   return true;
}
1974 
1975 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1976   MachineInstr &MI, MachineRegisterInfo &MRI,
1977   MachineIRBuilder &B) const {
1978   Register DstReg = MI.getOperand(0).getReg();
1979   LLT Ty = MRI.getType(DstReg);
1980   unsigned AS = Ty.getAddressSpace();
1981 
1982   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1983   MachineFunction &MF = B.getMF();
1984   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1985   B.setInstr(MI);
1986 
1987   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1988     if (!MFI->isEntryFunction()) {
1989       const Function &Fn = MF.getFunction();
1990       DiagnosticInfoUnsupported BadLDSDecl(
1991         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1992         DS_Warning);
1993       Fn.getContext().diagnose(BadLDSDecl);
1994 
1995       // We currently don't have a way to correctly allocate LDS objects that
1996       // aren't directly associated with a kernel. We do force inlining of
1997       // functions that use local objects. However, if these dead functions are
1998       // not eliminated, we don't want a compile time error. Just emit a warning
1999       // and a trap, since there should be no callable path here.
2000       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2001       B.buildUndef(DstReg);
2002       MI.eraseFromParent();
2003       return true;
2004     }
2005 
2006     // TODO: We could emit code to handle the initialization somewhere.
2007     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2008       const SITargetLowering *TLI = ST.getTargetLowering();
2009       if (!TLI->shouldUseLDSConstAddress(GV)) {
2010         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2011         return true; // Leave in place;
2012       }
2013 
2014       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2015       MI.eraseFromParent();
2016       return true;
2017     }
2018 
2019     const Function &Fn = MF.getFunction();
2020     DiagnosticInfoUnsupported BadInit(
2021       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2022     Fn.getContext().diagnose(BadInit);
2023     return true;
2024   }
2025 
2026   const SITargetLowering *TLI = ST.getTargetLowering();
2027 
2028   if (TLI->shouldEmitFixup(GV)) {
2029     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2030     MI.eraseFromParent();
2031     return true;
2032   }
2033 
2034   if (TLI->shouldEmitPCReloc(GV)) {
2035     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2036     MI.eraseFromParent();
2037     return true;
2038   }
2039 
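  // Otherwise go through the GOT: materialize the address of the GOT entry
  // pc-relatively and load the 64-bit global address from it, truncating to
  // 32 bits for 32-bit address spaces.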
2040   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2041   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2042 
2043   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2044       MachinePointerInfo::getGOT(MF),
2045       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2046           MachineMemOperand::MOInvariant,
2047       8 /*Size*/, Align(8));
2048 
2049   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2050 
2051   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2053     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2054     B.buildExtract(DstReg, Load, 0);
2055   } else
2056     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2057 
2058   MI.eraseFromParent();
2059   return true;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeLoad(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2065   B.setInstr(MI);
2066   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2067   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2068   Observer.changingInstr(MI);
2069   MI.getOperand(1).setReg(Cast.getReg(0));
2070   Observer.changedInstr(MI);
2071   return true;
2072 }
2073 
2074 bool AMDGPULegalizerInfo::legalizeFMad(
2075   MachineInstr &MI, MachineRegisterInfo &MRI,
2076   MachineIRBuilder &B) const {
2077   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2078   assert(Ty.isScalar());
2079 
2080   MachineFunction &MF = B.getMF();
2081   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2082 
2083   // TODO: Always legal with future ftz flag.
  // FIXME: Is checking only the output type's denormal mode sufficient here?
2085   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2086     return true;
2087   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2088     return true;
2089 
2090   MachineIRBuilder HelperBuilder(MI);
2091   GISelObserverWrapper DummyObserver;
2092   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2093   HelperBuilder.setInstr(MI);
2094   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2095 }
2096 
2097 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2098   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2099   Register DstReg = MI.getOperand(0).getReg();
2100   Register PtrReg = MI.getOperand(1).getReg();
2101   Register CmpVal = MI.getOperand(2).getReg();
2102   Register NewVal = MI.getOperand(3).getReg();
2103 
2104   assert(SITargetLowering::isFlatGlobalAddrSpace(
2105            MRI.getType(PtrReg).getAddressSpace()) &&
2106          "this should not have been custom lowered");
2107 
2108   LLT ValTy = MRI.getType(CmpVal);
2109   LLT VecTy = LLT::vector(2, ValTy);
2110 
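  // Pack the new value (element 0) and the compare value (element 1) into the
  // single data operand used by G_AMDGPU_ATOMIC_CMPXCHG.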
2111   B.setInstr(MI);
2112   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2113 
2114   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2115     .addDef(DstReg)
2116     .addUse(PtrReg)
2117     .addUse(PackedVal)
2118     .setMemRefs(MI.memoperands());
2119 
2120   MI.eraseFromParent();
2121   return true;
2122 }
2123 
2124 bool AMDGPULegalizerInfo::legalizeFlog(
2125   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2126   Register Dst = MI.getOperand(0).getReg();
2127   Register Src = MI.getOperand(1).getReg();
2128   LLT Ty = B.getMRI()->getType(Dst);
2129   unsigned Flags = MI.getFlags();
2130   B.setInstr(MI);
2131 
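  // log_b(x) = log2(x) * (1 / log2(b)); Log2BaseInverted supplies the
  // 1 / log2(b) constant for the requested base.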
2132   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2133   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2134 
2135   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2136   MI.eraseFromParent();
2137   return true;
2138 }
2139 
2140 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2141                                        MachineIRBuilder &B) const {
2142   Register Dst = MI.getOperand(0).getReg();
2143   Register Src = MI.getOperand(1).getReg();
2144   unsigned Flags = MI.getFlags();
2145   LLT Ty = B.getMRI()->getType(Dst);
2146   B.setInstr(MI);
2147 
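  // exp(x) = exp2(x * log2(e)).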
2148   auto K = B.buildFConstant(Ty, numbers::log2e);
2149   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2150   B.buildFExp2(Dst, Mul, Flags);
2151   MI.eraseFromParent();
2152   return true;
2153 }
2154 
2155 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2156                                        MachineIRBuilder &B) const {
2157   Register Dst = MI.getOperand(0).getReg();
2158   Register Src0 = MI.getOperand(1).getReg();
2159   Register Src1 = MI.getOperand(2).getReg();
2160   unsigned Flags = MI.getFlags();
2161   LLT Ty = B.getMRI()->getType(Dst);
2162   B.setInstr(MI);
2163   const LLT S16 = LLT::scalar(16);
2164   const LLT S32 = LLT::scalar(32);
2165 
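  // pow(x, y) = exp2(y * log2(x)). The multiply uses amdgcn.fmul_legacy,
  // which treats 0 * anything as 0, so e.g. y == 0 still yields a finite
  // exp2 input even when log2(x) is infinite.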
2166   if (Ty == S32) {
2167     auto Log = B.buildFLog2(S32, Src0, Flags);
2168     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2169       .addUse(Log.getReg(0))
2170       .addUse(Src1)
2171       .setMIFlags(Flags);
2172     B.buildFExp2(Dst, Mul, Flags);
2173   } else if (Ty == S16) {
2174     // There's no f16 fmul_legacy, so we need to convert for it.
2175     auto Log = B.buildFLog2(S16, Src0, Flags);
2176     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2177     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2178     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2179       .addUse(Ext0.getReg(0))
2180       .addUse(Ext1.getReg(0))
2181       .setMIFlags(Flags);
2182 
2183     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2184   } else
2185     return false;
2186 
2187   MI.eraseFromParent();
2188   return true;
2189 }
2190 
2191 // Find a source register, ignoring any possible source modifiers.
2192 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2193   Register ModSrc = OrigSrc;
2194   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2195     ModSrc = SrcFNeg->getOperand(1).getReg();
2196     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2197       ModSrc = SrcFAbs->getOperand(1).getReg();
2198   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2199     ModSrc = SrcFAbs->getOperand(1).getReg();
2200   return ModSrc;
2201 }
2202 
2203 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2204                                          MachineRegisterInfo &MRI,
2205                                          MachineIRBuilder &B) const {
2206   B.setInstr(MI);
2207 
2208   const LLT S1 = LLT::scalar(1);
2209   const LLT S64 = LLT::scalar(64);
2210   Register Dst = MI.getOperand(0).getReg();
2211   Register OrigSrc = MI.getOperand(1).getReg();
2212   unsigned Flags = MI.getFlags();
2213   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2214          "this should not have been custom lowered");
2215 
2216   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2217   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2218   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2219   // V_FRACT bug is:
2220   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2221   //
2222   // Convert floor(x) to (x - fract(x))
2223 
2224   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2225     .addUse(OrigSrc)
2226     .setMIFlags(Flags);
2227 
2228   // Give source modifier matching some assistance before obscuring a foldable
2229   // pattern.
2230 
2231   // TODO: We can avoid the neg on the fract? The input sign to fract
2232   // shouldn't matter?
2233   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2234 
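  // 0x3fefffffffffffff is the largest double less than 1.0, i.e. the clamp
  // constant from the V_FRACT workaround described above.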
2235   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2236 
2237   Register Min = MRI.createGenericVirtualRegister(S64);
2238 
2239   // We don't need to concern ourselves with the snan handling difference, so
2240   // use the one which will directly select.
2241   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2242   if (MFI->getMode().IEEE)
2243     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2244   else
2245     B.buildFMinNum(Min, Fract, Const, Flags);
2246 
2247   Register CorrectedFract = Min;
2248   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2249     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2250     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2251   }
2252 
2253   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2254   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2255 
2256   MI.eraseFromParent();
2257   return true;
2258 }
2259 
2260 // Turn an illegal packed v2s16 build vector into bit operations.
2261 // TODO: This should probably be a bitcast action in LegalizerHelper.
2262 bool AMDGPULegalizerInfo::legalizeBuildVector(
2263   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2264   Register Dst = MI.getOperand(0).getReg();
2265   const LLT S32 = LLT::scalar(32);
2266   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2267 
2268   Register Src0 = MI.getOperand(1).getReg();
2269   Register Src1 = MI.getOperand(2).getReg();
2270   assert(MRI.getType(Src0) == LLT::scalar(16));
2271 
2272   B.setInstr(MI);
2273   auto Merge = B.buildMerge(S32, {Src0, Src1});
2274   B.buildBitcast(Dst, Merge);
2275 
2276   MI.eraseFromParent();
2277   return true;
2278 }
2279 
2280 // Return the use branch instruction, otherwise null if the usage is invalid.
2281 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2282                                        MachineRegisterInfo &MRI,
2283                                        MachineInstr *&Br,
2284                                        MachineBasicBlock *&UncondBrTarget) {
2285   Register CondDef = MI.getOperand(0).getReg();
2286   if (!MRI.hasOneNonDBGUse(CondDef))
2287     return nullptr;
2288 
2289   MachineBasicBlock *Parent = MI.getParent();
2290   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2291   if (UseMI.getParent() != Parent ||
2292       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2293     return nullptr;
2294 
2295   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2296   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2297   if (Next == Parent->end()) {
2298     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2299     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2300       return nullptr;
2301     UncondBrTarget = &*NextMBB;
2302   } else {
2303     if (Next->getOpcode() != AMDGPU::G_BR)
2304       return nullptr;
2305     Br = &*Next;
2306     UncondBrTarget = Br->getOperand(0).getMBB();
2307   }
2308 
2309   return &UseMI;
2310 }
2311 
2312 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2313                                                MachineRegisterInfo &MRI,
2314                                                Register LiveIn,
2315                                                Register PhyReg) const {
2316   assert(PhyReg.isPhysical() && "Physical register expected");
2317 
2318   // Insert the live-in copy, if required, by defining destination virtual
2319   // register.
2320   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2321   if (!MRI.getVRegDef(LiveIn)) {
2322     // FIXME: Should have scoped insert pt
2323     MachineBasicBlock &OrigInsBB = B.getMBB();
2324     auto OrigInsPt = B.getInsertPt();
2325 
2326     MachineBasicBlock &EntryMBB = B.getMF().front();
2327     EntryMBB.addLiveIn(PhyReg);
2328     B.setInsertPt(EntryMBB, EntryMBB.begin());
2329     B.buildCopy(LiveIn, PhyReg);
2330 
2331     B.setInsertPt(OrigInsBB, OrigInsPt);
2332   }
2333 
2334   return LiveIn;
2335 }
2336 
2337 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2338                                                 MachineRegisterInfo &MRI,
2339                                                 Register PhyReg, LLT Ty,
2340                                                 bool InsertLiveInCopy) const {
2341   assert(PhyReg.isPhysical() && "Physical register expected");
2342 
  // Get or create the virtual live-in register.
2344   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2345   if (!LiveIn) {
2346     LiveIn = MRI.createGenericVirtualRegister(Ty);
2347     MRI.addLiveIn(PhyReg, LiveIn);
2348   }
2349 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), the live-in copy from the physical
  // register to a virtual register is not required.
2353   if (!InsertLiveInCopy)
2354     return LiveIn;
2355 
2356   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2357 }
2358 
2359 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2360     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2361   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2362   const ArgDescriptor *Arg;
2363   const TargetRegisterClass *RC;
2364   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2365   if (!Arg) {
2366     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2367     return nullptr;
2368   }
2369   return Arg;
2370 }
2371 
2372 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2373                                          const ArgDescriptor *Arg) const {
2374   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2375     return false; // TODO: Handle these
2376 
2377   Register SrcReg = Arg->getRegister();
2378   assert(SrcReg.isPhysical() && "Physical register expected");
2379   assert(DstReg.isVirtual() && "Virtual register expected");
2380 
2381   MachineRegisterInfo &MRI = *B.getMRI();
2382 
2383   LLT Ty = MRI.getType(DstReg);
2384   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2385 
2386   if (Arg->isMasked()) {
2387     // TODO: Should we try to emit this once in the entry block?
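    // Several inputs may be packed into one preloaded register; Arg's mask
    // selects this one, so shift down to the mask's lowest set bit and then
    // mask off the field's width.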
2388     const LLT S32 = LLT::scalar(32);
2389     const unsigned Mask = Arg->getMask();
2390     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2391 
2392     Register AndMaskSrc = LiveIn;
2393 
2394     if (Shift != 0) {
2395       auto ShiftAmt = B.buildConstant(S32, Shift);
2396       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2397     }
2398 
2399     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2400   } else {
2401     B.buildCopy(DstReg, LiveIn);
2402   }
2403 
2404   return true;
2405 }
2406 
2407 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2408     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2409     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2410   B.setInstr(MI);
2411 
2412   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2413   if (!Arg)
2414     return false;
2415 
2416   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2417     return false;
2418 
2419   MI.eraseFromParent();
2420   return true;
2421 }
2422 
2423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2424                                        MachineRegisterInfo &MRI,
2425                                        MachineIRBuilder &B) const {
2426   B.setInstr(MI);
2427   Register Dst = MI.getOperand(0).getReg();
2428   LLT DstTy = MRI.getType(Dst);
2429   LLT S16 = LLT::scalar(16);
2430   LLT S32 = LLT::scalar(32);
2431   LLT S64 = LLT::scalar(64);
2432 
2433   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2434     return true;
2435 
2436   if (DstTy == S16)
2437     return legalizeFDIV16(MI, MRI, B);
2438   if (DstTy == S32)
2439     return legalizeFDIV32(MI, MRI, B);
2440   if (DstTy == S64)
2441     return legalizeFDIV64(MI, MRI, B);
2442 
2443   return false;
2444 }
2445 
2446 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2447   const LLT S32 = LLT::scalar(32);
2448 
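  // Approximate 2^32 / Src: convert to f32, take the IFLAG reciprocal, scale
  // by 0x4f800000 (2^32 as an f32), and convert back to an unsigned integer.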
2449   auto Cvt0 = B.buildUITOFP(S32, Src);
2450   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2451   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2452   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2453   return B.buildFPTOUI(S32, Mul).getReg(0);
2454 }
2455 
2456 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2457                                                   Register DstReg,
2458                                                   Register Num,
2459                                                   Register Den,
2460                                                   bool IsRem) const {
2461   const LLT S1 = LLT::scalar(1);
2462   const LLT S32 = LLT::scalar(32);
2463 
2464   // RCP =  URECIP(Den) = 2^32 / Den + e
2465   // e is rounding error.
2466   auto RCP = buildDivRCP(B, Den);
2467 
2468   // RCP_LO = mul(RCP, Den)
2469   auto RCP_LO = B.buildMul(S32, RCP, Den);
2470 
  // RCP_HI = mulhu(RCP, Den)
2472   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2473 
2474   // NEG_RCP_LO = -RCP_LO
2475   auto Zero = B.buildConstant(S32, 0);
2476   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2477 
2478   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2479   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2480   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2481 
2482   // Calculate the rounding error from the URECIP instruction
2483   // E = mulhu(ABS_RCP_LO, RCP)
2484   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2485 
2486   // RCP_A_E = RCP + E
2487   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2488 
2489   // RCP_S_E = RCP - E
2490   auto RCP_S_E = B.buildSub(S32, RCP, E);
2491 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2493   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2494 
  // Quotient = mulhu(Tmp0, Num)
2496   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2497 
2498   // Num_S_Remainder = Quotient * Den
2499   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2500 
2501   // Remainder = Num - Num_S_Remainder
2502   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2503 
2504   // Remainder_GE_Den = Remainder >= Den
2505   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2506 
2507   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2508   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2509                                        Num, Num_S_Remainder);
2510 
2511   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2512   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2513 
2514   // Calculate Division result:
2515 
2516   // Quotient_A_One = Quotient + 1
2517   auto One = B.buildConstant(S32, 1);
2518   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2519 
2520   // Quotient_S_One = Quotient - 1
2521   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2522 
2523   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2524   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2525 
2526   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2527   if (IsRem) {
2528     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2529 
2530     // Calculate Rem result:
2531     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2532 
2533     // Remainder_A_Den = Remainder + Den
2534     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2535 
2536     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2537     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2538 
2539     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2540     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2541   } else {
2542     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2543   }
2544 }
2545 
2546 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2547                                               MachineRegisterInfo &MRI,
2548                                               MachineIRBuilder &B) const {
2549   B.setInstr(MI);
2550   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2551   Register DstReg = MI.getOperand(0).getReg();
2552   Register Num = MI.getOperand(1).getReg();
2553   Register Den = MI.getOperand(2).getReg();
2554   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2555   MI.eraseFromParent();
2556   return true;
2557 }
2558 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2560 //
2561 // Return lo, hi of result
2562 //
2563 // %cvt.lo = G_UITOFP Val.lo
2564 // %cvt.hi = G_UITOFP Val.hi
2565 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2566 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2567 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2568 // %mul2 = G_FMUL %mul1, 2**(-32)
2569 // %trunc = G_INTRINSIC_TRUNC %mul2
2570 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2571 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2572 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2573                                                        Register Val) {
2574   const LLT S32 = LLT::scalar(32);
2575   auto Unmerge = B.buildUnmerge(S32, Val);
2576 
2577   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2578   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2579 
2580   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2581                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2582 
2583   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2584   auto Mul1 =
2585       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2586 
2587   // 2**(-32)
2588   auto Mul2 =
2589       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2590   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2591 
2592   // -(2**32)
2593   auto Mad2 = B.buildFMAD(S32, Trunc,
2594                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2595 
2596   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2597   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2598 
2599   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2600 }
2601 
2602 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2603                                               MachineRegisterInfo &MRI,
2604                                               MachineIRBuilder &B) const {
2605   B.setInstr(MI);
2606 
2607   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2608   const LLT S32 = LLT::scalar(32);
2609   const LLT S64 = LLT::scalar(64);
2610   const LLT S1 = LLT::scalar(1);
2611   Register Numer = MI.getOperand(1).getReg();
2612   Register Denom = MI.getOperand(2).getReg();
2613   Register RcpLo, RcpHi;
2614 
2615   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2616 
2617   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2618 
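  // Refine the initial reciprocal estimate with two Newton-Raphson style
  // steps: multiply by -Denom, take the 64-bit high half of the product with
  // the current estimate, and add the correction back in. The quotient
  // estimate is then the 64-bit mulhu of the numerator with the refined
  // reciprocal, followed by at most two conditional corrections below.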
2619   auto Zero64 = B.buildConstant(S64, 0);
2620   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2621 
2622   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2623   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2624 
2625   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2626   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2627   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2628 
2629   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2630   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2631   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2632   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2633 
2634   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2635   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2636   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2637   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2638   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2639 
2640   auto Zero32 = B.buildConstant(S32, 0);
2641   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2642   auto Add2_HiC =
2643       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2644   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2645   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2646 
2647   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2648   Register NumerLo = UnmergeNumer.getReg(0);
2649   Register NumerHi = UnmergeNumer.getReg(1);
2650 
2651   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2652   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2653   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2654   Register Mul3_Lo = UnmergeMul3.getReg(0);
2655   Register Mul3_Hi = UnmergeMul3.getReg(1);
2656   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2657   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2658   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2659   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2660 
2661   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2662   Register DenomLo = UnmergeDenom.getReg(0);
2663   Register DenomHi = UnmergeDenom.getReg(1);
2664 
2665   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2666   auto C1 = B.buildSExt(S32, CmpHi);
2667 
2668   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2669   auto C2 = B.buildSExt(S32, CmpLo);
2670 
2671   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2672   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2673 
2674   // TODO: Here and below portions of the code can be enclosed into if/endif.
2675   // Currently control flow is unconditional and we have 4 selects after
2676   // potential endif to substitute PHIs.
2677 
2678   // if C3 != 0 ...
2679   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2680   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2681   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2682   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2683 
2684   auto One64 = B.buildConstant(S64, 1);
2685   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2686 
2687   auto C4 =
2688       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2689   auto C5 =
2690       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2691   auto C6 = B.buildSelect(
2692       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2693 
2694   // if (C6 != 0)
2695   auto Add4 = B.buildAdd(S64, Add3, One64);
2696   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2697 
2698   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2699   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2700   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2701 
2702   // endif C6
2703   // endif C3
2704 
2705   if (IsDiv) {
2706     auto Sel1 = B.buildSelect(
2707         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2708     B.buildSelect(MI.getOperand(0),
2709                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2710   } else {
2711     auto Sel2 = B.buildSelect(
2712         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2713     B.buildSelect(MI.getOperand(0),
2714                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2715   }
2716 
2717   MI.eraseFromParent();
2718   return true;
2719 }
2720 
2721 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2722                                             MachineRegisterInfo &MRI,
2723                                             MachineIRBuilder &B) const {
2724   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2725   if (Ty == LLT::scalar(32))
2726     return legalizeUDIV_UREM32(MI, MRI, B);
2727   if (Ty == LLT::scalar(64))
2728     return legalizeUDIV_UREM64(MI, MRI, B);
2729   return false;
2730 }
2731 
2732 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2733                                               MachineRegisterInfo &MRI,
2734                                               MachineIRBuilder &B) const {
2735   B.setInstr(MI);
2736   const LLT S32 = LLT::scalar(32);
2737 
2738   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2739   Register DstReg = MI.getOperand(0).getReg();
2740   Register LHS = MI.getOperand(1).getReg();
2741   Register RHS = MI.getOperand(2).getReg();
2742 
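  // Reduce to the unsigned case: with Sign = X >> 31 (all ones when X is
  // negative), (X + Sign) ^ Sign computes |X|. The quotient's sign is
  // LHSign ^ RHSign and the remainder takes the sign of the LHS; both are
  // restored below with the same xor-then-subtract trick.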
2743   auto ThirtyOne = B.buildConstant(S32, 31);
2744   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2746 
2747   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2748   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2749 
2750   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2751   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2752 
2753   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2754   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2755 
2756   if (IsRem) {
2757     auto RSign = LHSign; // Remainder sign is the same as LHS
2758     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2759     B.buildSub(DstReg, UDivRem, RSign);
2760   } else {
2761     auto DSign = B.buildXor(S32, LHSign, RHSign);
2762     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2763     B.buildSub(DstReg, UDivRem, DSign);
2764   }
2765 
2766   MI.eraseFromParent();
2767   return true;
2768 }
2769 
2770 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2771                                             MachineRegisterInfo &MRI,
2772                                             MachineIRBuilder &B) const {
2773   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2774     return legalizeSDIV_SREM32(MI, MRI, B);
2775   return false;
2776 }
2777 
2778 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2779                                                  MachineRegisterInfo &MRI,
2780                                                  MachineIRBuilder &B) const {
2781   Register Res = MI.getOperand(0).getReg();
2782   Register LHS = MI.getOperand(1).getReg();
2783   Register RHS = MI.getOperand(2).getReg();
2784 
2785   uint16_t Flags = MI.getFlags();
2786 
2787   LLT ResTy = MRI.getType(Res);
2788   LLT S32 = LLT::scalar(32);
2789   LLT S64 = LLT::scalar(64);
2790 
2791   const MachineFunction &MF = B.getMF();
2792   bool Unsafe =
2793     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2794 
2795   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2796     return false;
2797 
2798   if (!Unsafe && ResTy == S32 &&
2799       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2800     return false;
2801 
2802   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2803     // 1 / x -> RCP(x)
2804     if (CLHS->isExactlyValue(1.0)) {
2805       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2806         .addUse(RHS)
2807         .setMIFlags(Flags);
2808 
2809       MI.eraseFromParent();
2810       return true;
2811     }
2812 
2813     // -1 / x -> RCP( FNEG(x) )
2814     if (CLHS->isExactlyValue(-1.0)) {
2815       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2816       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2817         .addUse(FNeg.getReg(0))
2818         .setMIFlags(Flags);
2819 
2820       MI.eraseFromParent();
2821       return true;
2822     }
2823   }
2824 
2825   // x / y -> x * (1.0 / y)
2826   if (Unsafe) {
2827     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2828       .addUse(RHS)
2829       .setMIFlags(Flags);
2830     B.buildFMul(Res, LHS, RCP, Flags);
2831 
2832     MI.eraseFromParent();
2833     return true;
2834   }
2835 
2836   return false;
2837 }
2838 
2839 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2840                                          MachineRegisterInfo &MRI,
2841                                          MachineIRBuilder &B) const {
2842   B.setInstr(MI);
2843   Register Res = MI.getOperand(0).getReg();
2844   Register LHS = MI.getOperand(1).getReg();
2845   Register RHS = MI.getOperand(2).getReg();
2846 
2847   uint16_t Flags = MI.getFlags();
2848 
2849   LLT S16 = LLT::scalar(16);
2850   LLT S32 = LLT::scalar(32);
2851 
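  // f16 division is performed in f32: extend both operands, multiply the LHS
  // by the f32 reciprocal of the RHS, truncate back to f16, and let
  // amdgcn.div.fixup handle the special cases and final sign.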
2852   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2853   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2854 
2855   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2856     .addUse(RHSExt.getReg(0))
2857     .setMIFlags(Flags);
2858 
2859   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2860   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2861 
2862   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2863     .addUse(RDst.getReg(0))
2864     .addUse(RHS)
2865     .addUse(LHS)
2866     .setMIFlags(Flags);
2867 
2868   MI.eraseFromParent();
2869   return true;
2870 }
2871 
2872 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2873 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2874 static void toggleSPDenormMode(bool Enable,
2875                                MachineIRBuilder &B,
2876                                const GCNSubtarget &ST,
2877                                AMDGPU::SIModeRegisterDefaults Mode) {
2878   // Set SP denorm mode to this value.
2879   unsigned SPDenormMode =
2880     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2881 
2882   if (ST.hasDenormModeInst()) {
2883     // Preserve the default FP64/FP16 denorm mode while updating FP32 mode.
2884     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2885 
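    // The immediate packs the FP32 denorm mode into bits [1:0] and the
    // FP64/FP16 denorm mode into bits [3:2].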
2886     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2887     B.buildInstr(AMDGPU::S_DENORM_MODE)
2888       .addImm(NewDenormModeValue);
2889 
2890   } else {
2891     // Select FP32 bit field in mode register.
2892     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2893                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2894                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
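    // i.e. a 2-bit field (WIDTH_M1 == 1) at offset 4 of the MODE register,
    // which holds the FP32 denorm controls.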
2895 
2896     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2897       .addImm(SPDenormMode)
2898       .addImm(SPDenormModeBitField);
2899   }
2900 }
2901 
2902 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2903                                          MachineRegisterInfo &MRI,
2904                                          MachineIRBuilder &B) const {
2905   B.setInstr(MI);
2906   Register Res = MI.getOperand(0).getReg();
2907   Register LHS = MI.getOperand(1).getReg();
2908   Register RHS = MI.getOperand(2).getReg();
2909   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2910   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2911 
2912   uint16_t Flags = MI.getFlags();
2913 
2914   LLT S32 = LLT::scalar(32);
2915   LLT S1 = LLT::scalar(1);
2916 
2917   auto One = B.buildFConstant(S32, 1.0f);
2918 
2919   auto DenominatorScaled =
2920     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2921       .addUse(LHS)
2922       .addUse(RHS)
2923       .addImm(0)
2924       .setMIFlags(Flags);
2925   auto NumeratorScaled =
2926     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2927       .addUse(LHS)
2928       .addUse(RHS)
2929       .addImm(1)
2930       .setMIFlags(Flags);
2931 
2932   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2933     .addUse(DenominatorScaled.getReg(0))
2934     .setMIFlags(Flags);
2935   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2936 
2937   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2938   // aren't modeled as reading it.
2939   if (!Mode.allFP32Denormals())
2940     toggleSPDenormMode(true, B, ST, Mode);
2941 
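  // Newton-Raphson refinement of the reciprocal (Fma0, Fma1), then form the
  // scaled quotient (Mul) and refine it (Fma2-Fma4); div_fmas and div_fixup
  // below fold in the scaling and handle the special cases.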
2942   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2943   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2944   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2945   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2946   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2947   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2948 
2949   if (!Mode.allFP32Denormals())
2950     toggleSPDenormMode(false, B, ST, Mode);
2951 
2952   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2953     .addUse(Fma4.getReg(0))
2954     .addUse(Fma1.getReg(0))
2955     .addUse(Fma3.getReg(0))
2956     .addUse(NumeratorScaled.getReg(1))
2957     .setMIFlags(Flags);
2958 
2959   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2960     .addUse(Fmas.getReg(0))
2961     .addUse(RHS)
2962     .addUse(LHS)
2963     .setMIFlags(Flags);
2964 
2965   MI.eraseFromParent();
2966   return true;
2967 }
2968 
2969 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2970                                          MachineRegisterInfo &MRI,
2971                                          MachineIRBuilder &B) const {
2972   B.setInstr(MI);
2973   Register Res = MI.getOperand(0).getReg();
2974   Register LHS = MI.getOperand(1).getReg();
2975   Register RHS = MI.getOperand(2).getReg();
2976 
2977   uint16_t Flags = MI.getFlags();
2978 
2979   LLT S64 = LLT::scalar(64);
2980   LLT S1 = LLT::scalar(1);
2981 
2982   auto One = B.buildFConstant(S64, 1.0);
2983 
2984   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2985     .addUse(LHS)
2986     .addUse(RHS)
2987     .addImm(0)
2988     .setMIFlags(Flags);
2989 
2990   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2991 
2992   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2993     .addUse(DivScale0.getReg(0))
2994     .setMIFlags(Flags);
2995 
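  // Two Newton-Raphson refinements of the reciprocal (Fma0-Fma3), then the
  // scaled quotient (Mul) and its residual (Fma4) feed div_fmas below.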
2996   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2997   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2998   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2999 
3000   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3001     .addUse(LHS)
3002     .addUse(RHS)
3003     .addImm(1)
3004     .setMIFlags(Flags);
3005 
3006   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3007   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3008   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3009 
3010   Register Scale;
3011   if (!ST.hasUsableDivScaleConditionOutput()) {
3012     // Work around a hardware bug on SI where the condition output from
3013     // div_scale is not usable.
3014 
3015     LLT S32 = LLT::scalar(32);
3016 
3017     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3018     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3019     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3020     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3021 
3022     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3023                               Scale1Unmerge.getReg(1));
3024     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3025                               Scale0Unmerge.getReg(1));
3026     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3027   } else {
3028     Scale = DivScale1.getReg(1);
3029   }
3030 
3031   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3032     .addUse(Fma4.getReg(0))
3033     .addUse(Fma3.getReg(0))
3034     .addUse(Mul.getReg(0))
3035     .addUse(Scale)
3036     .setMIFlags(Flags);
3037 
3038   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3039     .addUse(Fmas.getReg(0))
3040     .addUse(RHS)
3041     .addUse(LHS)
3042     .setMIFlags(Flags);
3043 
3044   MI.eraseFromParent();
3045   return true;
3046 }
3047 
3048 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3049                                                  MachineRegisterInfo &MRI,
3050                                                  MachineIRBuilder &B) const {
3051   B.setInstr(MI);
3052   Register Res = MI.getOperand(0).getReg();
3053   Register LHS = MI.getOperand(2).getReg();
3054   Register RHS = MI.getOperand(3).getReg();
3055   uint16_t Flags = MI.getFlags();
3056 
3057   LLT S32 = LLT::scalar(32);
3058   LLT S1 = LLT::scalar(1);
3059 
3060   auto Abs = B.buildFAbs(S32, RHS, Flags);
3061   const APFloat C0Val(1.0f);
3062 
3063   auto C0 = B.buildConstant(S32, 0x6f800000);
3064   auto C1 = B.buildConstant(S32, 0x2f800000);
3065   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
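  // 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32 as IEEE-754 floats. If |RHS|
  // is very large, pre-scale it by 2^-32 so the reciprocal stays in range;
  // multiplying the final product by the same factor restores the quotient.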
3066 
3067   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3068   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3069 
3070   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3071 
3072   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3073     .addUse(Mul0.getReg(0))
3074     .setMIFlags(Flags);
3075 
3076   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3077 
3078   B.buildFMul(Res, Sel, Mul1, Flags);
3079 
3080   MI.eraseFromParent();
3081   return true;
3082 }
3083 
3084 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3085                                                  MachineRegisterInfo &MRI,
3086                                                  MachineIRBuilder &B) const {
3087   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3088   if (!MFI->isEntryFunction()) {
3089     return legalizePreloadedArgIntrin(MI, MRI, B,
3090                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3091   }
3092 
3093   B.setInstr(MI);
3094 
3095   uint64_t Offset =
3096     ST.getTargetLowering()->getImplicitParameterOffset(
3097       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
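  // Implicit arguments are laid out directly after the explicit kernel
  // arguments, so the pointer is the kernarg segment pointer plus this offset.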
3098   Register DstReg = MI.getOperand(0).getReg();
3099   LLT DstTy = MRI.getType(DstReg);
3100   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3101 
3102   const ArgDescriptor *Arg;
3103   const TargetRegisterClass *RC;
3104   std::tie(Arg, RC)
3105     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3106   if (!Arg)
3107     return false;
3108 
3109   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3110   if (!loadInputValue(KernargPtrReg, B, Arg))
3111     return false;
3112 
3113   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3114   MI.eraseFromParent();
3115   return true;
3116 }
3117 
3118 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3119                                               MachineRegisterInfo &MRI,
3120                                               MachineIRBuilder &B,
3121                                               unsigned AddrSpace) const {
3122   B.setInstr(MI);
3123   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
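  // The intrinsic is lowered to comparing the high 32 bits of the flat pointer
  // against the segment's aperture base.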
3124   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3125   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3126   MI.eraseFromParent();
3127   return true;
3128 }
3129 
3130 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3131 // offset (the offset that is included in bounds checking and swizzling, to be
3132 // split between the instruction's voffset and immoffset fields) and soffset
3133 // (the offset that is excluded from bounds checking and swizzling, to go in
3134 // the instruction's soffset field).  This function takes the first kind of
3135 // offset and figures out how to split it between voffset and immoffset.
3136 std::tuple<Register, unsigned, unsigned>
3137 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3138                                         Register OrigOffset) const {
3139   const unsigned MaxImm = 4095;
3140   Register BaseReg;
3141   unsigned TotalConstOffset;
3142   MachineInstr *OffsetDef;
3143   const LLT S32 = LLT::scalar(32);
3144 
3145   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3146     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3147 
3148   unsigned ImmOffset = TotalConstOffset;
3149 
3150   // If the immediate value is too big for the immoffset field, put the value
3151   // and -4096 into the immoffset field so that the value that is copied/added
3152   // for the voffset field is a multiple of 4096, and it stands a better
3153   // chance of being CSEd with the copy/add for another similar load/store.
3154   // However, do not do that rounding down to a multiple of 4096 if that is a
3155   // negative number, as it appears to be illegal to have a negative offset
3156   // in the vgpr, even if adding the immediate offset makes it positive.
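  // For example, a constant offset of 5000 becomes ImmOffset = 904 with an
  // Overflow of 4096 folded into the voffset base register.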
3157   unsigned Overflow = ImmOffset & ~MaxImm;
3158   ImmOffset -= Overflow;
3159   if ((int32_t)Overflow < 0) {
3160     Overflow += ImmOffset;
3161     ImmOffset = 0;
3162   }
3163 
3164   if (Overflow != 0) {
3165     if (!BaseReg) {
3166       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3167     } else {
3168       auto OverflowVal = B.buildConstant(S32, Overflow);
3169       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3170     }
3171   }
3172 
3173   if (!BaseReg)
3174     BaseReg = B.buildConstant(S32, 0).getReg(0);
3175 
3176   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3177 }
3178 
3179 /// Handle register layout difference for f16 images for some subtargets.
3180 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3181                                              MachineRegisterInfo &MRI,
3182                                              Register Reg) const {
3183   if (!ST.hasUnpackedD16VMem())
3184     return Reg;
3185 
3186   const LLT S16 = LLT::scalar(16);
3187   const LLT S32 = LLT::scalar(32);
3188   LLT StoreVT = MRI.getType(Reg);
3189   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3190 
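  // With the unpacked layout each 16-bit element occupies the low half of its
  // own dword, so widen every element to 32 bits before rebuilding the vector.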
3191   auto Unmerge = B.buildUnmerge(S16, Reg);
3192 
3193   SmallVector<Register, 4> WideRegs;
3194   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3195     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3196 
3197   int NumElts = StoreVT.getNumElements();
3198 
3199   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3200 }
3201 
3202 Register AMDGPULegalizerInfo::fixStoreSourceType(
3203   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3204   MachineRegisterInfo *MRI = B.getMRI();
3205   LLT Ty = MRI->getType(VData);
3206 
3207   const LLT S16 = LLT::scalar(16);
3208 
3209   // Fixup illegal register types for i8 stores.
3210   if (Ty == LLT::scalar(8) || Ty == S16) {
3211     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3212     return AnyExt;
3213   }
3214 
3215   if (Ty.isVector()) {
3216     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3217       if (IsFormat)
3218         return handleD16VData(B, *MRI, VData);
3219     }
3220   }
3221 
3222   return VData;
3223 }
3224 
3225 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3226                                               MachineRegisterInfo &MRI,
3227                                               MachineIRBuilder &B,
3228                                               bool IsTyped,
3229                                               bool IsFormat) const {
3230   B.setInstr(MI);
3231 
3232   Register VData = MI.getOperand(1).getReg();
3233   LLT Ty = MRI.getType(VData);
3234   LLT EltTy = Ty.getScalarType();
3235   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3236   const LLT S32 = LLT::scalar(32);
3237 
3238   VData = fixStoreSourceType(B, VData, IsFormat);
3239   Register RSrc = MI.getOperand(2).getReg();
3240 
3241   MachineMemOperand *MMO = *MI.memoperands_begin();
3242   const int MemSize = MMO->getSize();
3243 
3244   unsigned ImmOffset;
3245   unsigned TotalOffset;
3246 
3247   // The typed intrinsics add an immediate after the registers.
3248   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3249 
3250   // The struct intrinsic variants add one additional operand over raw.
3251   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3252   Register VIndex;
3253   int OpOffset = 0;
3254   if (HasVIndex) {
3255     VIndex = MI.getOperand(3).getReg();
3256     OpOffset = 1;
3257   }
3258 
3259   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3260   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3261 
3262   unsigned Format = 0;
3263   if (IsTyped) {
3264     Format = MI.getOperand(5 + OpOffset).getImm();
3265     ++OpOffset;
3266   }
3267 
3268   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3269 
3270   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3271   if (TotalOffset != 0)
3272     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3273 
3274   unsigned Opc;
3275   if (IsTyped) {
3276     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3277                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3278   } else if (IsFormat) {
3279     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3280                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3281   } else {
3282     switch (MemSize) {
3283     case 1:
3284       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3285       break;
3286     case 2:
3287       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3288       break;
3289     default:
3290       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3291       break;
3292     }
3293   }
3294 
3295   if (!VIndex)
3296     VIndex = B.buildConstant(S32, 0).getReg(0);
3297 
3298   auto MIB = B.buildInstr(Opc)
3299     .addUse(VData)              // vdata
3300     .addUse(RSrc)               // rsrc
3301     .addUse(VIndex)             // vindex
3302     .addUse(VOffset)            // voffset
3303     .addUse(SOffset)            // soffset
3304     .addImm(ImmOffset);         // offset(imm)
3305 
3306   if (IsTyped)
3307     MIB.addImm(Format);
3308 
3309   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3310      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3311      .addMemOperand(MMO);
3312 
3313   MI.eraseFromParent();
3314   return true;
3315 }
3316 
3317 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3318                                              MachineRegisterInfo &MRI,
3319                                              MachineIRBuilder &B,
3320                                              bool IsFormat,
3321                                              bool IsTyped) const {
3322   B.setInstr(MI);
3323 
3324   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3325   MachineMemOperand *MMO = *MI.memoperands_begin();
3326   const int MemSize = MMO->getSize();
3327   const LLT S32 = LLT::scalar(32);
3328 
3329   Register Dst = MI.getOperand(0).getReg();
3330   Register RSrc = MI.getOperand(2).getReg();
3331 
3332   // The typed intrinsics add an immediate after the registers.
3333   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3334 
3335   // The struct intrinsic variants add one additional operand over raw.
3336   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3337   Register VIndex;
3338   int OpOffset = 0;
3339   if (HasVIndex) {
3340     VIndex = MI.getOperand(3).getReg();
3341     OpOffset = 1;
3342   }
3343 
3344   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3345   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3346 
3347   unsigned Format = 0;
3348   if (IsTyped) {
3349     Format = MI.getOperand(5 + OpOffset).getImm();
3350     ++OpOffset;
3351   }
3352 
3353   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3354   unsigned ImmOffset;
3355   unsigned TotalOffset;
3356 
3357   LLT Ty = MRI.getType(Dst);
3358   LLT EltTy = Ty.getScalarType();
3359   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3360   const bool Unpacked = ST.hasUnpackedD16VMem();
3361 
3362   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3363   if (TotalOffset != 0)
3364     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3365 
3366   unsigned Opc;
3367 
3368   if (IsTyped) {
3369     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3370                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3371   } else if (IsFormat) {
3372     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3373                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3374   } else {
3375     switch (MemSize) {
3376     case 1:
3377       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3378       break;
3379     case 2:
3380       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3381       break;
3382     default:
3383       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3384       break;
3385     }
3386   }
3387 
3388   Register LoadDstReg;
3389 
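  // Sub-dword and d16 scalar results are loaded into a wider 32-bit register
  // and truncated afterwards; unpacked d16 vector results are loaded with one
  // dword per element and repacked below.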
3390   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3391   LLT UnpackedTy = Ty.changeElementSize(32);
3392 
3393   if (IsExtLoad)
3394     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3395   else if (Unpacked && IsD16 && Ty.isVector())
3396     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3397   else
3398     LoadDstReg = Dst;
3399 
3400   if (!VIndex)
3401     VIndex = B.buildConstant(S32, 0).getReg(0);
3402 
3403   auto MIB = B.buildInstr(Opc)
3404     .addDef(LoadDstReg)         // vdata
3405     .addUse(RSrc)               // rsrc
3406     .addUse(VIndex)             // vindex
3407     .addUse(VOffset)            // voffset
3408     .addUse(SOffset)            // soffset
3409     .addImm(ImmOffset);         // offset(imm)
3410 
3411   if (IsTyped)
3412     MIB.addImm(Format);
3413 
3414   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3415      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3416      .addMemOperand(MMO);
3417 
3418   if (LoadDstReg != Dst) {
3419     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3420 
3421     // The result was widened for the extending load; truncate it back down.
3422     if (IsExtLoad)
3423       B.buildTrunc(Dst, LoadDstReg);
3424     else {
3425       // Repack to original 16-bit vector result
3426       // FIXME: G_TRUNC should work, but legalization currently fails
3427       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3428       SmallVector<Register, 4> Repack;
3429       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3430         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3431       B.buildMerge(Dst, Repack);
3432     }
3433   }
3434 
3435   MI.eraseFromParent();
3436   return true;
3437 }
3438 
3439 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3440                                                MachineIRBuilder &B,
3441                                                bool IsInc) const {
3442   B.setInstr(MI);
3443   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3444                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3445   B.buildInstr(Opc)
3446     .addDef(MI.getOperand(0).getReg())
3447     .addUse(MI.getOperand(2).getReg())
3448     .addUse(MI.getOperand(3).getReg())
3449     .cloneMemRefs(MI);
3450   MI.eraseFromParent();
3451   return true;
3452 }
3453 
3454 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3455   switch (IntrID) {
3456   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3457   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3458     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3459   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3460   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3461     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3462   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3463   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3464     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3465   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3466   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3467     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3468   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3469   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3470     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3471   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3472   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3473     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3474   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3475   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3476     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3477   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3478   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3479     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3480   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3482     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3483   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3485     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3486   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3488     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3489   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3495   default:
3496     llvm_unreachable("unhandled atomic opcode");
3497   }
3498 }
3499 
3500 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3501                                                MachineIRBuilder &B,
3502                                                Intrinsic::ID IID) const {
3503   B.setInstr(MI);
3504 
3505   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3506                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3507 
3508   Register Dst = MI.getOperand(0).getReg();
3509   Register VData = MI.getOperand(2).getReg();
3510 
3511   Register CmpVal;
3512   int OpOffset = 0;
3513 
3514   if (IsCmpSwap) {
3515     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3516     ++OpOffset;
3517   }
3518 
3519   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3520   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3521 
3522   // The struct intrinsic variants add one additional operand over raw.
3523   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3524   Register VIndex;
3525   if (HasVIndex) {
3526     VIndex = MI.getOperand(4 + OpOffset).getReg();
3527     ++OpOffset;
3528   }
3529 
3530   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3531   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3532   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3533 
3534   MachineMemOperand *MMO = *MI.memoperands_begin();
3535 
3536   unsigned ImmOffset;
3537   unsigned TotalOffset;
3538   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3539   if (TotalOffset != 0)
3540     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3541 
3542   if (!VIndex)
3543     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3544 
3545   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3546     .addDef(Dst)
3547     .addUse(VData); // vdata
3548 
3549   if (IsCmpSwap)
3550     MIB.addReg(CmpVal);
3551 
3552   MIB.addUse(RSrc)               // rsrc
3553      .addUse(VIndex)             // vindex
3554      .addUse(VOffset)            // voffset
3555      .addUse(SOffset)            // soffset
3556      .addImm(ImmOffset)          // offset(imm)
3557      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3558      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3559      .addMemOperand(MMO);
3560 
3561   MI.eraseFromParent();
3562   return true;
3563 }
3564 
3565 /// Pack the s16 typed vaddr operands of \p MI into dword sized <2 x s16>
3566 /// registers, appending them to \p PackedAddrs.
3567 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3568                                         SmallVectorImpl<Register> &PackedAddrs,
3569                                         int AddrIdx, int DimIdx, int NumVAddrs,
3570                                         int NumGradients) {
3571   const LLT S16 = LLT::scalar(16);
3572   const LLT V2S16 = LLT::vector(2, 16);
3573 
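  // Operands before DimIdx already occupy a full dword and are just bitcast to
  // v2s16; the 16-bit coordinates that follow are packed in pairs, padding the
  // last odd element (and odd gradient halves) with undef.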
3574   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3575     MachineOperand &SrcOp = MI.getOperand(I);
3576     if (!SrcOp.isReg())
3577       continue; // _L to _LZ may have eliminated this.
3578 
3579     Register AddrReg = SrcOp.getReg();
3580 
3581     if (I < DimIdx) {
3582       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3583       PackedAddrs.push_back(AddrReg);
3584     } else {
3585       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3586       // derivatives dx/dh and dx/dv are packed with undef.
3587       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3588           ((NumGradients / 2) % 2 == 1 &&
3589            (I == DimIdx + (NumGradients / 2) - 1 ||
3590             I == DimIdx + NumGradients - 1)) ||
3591           // Check for _L to _LZ optimization
3592           !MI.getOperand(I + 1).isReg()) {
3593         PackedAddrs.push_back(
3594             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3595                 .getReg(0));
3596       } else {
3597         PackedAddrs.push_back(
3598             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3599                 .getReg(0));
3600         ++I;
3601       }
3602     }
3603   }
3604 }
3605 
3606 /// Convert from separate vaddr components to a single vector address register,
3607 /// and replace the remaining operands with $noreg.
3608 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3609                                      int DimIdx, int NumVAddrs) {
3610   const LLT S32 = LLT::scalar(32);
3611 
3612   SmallVector<Register, 8> AddrRegs;
3613   for (int I = 0; I != NumVAddrs; ++I) {
3614     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3615     if (SrcOp.isReg()) {
3616       AddrRegs.push_back(SrcOp.getReg());
3617       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3618     }
3619   }
3620 
3621   int NumAddrRegs = AddrRegs.size();
3622   if (NumAddrRegs != 1) {
3623     // Round up to 8 elements for v5-v7
3624     // FIXME: Missing intermediate sized register classes and instructions.
3625     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3626       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3627       auto Undef = B.buildUndef(S32);
3628       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3629       NumAddrRegs = RoundedNumRegs;
3630     }
3631 
3632     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3633     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3634   }
3635 
3636   for (int I = 1; I != NumVAddrs; ++I) {
3637     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3638     if (SrcOp.isReg())
3639       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3640   }
3641 }
3642 
3643 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3644 ///
3645 /// Depending on the subtarget, load/store with 16-bit element data need to be
3646 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3647 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3648 /// registers.
3649 ///
3650 /// We don't want to directly select image instructions just yet, but also want
3651 /// to expose all register repacking to the legalizer/combiners. We also don't
3652 /// want a selected instruction entering RegBankSelect. In order to avoid
3653 /// defining a multitude of intermediate image instructions, directly hack on
3654 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3655 /// padding now unnecessary arguments with $noreg.
3656 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3657     MachineInstr &MI, MachineIRBuilder &B,
3658     GISelChangeObserver &Observer,
3659     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3660   B.setInstr(MI);
3661 
3662   const int NumDefs = MI.getNumExplicitDefs();
3663   bool IsTFE = NumDefs == 2;
3664   // We are only processing the operands of d16 image operations on subtargets
3665   // that use the unpacked register layout, or need to repack the TFE result.
3666 
3667   // TODO: Do we need to guard against already legalized intrinsics?
3668   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3669     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3670 
3671   MachineRegisterInfo *MRI = B.getMRI();
3672   const LLT S32 = LLT::scalar(32);
3673   const LLT S16 = LLT::scalar(16);
3674   const LLT V2S16 = LLT::vector(2, 16);
3675 
3676   // Index of first address argument
3677   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3678 
3679   // Check for 16-bit addresses and pack them into dwords if found.
3680   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3681   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3682   const bool IsA16 = AddrTy == S16;
3683 
3684   int NumVAddrs, NumGradients;
3685   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3686   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3687     getDMaskIdx(BaseOpcode, NumDefs);
3688   unsigned DMask = 0;
3689 
3690   int DMaskLanes = 0;
3691   if (!BaseOpcode->Atomic) {
3692     DMask = MI.getOperand(DMaskIdx).getImm();
3693     if (BaseOpcode->Gather4) {
3694       DMaskLanes = 4;
3695     } else if (DMask != 0) {
3696       DMaskLanes = countPopulation(DMask);
3697     } else if (!IsTFE && !BaseOpcode->Store) {
3698       // If dmask is 0, this is a no-op load. This can be eliminated.
3699       B.buildUndef(MI.getOperand(0));
3700       MI.eraseFromParent();
3701       return true;
3702     }
3703   }
3704 
3705   Observer.changingInstr(MI);
3706   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3707 
3708   unsigned NewOpcode = NumDefs == 0 ?
3709     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3710 
3711   // Track that we legalized this
3712   MI.setDesc(B.getTII().get(NewOpcode));
3713 
3714   // TFE is on and dmask is 0, so we expect only an error flag. Force dmask to
3715   // be at least 1, otherwise the instruction will fail.
3716   if (IsTFE && DMask == 0) {
3717     DMask = 0x1;
3718     DMaskLanes = 1;
3719     MI.getOperand(DMaskIdx).setImm(DMask);
3720   }
3721 
3722   if (BaseOpcode->Atomic) {
3723     Register VData0 = MI.getOperand(2).getReg();
3724     LLT Ty = MRI->getType(VData0);
3725 
3726     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3727     if (Ty.isVector())
3728       return false;
3729 
3730     if (BaseOpcode->AtomicX2) {
3731       Register VData1 = MI.getOperand(3).getReg();
3732       // The two values are packed in one register.
3733       LLT PackedTy = LLT::vector(2, Ty);
3734       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3735       MI.getOperand(2).setReg(Concat.getReg(0));
3736       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3737     }
3738   }
3739 
3740   int CorrectedNumVAddrs = NumVAddrs;
3741 
3742   // Optimize _L to _LZ when the LOD argument is zero.
3743   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3744         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3745     const ConstantFP *ConstantLod;
3746     const int LodIdx = AddrIdx + NumVAddrs - 1;
3747 
3748     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3749       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3750         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3751         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3752           LZMappingInfo->LZ, ImageDimIntr->Dim);
3753 
3754         // The starting indexes should remain in the same place.
3755         --NumVAddrs;
3756         --CorrectedNumVAddrs;
3757 
3758         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3759           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3760         MI.RemoveOperand(LodIdx);
3761       }
3762     }
3763   }
3764 
3765   // Optimize _mip away when 'lod' is zero.
3766   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3767     int64_t ConstantLod;
3768     const int LodIdx = AddrIdx + NumVAddrs - 1;
3769 
3770     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3771       if (ConstantLod == 0) {
3772         // TODO: Change the intrinsic opcode and remove the operand instead of
3773         // replacing it with 0, as is done for the _L to _LZ handling above.
3774         MI.getOperand(LodIdx).ChangeToImmediate(0);
3775         --CorrectedNumVAddrs;
3776       }
3777     }
3778   }
3779 
3780   // If the register allocator cannot place the address registers contiguously
3781   // without introducing moves, then using the non-sequential address encoding
3782   // is always preferable, since it saves VALU instructions and is usually a
3783   // wash in terms of code size or even better.
3784   //
3785   // However, we currently have no way of hinting to the register allocator
3786   // that MIMG addresses should be placed contiguously when it is possible to
3787   // do so, so force non-NSA for the common 2-address case as a heuristic.
3788   //
3789   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3790   // allocation when possible.
3791   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3792 
3793   // Rewrite the addressing register layout before doing anything else.
3794   if (IsA16) {
3795     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3796     // should be introduced.
3797     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3798       return false;
3799 
3800     if (NumVAddrs > 1) {
3801       SmallVector<Register, 4> PackedRegs;
3802       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3803                                   NumGradients);
3804 
3805       if (!UseNSA && PackedRegs.size() > 1) {
3806         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3807         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3808         PackedRegs[0] = Concat.getReg(0);
3809         PackedRegs.resize(1);
3810       }
3811 
3812       const int NumPacked = PackedRegs.size();
3813       for (int I = 0; I != NumVAddrs; ++I) {
3814         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3815         if (!SrcOp.isReg()) {
3816           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3817           continue;
3818         }
3819 
3820         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3821 
3822         if (I < NumPacked)
3823           SrcOp.setReg(PackedRegs[I]);
3824         else
3825           SrcOp.setReg(AMDGPU::NoRegister);
3826       }
3827     }
3828   } else if (!UseNSA && NumVAddrs > 1) {
3829     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3830   }
3831 
3832 
3833   if (BaseOpcode->Store) { // No TFE for stores?
3834     // TODO: Handle dmask trim
3835     Register VData = MI.getOperand(1).getReg();
3836     LLT Ty = MRI->getType(VData);
3837     if (!Ty.isVector() || Ty.getElementType() != S16)
3838       return true;
3839 
3840     B.setInstr(MI);
3841 
3842     Register RepackedReg = handleD16VData(B, *MRI, VData);
3843     if (RepackedReg != VData) {
3844       MI.getOperand(1).setReg(RepackedReg);
3845     }
3846 
3847     return true;
3848   }
3849 
3850   Register DstReg = MI.getOperand(0).getReg();
3851   LLT Ty = MRI->getType(DstReg);
3852   const LLT EltTy = Ty.getScalarType();
3853   const bool IsD16 = Ty.getScalarType() == S16;
3854   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3855 
3856   // Confirm that the return type is large enough for the dmask specified
3857   if (NumElts < DMaskLanes)
3858     return false;
3859 
3860   if (NumElts > 4 || DMaskLanes > 4)
3861     return false;
3862 
3863   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3864   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3865 
3866   // The raw dword-aligned data component of the load. The only legal cases
3867   // where this matters should be when using the packed D16 format, for
3868   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3869   LLT RoundedTy;
3870 
3871   // S32 vector to cover all data, plus the TFE result element.
3872   LLT TFETy;
3873 
3874   // Register type to use for each loaded component. Will be S32 or V2S16.
3875   LLT RegTy;
3876 
3877   if (IsD16 && ST.hasUnpackedD16VMem()) {
3878     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3879     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3880     RegTy = S32;
3881   } else {
3882     unsigned EltSize = EltTy.getSizeInBits();
3883     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3884     unsigned RoundedSize = 32 * RoundedElts;
3885     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3886     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3887     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3888   }
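  // For example, a packed d16 load with DMaskLanes == 3 gives RoundedTy =
  // <4 x s16> and TFETy = <3 x s32> (two data dwords plus the TFE dword).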
3889 
3890   // The return type does not need adjustment.
3891   // TODO: Should we change s16 case to s32 or <2 x s16>?
3892   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3893     return true;
3894 
3895   Register Dst1Reg;
3896 
3897   // Insert after the instruction.
3898   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3899 
3900   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3901   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3902   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3903   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3904 
3905   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3906 
3907   MI.getOperand(0).setReg(NewResultReg);
3908 
3909   // In the IR, TFE is supposed to be used with a 2 element struct return
3910   // type. The instruction really returns these two values in one contiguous
3911   // register, with one additional dword beyond the loaded data. Rewrite the
3912   // return type to use a single register result.
3913 
3914   if (IsTFE) {
3915     Dst1Reg = MI.getOperand(1).getReg();
3916     if (MRI->getType(Dst1Reg) != S32)
3917       return false;
3918 
3919     // TODO: Make sure the TFE operand bit is set.
3920     MI.RemoveOperand(1);
3921 
3922     // Handle the easy case that requires no repack instructions.
3923     if (Ty == S32) {
3924       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3925       return true;
3926     }
3927   }
3928 
3929   // Now figure out how to copy the new result register back into the old
3930   // result.
3931   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3932 
3933   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3934 
3935   if (ResultNumRegs == 1) {
3936     assert(!IsTFE);
3937     ResultRegs[0] = NewResultReg;
3938   } else {
3939     // We have to repack into a new vector of some kind.
3940     for (int I = 0; I != NumDataRegs; ++I)
3941       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3942     B.buildUnmerge(ResultRegs, NewResultReg);
3943 
3944     // Drop the final TFE element to get the data part. The TFE result is
3945     // directly written to the right place already.
3946     if (IsTFE)
3947       ResultRegs.resize(NumDataRegs);
3948   }
3949 
3950   // For an s16 scalar result, we form an s32 result with a truncate regardless
3951   // of packed vs. unpacked.
3952   if (IsD16 && !Ty.isVector()) {
3953     B.buildTrunc(DstReg, ResultRegs[0]);
3954     return true;
3955   }
3956 
3957   // Avoid a build/concat_vector of 1 entry.
3958   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3959     B.buildBitcast(DstReg, ResultRegs[0]);
3960     return true;
3961   }
3962 
3963   assert(Ty.isVector());
3964 
3965   if (IsD16) {
3966     // For packed D16 results with TFE enabled, all the data components are
3967     // S32. Cast back to the expected type.
3968     //
3969     // TODO: We don't really need to load s32 elements. We would only need one
3970     // cast for the TFE result if a multiple of v2s16 was used.
3971     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3972       for (Register &Reg : ResultRegs)
3973         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3974     } else if (ST.hasUnpackedD16VMem()) {
3975       for (Register &Reg : ResultRegs)
3976         Reg = B.buildTrunc(S16, Reg).getReg(0);
3977     }
3978   }
3979 
3980   auto padWithUndef = [&](LLT Ty, int NumElts) {
3981     if (NumElts == 0)
3982       return;
3983     Register Undef = B.buildUndef(Ty).getReg(0);
3984     for (int I = 0; I != NumElts; ++I)
3985       ResultRegs.push_back(Undef);
3986   };
3987 
3988   // Pad out any elements eliminated due to the dmask.
3989   LLT ResTy = MRI->getType(ResultRegs[0]);
3990   if (!ResTy.isVector()) {
3991     padWithUndef(ResTy, NumElts - ResultRegs.size());
3992     B.buildBuildVector(DstReg, ResultRegs);
3993     return true;
3994   }
3995 
3996   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3997   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3998 
3999   // Deal with the one annoying legal case.
4000   const LLT V3S16 = LLT::vector(3, 16);
4001   if (Ty == V3S16) {
4002     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4003     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4004     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4005     return true;
4006   }
4007 
4008   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4009   B.buildConcatVectors(DstReg, ResultRegs);
4010   return true;
4011 }
4012 
4013 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4014   MachineInstr &MI, MachineIRBuilder &B,
4015   GISelChangeObserver &Observer) const {
4016   Register Dst = MI.getOperand(0).getReg();
4017   LLT Ty = B.getMRI()->getType(Dst);
4018   unsigned Size = Ty.getSizeInBits();
4019   MachineFunction &MF = B.getMF();
4020 
4021   Observer.changingInstr(MI);
4022 
4023   // FIXME: We don't really need this intermediate instruction. The intrinsic
4024   // should be fixed to have a memory operand. Since it's readnone, we're not
4025   // allowed to add one.
4026   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4027   MI.RemoveOperand(1); // Remove intrinsic ID
4028 
4029   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4030   // TODO: Should this use datalayout alignment?
4031   const unsigned MemSize = (Size + 7) / 8;
4032   const Align MemAlign(4);
4033   MachineMemOperand *MMO = MF.getMachineMemOperand(
4034       MachinePointerInfo(),
4035       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4036           MachineMemOperand::MOInvariant,
4037       MemSize, MemAlign);
4038   MI.addMemOperand(MF, MMO);
4039 
4040   // There are no 96-bit result scalar loads, but widening to 128-bit should
4041   // always be legal. We may need to restore this to a 96-bit result if it turns
4042   // out this needs to be converted to a vector load during RegBankSelect.
4043   if (!isPowerOf2_32(Size)) {
4044     LegalizerHelper Helper(MF, *this, Observer, B);
4045     B.setInstr(MI);
4046 
4047     if (Ty.isVector())
4048       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4049     else
4050       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4051   }
4052 
4053   Observer.changedInstr(MI);
4054   return true;
4055 }
4056 
4057 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4058                                                 MachineRegisterInfo &MRI,
4059                                                 MachineIRBuilder &B) const {
4060   B.setInstr(MI);
4061 
4062   // Insert S_ENDPGM on non-HSA paths or when the trap handler is disabled.
4063   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4064       !ST.isTrapHandlerEnabled()) {
4065     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4066   } else {
4067     // Pass queue pointer to trap handler as input, and insert trap instruction
4068     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4069     const ArgDescriptor *Arg =
4070         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4071     if (!Arg)
4072       return false;
4073     MachineRegisterInfo &MRI = *B.getMRI();
4074     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4075     Register LiveIn = getLiveInRegister(
4076         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4077         /*InsertLiveInCopy=*/false);
4078     if (!loadInputValue(LiveIn, B, Arg))
4079       return false;
4080     B.buildCopy(SGPR01, LiveIn);
4081     B.buildInstr(AMDGPU::S_TRAP)
4082         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4083         .addReg(SGPR01, RegState::Implicit);
4084   }
4085 
4086   MI.eraseFromParent();
4087   return true;
4088 }
4089 
4090 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4091     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4092   B.setInstr(MI);
4093 
4094   // If this is a non-HSA path or the trap handler is disabled, report a
4095   // warning accordingly.
4096   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4097       !ST.isTrapHandlerEnabled()) {
4098     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4099                                      "debugtrap handler not supported",
4100                                      MI.getDebugLoc(), DS_Warning);
4101     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4102     Ctx.diagnose(NoTrap);
4103   } else {
4104     // Insert debug-trap instruction
4105     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4106   }
4107 
4108   MI.eraseFromParent();
4109   return true;
4110 }
4111 
4112 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4113                                             MachineIRBuilder &B,
4114                                             GISelChangeObserver &Observer) const {
4115   MachineRegisterInfo &MRI = *B.getMRI();
4116 
4117   // Replace the G_BRCOND user with the exec manipulation and branch pseudos.
4118   auto IntrID = MI.getIntrinsicID();
4119   switch (IntrID) {
4120   case Intrinsic::amdgcn_if:
4121   case Intrinsic::amdgcn_else: {
4122     MachineInstr *Br = nullptr;
4123     MachineBasicBlock *UncondBrTarget = nullptr;
4124     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4125       const SIRegisterInfo *TRI
4126         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4127 
4128       B.setInstr(*BrCond);
4129       Register Def = MI.getOperand(1).getReg();
4130       Register Use = MI.getOperand(3).getReg();
4131 
4132       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
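      // SI_IF/SI_ELSE update EXEC and branch to UncondBrTarget when no lanes
      // remain active, so the branch that followed the intrinsic is redirected
      // to the original conditional target below.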
4133       if (IntrID == Intrinsic::amdgcn_if) {
4134         B.buildInstr(AMDGPU::SI_IF)
4135           .addDef(Def)
4136           .addUse(Use)
4137           .addMBB(UncondBrTarget);
4138       } else {
4139         B.buildInstr(AMDGPU::SI_ELSE)
4140           .addDef(Def)
4141           .addUse(Use)
4142           .addMBB(UncondBrTarget)
4143           .addImm(0);
4144       }
4145 
4146       if (Br) {
4147         Br->getOperand(0).setMBB(CondBrTarget);
4148       } else {
4149         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4150         // since we're swapping branch targets it needs to be reinserted.
4151         // FIXME: IRTranslator should probably not do this
4152         B.buildBr(*CondBrTarget);
4153       }
4154 
4155       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4156       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4157       MI.eraseFromParent();
4158       BrCond->eraseFromParent();
4159       return true;
4160     }
4161 
4162     return false;
4163   }
4164   case Intrinsic::amdgcn_loop: {
4165     MachineInstr *Br = nullptr;
4166     MachineBasicBlock *UncondBrTarget = nullptr;
4167     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4168       const SIRegisterInfo *TRI
4169         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4170 
4171       B.setInstr(*BrCond);
4172 
4173       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4174       Register Reg = MI.getOperand(2).getReg();
4175       B.buildInstr(AMDGPU::SI_LOOP)
4176         .addUse(Reg)
4177         .addMBB(UncondBrTarget);
4178 
4179       if (Br)
4180         Br->getOperand(0).setMBB(CondBrTarget);
4181       else
4182         B.buildBr(*CondBrTarget);
4183 
4184       MI.eraseFromParent();
4185       BrCond->eraseFromParent();
4186       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4187       return true;
4188     }
4189 
4190     return false;
4191   }
4192   case Intrinsic::amdgcn_kernarg_segment_ptr:
4193     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4194       B.setInstr(MI);
4195       // This only makes sense to call in a kernel, so just lower to null.
4196       B.buildConstant(MI.getOperand(0).getReg(), 0);
4197       MI.eraseFromParent();
4198       return true;
4199     }
4200 
4201     return legalizePreloadedArgIntrin(
4202       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4203   case Intrinsic::amdgcn_implicitarg_ptr:
4204     return legalizeImplicitArgPtr(MI, MRI, B);
4205   case Intrinsic::amdgcn_workitem_id_x:
4206     return legalizePreloadedArgIntrin(MI, MRI, B,
4207                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4208   case Intrinsic::amdgcn_workitem_id_y:
4209     return legalizePreloadedArgIntrin(MI, MRI, B,
4210                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4211   case Intrinsic::amdgcn_workitem_id_z:
4212     return legalizePreloadedArgIntrin(MI, MRI, B,
4213                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4214   case Intrinsic::amdgcn_workgroup_id_x:
4215     return legalizePreloadedArgIntrin(MI, MRI, B,
4216                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4217   case Intrinsic::amdgcn_workgroup_id_y:
4218     return legalizePreloadedArgIntrin(MI, MRI, B,
4219                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4220   case Intrinsic::amdgcn_workgroup_id_z:
4221     return legalizePreloadedArgIntrin(MI, MRI, B,
4222                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4223   case Intrinsic::amdgcn_dispatch_ptr:
4224     return legalizePreloadedArgIntrin(MI, MRI, B,
4225                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4226   case Intrinsic::amdgcn_queue_ptr:
4227     return legalizePreloadedArgIntrin(MI, MRI, B,
4228                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4229   case Intrinsic::amdgcn_implicit_buffer_ptr:
4230     return legalizePreloadedArgIntrin(
4231       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4232   case Intrinsic::amdgcn_dispatch_id:
4233     return legalizePreloadedArgIntrin(MI, MRI, B,
4234                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4235   case Intrinsic::amdgcn_fdiv_fast:
4236     return legalizeFDIVFastIntrin(MI, MRI, B);
4237   case Intrinsic::amdgcn_is_shared:
4238     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4239   case Intrinsic::amdgcn_is_private:
4240     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4241   case Intrinsic::amdgcn_wavefrontsize: {
4242     B.setInstr(MI);
4243     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4244     MI.eraseFromParent();
4245     return true;
4246   }
4247   case Intrinsic::amdgcn_s_buffer_load:
4248     return legalizeSBufferLoad(MI, B, Observer);
4249   case Intrinsic::amdgcn_raw_buffer_store:
4250   case Intrinsic::amdgcn_struct_buffer_store:
4251     return legalizeBufferStore(MI, MRI, B, false, false);
4252   case Intrinsic::amdgcn_raw_buffer_store_format:
4253   case Intrinsic::amdgcn_struct_buffer_store_format:
4254     return legalizeBufferStore(MI, MRI, B, false, true);
4255   case Intrinsic::amdgcn_raw_tbuffer_store:
4256   case Intrinsic::amdgcn_struct_tbuffer_store:
4257     return legalizeBufferStore(MI, MRI, B, true, true);
4258   case Intrinsic::amdgcn_raw_buffer_load:
4259   case Intrinsic::amdgcn_struct_buffer_load:
4260     return legalizeBufferLoad(MI, MRI, B, false, false);
4261   case Intrinsic::amdgcn_raw_buffer_load_format:
4262   case Intrinsic::amdgcn_struct_buffer_load_format:
4263     return legalizeBufferLoad(MI, MRI, B, true, false);
4264   case Intrinsic::amdgcn_raw_tbuffer_load:
4265   case Intrinsic::amdgcn_struct_tbuffer_load:
4266     return legalizeBufferLoad(MI, MRI, B, true, true);
4267   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4269   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4270   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4271   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4272   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4273   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4274   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4275   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4276   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4277   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4278   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4279   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4280   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4281   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4282   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4283   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4284   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4285   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4286   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4287   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4288   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4289   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4290   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4291   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4292   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4293     return legalizeBufferAtomic(MI, B, IntrID);
4294   case Intrinsic::amdgcn_atomic_inc:
4295     return legalizeAtomicIncDec(MI, B, true);
4296   case Intrinsic::amdgcn_atomic_dec:
4297     return legalizeAtomicIncDec(MI, B, false);
4298   case Intrinsic::trap:
4299     return legalizeTrapIntrinsic(MI, MRI, B);
4300   case Intrinsic::debugtrap:
4301     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4302   default: {
4303     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4304             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4305       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4306     return true;
4307   }
4308   }
4309 
4310   return true;
4311 }
4312