1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of elements to the next power of two elements
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the number of bits to the next power of two bits
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
70   return [=](const LegalityQuery &Query) {
71     const LLT Ty = Query.Types[TypeIdx];
72     return Ty.isVector() &&
73            Ty.getNumElements() % 2 != 0 &&
74            Ty.getElementType().getSizeInBits() < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     const LLT EltTy = Ty.getScalarType();
83     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
84   };
85 }
86 
87 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getElementType();
91     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
92   };
93 }
94 
95 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     unsigned Size = Ty.getSizeInBits();
100     unsigned Pieces = (Size + 63) / 64;
101     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
102     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
103   };
104 }
105 
106 // Increase the number of vector elements to reach the next multiple of 32-bit
107 // type.
108 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
109   return [=](const LegalityQuery &Query) {
110     const LLT Ty = Query.Types[TypeIdx];
111 
112     const LLT EltTy = Ty.getElementType();
113     const int Size = Ty.getSizeInBits();
114     const int EltSize = EltTy.getSizeInBits();
115     const int NextMul32 = (Size + 31) / 32;
116 
117     assert(EltSize < 32);
118 
119     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
120     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
121   };
122 }
123 
124 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
125   return [=](const LegalityQuery &Query) {
126     const LLT QueryTy = Query.Types[TypeIdx];
127     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
128   };
129 }
130 
131 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
132   return [=](const LegalityQuery &Query) {
133     const LLT QueryTy = Query.Types[TypeIdx];
134     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
135   };
136 }
137 
138 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT QueryTy = Query.Types[TypeIdx];
141     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
142   };
143 }
144 
145 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
146 // v2s16.
147 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     if (Ty.isVector()) {
151       const int EltSize = Ty.getElementType().getSizeInBits();
152       return EltSize == 32 || EltSize == 64 ||
153             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
154              EltSize == 128 || EltSize == 256;
155     }
156 
157     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
158   };
159 }
160 
161 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     if (!QueryTy.isVector())
165       return false;
166     const LLT EltTy = QueryTy.getElementType();
167     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
168   };
169 }
170 
171 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
172   return [=](const LegalityQuery &Query) {
173     const LLT Ty = Query.Types[TypeIdx];
174     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
175            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
176   };
177 }
178 
179 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
180                                          const GCNTargetMachine &TM)
181   :  ST(ST_) {
182   using namespace TargetOpcode;
183 
184   auto GetAddrSpacePtr = [&TM](unsigned AS) {
185     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
186   };
187 
188   const LLT S1 = LLT::scalar(1);
189   const LLT S16 = LLT::scalar(16);
190   const LLT S32 = LLT::scalar(32);
191   const LLT S64 = LLT::scalar(64);
192   const LLT S128 = LLT::scalar(128);
193   const LLT S256 = LLT::scalar(256);
194   const LLT S512 = LLT::scalar(512);
195   const LLT S1024 = LLT::scalar(1024);
196 
197   const LLT V2S16 = LLT::vector(2, 16);
198   const LLT V4S16 = LLT::vector(4, 16);
199 
200   const LLT V2S32 = LLT::vector(2, 32);
201   const LLT V3S32 = LLT::vector(3, 32);
202   const LLT V4S32 = LLT::vector(4, 32);
203   const LLT V5S32 = LLT::vector(5, 32);
204   const LLT V6S32 = LLT::vector(6, 32);
205   const LLT V7S32 = LLT::vector(7, 32);
206   const LLT V8S32 = LLT::vector(8, 32);
207   const LLT V9S32 = LLT::vector(9, 32);
208   const LLT V10S32 = LLT::vector(10, 32);
209   const LLT V11S32 = LLT::vector(11, 32);
210   const LLT V12S32 = LLT::vector(12, 32);
211   const LLT V13S32 = LLT::vector(13, 32);
212   const LLT V14S32 = LLT::vector(14, 32);
213   const LLT V15S32 = LLT::vector(15, 32);
214   const LLT V16S32 = LLT::vector(16, 32);
215   const LLT V32S32 = LLT::vector(32, 32);
216 
217   const LLT V2S64 = LLT::vector(2, 64);
218   const LLT V3S64 = LLT::vector(3, 64);
219   const LLT V4S64 = LLT::vector(4, 64);
220   const LLT V5S64 = LLT::vector(5, 64);
221   const LLT V6S64 = LLT::vector(6, 64);
222   const LLT V7S64 = LLT::vector(7, 64);
223   const LLT V8S64 = LLT::vector(8, 64);
224   const LLT V16S64 = LLT::vector(16, 64);
225 
226   std::initializer_list<LLT> AllS32Vectors =
227     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
228      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
229   std::initializer_list<LLT> AllS64Vectors =
230     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
231 
232   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
233   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
234   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
235   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
236   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
237   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
238   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
239 
240   const LLT CodePtr = FlatPtr;
241 
242   const std::initializer_list<LLT> AddrSpaces64 = {
243     GlobalPtr, ConstantPtr, FlatPtr
244   };
245 
246   const std::initializer_list<LLT> AddrSpaces32 = {
247     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
248   };
249 
250   const std::initializer_list<LLT> FPTypesBase = {
251     S32, S64
252   };
253 
254   const std::initializer_list<LLT> FPTypes16 = {
255     S32, S64, S16
256   };
257 
258   const std::initializer_list<LLT> FPTypesPK16 = {
259     S32, S64, S16, V2S16
260   };
261 
262   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
263 
264   setAction({G_BRCOND, S1}, Legal); // VCC branches
265   setAction({G_BRCOND, S32}, Legal); // SCC branches
266 
267   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
268   // elements for v3s16
269   getActionDefinitionsBuilder(G_PHI)
270     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
271     .legalFor(AllS32Vectors)
272     .legalFor(AllS64Vectors)
273     .legalFor(AddrSpaces64)
274     .legalFor(AddrSpaces32)
275     .clampScalar(0, S32, S256)
276     .widenScalarToNextPow2(0, 32)
277     .clampMaxNumElements(0, S32, 16)
278     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
279     .legalIf(isPointer(0));
280 
281   if (ST.hasVOP3PInsts()) {
282     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
283       .legalFor({S32, S16, V2S16})
284       .clampScalar(0, S16, S32)
285       .clampMaxNumElements(0, S16, 2)
286       .scalarize(0)
287       .widenScalarToNextPow2(0, 32);
288   } else if (ST.has16BitInsts()) {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32, S16})
291       .clampScalar(0, S16, S32)
292       .scalarize(0)
293       .widenScalarToNextPow2(0, 32);
294   } else {
295     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
296       .legalFor({S32})
297       .clampScalar(0, S32, S32)
298       .scalarize(0);
299   }
300 
301   // FIXME: Not really legal. Placeholder for custom lowering.
302   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
303     .customFor({S32, S64})
304     .clampScalar(0, S32, S64)
305     .widenScalarToNextPow2(0, 32)
306     .scalarize(0);
307 
308   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
309     .legalFor({S32})
310     .clampScalar(0, S32, S32)
311     .scalarize(0);
312 
313   // Report legal for any types we can handle anywhere. For the cases only legal
314   // on the SALU, RegBankSelect will be able to re-legalize.
315   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
316     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
317     .clampScalar(0, S32, S64)
318     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
319     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
320     .widenScalarToNextPow2(0)
321     .scalarize(0);
322 
323   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
324                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
325     .legalFor({{S32, S1}, {S32, S32}})
326     .minScalar(0, S32)
327     // TODO: .scalarize(0)
328     .lower();
329 
330   getActionDefinitionsBuilder(G_BITCAST)
331     // Don't worry about the size constraint.
332     .legalIf(all(isRegisterType(0), isRegisterType(1)))
333     .lower();
334 
335 
336   getActionDefinitionsBuilder(G_CONSTANT)
337     .legalFor({S1, S32, S64, S16, GlobalPtr,
338                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
339     .clampScalar(0, S32, S64)
340     .widenScalarToNextPow2(0)
341     .legalIf(isPointer(0));
342 
343   getActionDefinitionsBuilder(G_FCONSTANT)
344     .legalFor({S32, S64, S16})
345     .clampScalar(0, S16, S64);
346 
347   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
348       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
349                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
350       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
351       .clampScalarOrElt(0, S32, S1024)
352       .legalIf(isMultiple32(0))
353       .widenScalarToNextPow2(0, 32)
354       .clampMaxNumElements(0, S32, 16);
355 
356   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
357   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
358     .unsupportedFor({PrivatePtr})
359     .custom();
360   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
361 
362   auto &FPOpActions = getActionDefinitionsBuilder(
363     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
364     .legalFor({S32, S64});
365   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
366     .customFor({S32, S64});
367   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
368     .customFor({S32, S64});
369 
370   if (ST.has16BitInsts()) {
371     if (ST.hasVOP3PInsts())
372       FPOpActions.legalFor({S16, V2S16});
373     else
374       FPOpActions.legalFor({S16});
375 
376     TrigActions.customFor({S16});
377     FDIVActions.customFor({S16});
378   }
379 
380   auto &MinNumMaxNum = getActionDefinitionsBuilder({
381       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
382 
383   if (ST.hasVOP3PInsts()) {
384     MinNumMaxNum.customFor(FPTypesPK16)
385       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
386       .clampMaxNumElements(0, S16, 2)
387       .clampScalar(0, S16, S64)
388       .scalarize(0);
389   } else if (ST.has16BitInsts()) {
390     MinNumMaxNum.customFor(FPTypes16)
391       .clampScalar(0, S16, S64)
392       .scalarize(0);
393   } else {
394     MinNumMaxNum.customFor(FPTypesBase)
395       .clampScalar(0, S32, S64)
396       .scalarize(0);
397   }
398 
399   if (ST.hasVOP3PInsts())
400     FPOpActions.clampMaxNumElements(0, S16, 2);
401 
402   FPOpActions
403     .scalarize(0)
404     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
405 
406   TrigActions
407     .scalarize(0)
408     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
409 
410   FDIVActions
411     .scalarize(0)
412     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
413 
414   getActionDefinitionsBuilder({G_FNEG, G_FABS})
415     .legalFor(FPTypesPK16)
416     .clampMaxNumElements(0, S16, 2)
417     .scalarize(0)
418     .clampScalar(0, S16, S64);
419 
420   if (ST.has16BitInsts()) {
421     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
422       .legalFor({S32, S64, S16})
423       .scalarize(0)
424       .clampScalar(0, S16, S64);
425   } else {
426     getActionDefinitionsBuilder(G_FSQRT)
427       .legalFor({S32, S64})
428       .scalarize(0)
429       .clampScalar(0, S32, S64);
430 
431     if (ST.hasFractBug()) {
432       getActionDefinitionsBuilder(G_FFLOOR)
433         .customFor({S64})
434         .legalFor({S32, S64})
435         .scalarize(0)
436         .clampScalar(0, S32, S64);
437     } else {
438       getActionDefinitionsBuilder(G_FFLOOR)
439         .legalFor({S32, S64})
440         .scalarize(0)
441         .clampScalar(0, S32, S64);
442     }
443   }
444 
445   getActionDefinitionsBuilder(G_FPTRUNC)
446     .legalFor({{S32, S64}, {S16, S32}})
447     .scalarize(0)
448     .lower();
449 
450   getActionDefinitionsBuilder(G_FPEXT)
451     .legalFor({{S64, S32}, {S32, S16}})
452     .lowerFor({{S64, S16}}) // FIXME: Implement
453     .scalarize(0);
454 
455   getActionDefinitionsBuilder(G_FSUB)
456       // Use actual fsub instruction
457       .legalFor({S32})
458       // Must use fadd + fneg
459       .lowerFor({S64, S16, V2S16})
460       .scalarize(0)
461       .clampScalar(0, S32, S64);
462 
463   // Whether this is legal depends on the floating point mode for the function.
464   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
465   if (ST.hasMadF16())
466     FMad.customFor({S32, S16});
467   else
468     FMad.customFor({S32});
469   FMad.scalarize(0)
470       .lower();
471 
472   // TODO: Do we need to clamp maximum bitwidth?
473   getActionDefinitionsBuilder(G_TRUNC)
474     .legalIf(isScalar(0))
475     .legalFor({{V2S16, V2S32}})
476     .clampMaxNumElements(0, S16, 2)
477     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
478     // situations (like an invalid implicit use), we don't want to infinite loop
479     // in the legalizer.
480     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
481     .alwaysLegal();
482 
483   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
484     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
485                {S32, S1}, {S64, S1}, {S16, S1}})
486     .scalarize(0)
487     .clampScalar(0, S32, S64)
488     .widenScalarToNextPow2(1, 32);
489 
490   // TODO: Split s1->s64 during regbankselect for VALU.
491   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
492     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
493     .lowerFor({{S32, S64}})
494     .lowerIf(typeIs(1, S1))
495     .customFor({{S64, S64}});
496   if (ST.has16BitInsts())
497     IToFP.legalFor({{S16, S16}});
498   IToFP.clampScalar(1, S32, S64)
499        .scalarize(0)
500        .widenScalarToNextPow2(1);
501 
502   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
503     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
504     .customFor({{S64, S64}});
505   if (ST.has16BitInsts())
506     FPToI.legalFor({{S16, S16}});
507   else
508     FPToI.minScalar(1, S32);
509 
510   FPToI.minScalar(0, S32)
511        .scalarize(0)
512        .lower();
513 
514   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
515     .scalarize(0)
516     .lower();
517 
518   if (ST.has16BitInsts()) {
519     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
520       .legalFor({S16, S32, S64})
521       .clampScalar(0, S16, S64)
522       .scalarize(0);
523   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
524     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
525       .legalFor({S32, S64})
526       .clampScalar(0, S32, S64)
527       .scalarize(0);
528   } else {
529     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
530       .legalFor({S32})
531       .customFor({S64})
532       .clampScalar(0, S32, S64)
533       .scalarize(0);
534   }
535 
536   // FIXME: Clamp offset operand.
537   getActionDefinitionsBuilder(G_PTR_ADD)
538     .legalIf(isPointer(0))
539     .scalarize(0);
540 
541   getActionDefinitionsBuilder(G_PTRMASK)
542     .legalIf(typeInSet(1, {S64, S32}))
543     .minScalar(1, S32)
544     .maxScalarIf(sizeIs(0, 32), 1, S32)
545     .maxScalarIf(sizeIs(0, 64), 1, S64)
546     .scalarize(0);
547 
548   auto &CmpBuilder =
549     getActionDefinitionsBuilder(G_ICMP)
550     // The compare output type differs based on the register bank of the output,
551     // so make both s1 and s32 legal.
552     //
553     // Scalar compares producing output in scc will be promoted to s32, as that
554     // is the allocatable register type that will be needed for the copy from
555     // scc. This will be promoted during RegBankSelect, and we assume something
556     // before that won't try to use s32 result types.
557     //
558     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
559     // bank.
560     .legalForCartesianProduct(
561       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
562     .legalForCartesianProduct(
563       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
564   if (ST.has16BitInsts()) {
565     CmpBuilder.legalFor({{S1, S16}});
566   }
567 
568   CmpBuilder
569     .widenScalarToNextPow2(1)
570     .clampScalar(1, S32, S64)
571     .scalarize(0)
572     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
573 
574   getActionDefinitionsBuilder(G_FCMP)
575     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
576     .widenScalarToNextPow2(1)
577     .clampScalar(1, S32, S64)
578     .scalarize(0);
579 
580   // FIXME: fpow has a selection pattern that should move to custom lowering.
581   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
582   if (ST.has16BitInsts())
583     Exp2Ops.legalFor({S32, S16});
584   else
585     Exp2Ops.legalFor({S32});
586   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
587   Exp2Ops.scalarize(0);
588 
589   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
590   if (ST.has16BitInsts())
591     ExpOps.customFor({{S32}, {S16}});
592   else
593     ExpOps.customFor({S32});
594   ExpOps.clampScalar(0, MinScalarFPTy, S32)
595         .scalarize(0);
596 
597   // The 64-bit versions produce 32-bit results, but only on the SALU.
598   getActionDefinitionsBuilder(G_CTPOP)
599     .legalFor({{S32, S32}, {S32, S64}})
600     .clampScalar(0, S32, S32)
601     .clampScalar(1, S32, S64)
602     .scalarize(0)
603     .widenScalarToNextPow2(0, 32)
604     .widenScalarToNextPow2(1, 32);
605 
606   // The hardware instructions return a different result on 0 than the generic
607   // instructions expect. The hardware produces -1, but these produce the
608   // bitwidth.
609   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
610     .scalarize(0)
611     .clampScalar(0, S32, S32)
612     .clampScalar(1, S32, S64)
613     .widenScalarToNextPow2(0, 32)
614     .widenScalarToNextPow2(1, 32)
615     .lower();
616 
617   // The 64-bit versions produce 32-bit results, but only on the SALU.
618   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
619     .legalFor({{S32, S32}, {S32, S64}})
620     .clampScalar(0, S32, S32)
621     .clampScalar(1, S32, S64)
622     .scalarize(0)
623     .widenScalarToNextPow2(0, 32)
624     .widenScalarToNextPow2(1, 32);
625 
626   getActionDefinitionsBuilder(G_BITREVERSE)
627     .legalFor({S32})
628     .clampScalar(0, S32, S32)
629     .scalarize(0);
630 
631   if (ST.has16BitInsts()) {
632     getActionDefinitionsBuilder(G_BSWAP)
633       .legalFor({S16, S32, V2S16})
634       .clampMaxNumElements(0, S16, 2)
635       // FIXME: Fixing non-power-of-2 before clamp is workaround for
636       // narrowScalar limitation.
637       .widenScalarToNextPow2(0)
638       .clampScalar(0, S16, S32)
639       .scalarize(0);
640 
641     if (ST.hasVOP3PInsts()) {
642       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
643         .legalFor({S32, S16, V2S16})
644         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
645         .clampMaxNumElements(0, S16, 2)
646         .minScalar(0, S16)
647         .widenScalarToNextPow2(0)
648         .scalarize(0)
649         .lower();
650     } else {
651       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
652         .legalFor({S32, S16})
653         .widenScalarToNextPow2(0)
654         .minScalar(0, S16)
655         .scalarize(0)
656         .lower();
657     }
658   } else {
659     // TODO: Should have same legality without v_perm_b32
660     getActionDefinitionsBuilder(G_BSWAP)
661       .legalFor({S32})
662       .lowerIf(scalarNarrowerThan(0, 32))
663       // FIXME: Fixing non-power-of-2 before clamp is workaround for
664       // narrowScalar limitation.
665       .widenScalarToNextPow2(0)
666       .maxScalar(0, S32)
667       .scalarize(0)
668       .lower();
669 
670     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671       .legalFor({S32})
672       .minScalar(0, S32)
673       .widenScalarToNextPow2(0)
674       .scalarize(0)
675       .lower();
676   }
677 
678   getActionDefinitionsBuilder(G_INTTOPTR)
679     // List the common cases
680     .legalForCartesianProduct(AddrSpaces64, {S64})
681     .legalForCartesianProduct(AddrSpaces32, {S32})
682     .scalarize(0)
683     // Accept any address space as long as the size matches
684     .legalIf(sameSize(0, 1))
685     .widenScalarIf(smallerThan(1, 0),
686       [](const LegalityQuery &Query) {
687         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
688       })
689     .narrowScalarIf(largerThan(1, 0),
690       [](const LegalityQuery &Query) {
691         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
692       });
693 
694   getActionDefinitionsBuilder(G_PTRTOINT)
695     // List the common cases
696     .legalForCartesianProduct(AddrSpaces64, {S64})
697     .legalForCartesianProduct(AddrSpaces32, {S32})
698     .scalarize(0)
699     // Accept any address space as long as the size matches
700     .legalIf(sameSize(0, 1))
701     .widenScalarIf(smallerThan(0, 1),
702       [](const LegalityQuery &Query) {
703         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
704       })
705     .narrowScalarIf(
706       largerThan(0, 1),
707       [](const LegalityQuery &Query) {
708         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
709       });
710 
711   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
712     .scalarize(0)
713     .custom();
714 
715   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
716   // handle some operations by just promoting the register during
717   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
718   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
719     switch (AS) {
720     // FIXME: Private element size.
721     case AMDGPUAS::PRIVATE_ADDRESS:
722       return 32;
723     // FIXME: Check subtarget
724     case AMDGPUAS::LOCAL_ADDRESS:
725       return ST.useDS128() ? 128 : 64;
726 
727     // Treat constant and global as identical. SMRD loads are sometimes usable
728     // for global loads (ideally constant address space should be eliminated)
729     // depending on the context. Legality cannot be context dependent, but
730     // RegBankSelect can split the load as necessary depending on the pointer
731     // register bank/uniformity and if the memory is invariant or not written in
732     // a kernel.
733     case AMDGPUAS::CONSTANT_ADDRESS:
734     case AMDGPUAS::GLOBAL_ADDRESS:
735       return IsLoad ? 512 : 128;
736     default:
737       return 128;
738     }
739   };
740 
741   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
742                                     bool IsLoad) -> bool {
743     const LLT DstTy = Query.Types[0];
744 
745     // Split vector extloads.
746     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
747     unsigned Align = Query.MMODescrs[0].AlignInBits;
748 
749     if (MemSize < DstTy.getSizeInBits())
750       MemSize = std::max(MemSize, Align);
751 
752     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
753       return true;
754 
755     const LLT PtrTy = Query.Types[1];
756     unsigned AS = PtrTy.getAddressSpace();
757     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
758       return true;
759 
760     // Catch weird sized loads that don't evenly divide into the access sizes
761     // TODO: May be able to widen depending on alignment etc.
762     unsigned NumRegs = (MemSize + 31) / 32;
763     if (NumRegs == 3) {
764       if (!ST.hasDwordx3LoadStores())
765         return true;
766     } else {
767       // If the alignment allows, these should have been widened.
768       if (!isPowerOf2_32(NumRegs))
769         return true;
770     }
771 
772     if (Align < MemSize) {
773       const SITargetLowering *TLI = ST.getTargetLowering();
774       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
775     }
776 
777     return false;
778   };
779 
780   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
781     unsigned Size = Query.Types[0].getSizeInBits();
782     if (isPowerOf2_32(Size))
783       return false;
784 
785     if (Size == 96 && ST.hasDwordx3LoadStores())
786       return false;
787 
788     unsigned AddrSpace = Query.Types[1].getAddressSpace();
789     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
790       return false;
791 
792     unsigned Align = Query.MMODescrs[0].AlignInBits;
793     unsigned RoundedSize = NextPowerOf2(Size);
794     return (Align >= RoundedSize);
795   };
796 
797   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
798   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
799   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
800 
801   // TODO: Refine based on subtargets which support unaligned access or 128-bit
802   // LDS
803   // TODO: Unsupported flat for SI.
804 
805   for (unsigned Op : {G_LOAD, G_STORE}) {
806     const bool IsStore = Op == G_STORE;
807 
808     auto &Actions = getActionDefinitionsBuilder(Op);
809     // Whitelist the common cases.
810     // TODO: Loads to s16 on gfx9
811     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
812                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
813                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
814                                       {S128, GlobalPtr, 128, GlobalAlign32},
815                                       {S64, GlobalPtr, 64, GlobalAlign32},
816                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
817                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
818                                       {S32, GlobalPtr, 8, GlobalAlign8},
819                                       {S32, GlobalPtr, 16, GlobalAlign16},
820 
821                                       {S32, LocalPtr, 32, 32},
822                                       {S64, LocalPtr, 64, 32},
823                                       {V2S32, LocalPtr, 64, 32},
824                                       {S32, LocalPtr, 8, 8},
825                                       {S32, LocalPtr, 16, 16},
826                                       {V2S16, LocalPtr, 32, 32},
827 
828                                       {S32, PrivatePtr, 32, 32},
829                                       {S32, PrivatePtr, 8, 8},
830                                       {S32, PrivatePtr, 16, 16},
831                                       {V2S16, PrivatePtr, 32, 32},
832 
833                                       {S32, FlatPtr, 32, GlobalAlign32},
834                                       {S32, FlatPtr, 16, GlobalAlign16},
835                                       {S32, FlatPtr, 8, GlobalAlign8},
836                                       {V2S16, FlatPtr, 32, GlobalAlign32},
837 
838                                       {S32, ConstantPtr, 32, GlobalAlign32},
839                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
840                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
841                                       {S64, ConstantPtr, 64, GlobalAlign32},
842                                       {S128, ConstantPtr, 128, GlobalAlign32},
843                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
844     Actions
845         .customIf(typeIs(1, Constant32Ptr))
846         // Widen suitably aligned loads by loading extra elements.
847         .moreElementsIf([=](const LegalityQuery &Query) {
848             const LLT Ty = Query.Types[0];
849             return Op == G_LOAD && Ty.isVector() &&
850                    shouldWidenLoadResult(Query);
851           }, moreElementsToNextPow2(0))
852         .widenScalarIf([=](const LegalityQuery &Query) {
853             const LLT Ty = Query.Types[0];
854             return Op == G_LOAD && !Ty.isVector() &&
855                    shouldWidenLoadResult(Query);
856           }, widenScalarOrEltToNextPow2(0))
857         .narrowScalarIf(
858             [=](const LegalityQuery &Query) -> bool {
859               return !Query.Types[0].isVector() &&
860                      needToSplitMemOp(Query, Op == G_LOAD);
861             },
862             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
863               const LLT DstTy = Query.Types[0];
864               const LLT PtrTy = Query.Types[1];
865 
866               const unsigned DstSize = DstTy.getSizeInBits();
867               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
868 
869               // Split extloads.
870               if (DstSize > MemSize)
871                 return std::make_pair(0, LLT::scalar(MemSize));
872 
873               if (!isPowerOf2_32(DstSize)) {
874                 // We're probably decomposing an odd sized store. Try to split
875                 // to the widest type. TODO: Account for alignment. As-is it
876                 // should be OK, since the new parts will be further legalized.
877                 unsigned FloorSize = PowerOf2Floor(DstSize);
878                 return std::make_pair(0, LLT::scalar(FloorSize));
879               }
880 
881               if (DstSize > 32 && (DstSize % 32 != 0)) {
882                 // FIXME: Need a way to specify non-extload of larger size if
883                 // suitably aligned.
884                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
885               }
886 
887               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
888                                                      Op == G_LOAD);
889               if (MemSize > MaxSize)
890                 return std::make_pair(0, LLT::scalar(MaxSize));
891 
892               unsigned Align = Query.MMODescrs[0].AlignInBits;
893               return std::make_pair(0, LLT::scalar(Align));
894             })
895         .fewerElementsIf(
896             [=](const LegalityQuery &Query) -> bool {
897               return Query.Types[0].isVector() &&
898                      needToSplitMemOp(Query, Op == G_LOAD);
899             },
900             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
901               const LLT DstTy = Query.Types[0];
902               const LLT PtrTy = Query.Types[1];
903 
904               LLT EltTy = DstTy.getElementType();
905               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
906                                                      Op == G_LOAD);
907 
908               // FIXME: Handle widened to power of 2 results better. This ends
909               // up scalarizing.
910               // FIXME: 3 element stores scalarized on SI
911 
912               // Split if it's too large for the address space.
913               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
914                 unsigned NumElts = DstTy.getNumElements();
915                 unsigned EltSize = EltTy.getSizeInBits();
916 
917                 if (MaxSize % EltSize == 0) {
918                   return std::make_pair(
919                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
920                 }
921 
922                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
923 
924                 // FIXME: Refine when odd breakdowns handled
925                 // The scalars will need to be re-legalized.
926                 if (NumPieces == 1 || NumPieces >= NumElts ||
927                     NumElts % NumPieces != 0)
928                   return std::make_pair(0, EltTy);
929 
930                 return std::make_pair(0,
931                                       LLT::vector(NumElts / NumPieces, EltTy));
932               }
933 
934               // FIXME: We could probably handle weird extending loads better.
935               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
936               if (DstTy.getSizeInBits() > MemSize)
937                 return std::make_pair(0, EltTy);
938 
939               unsigned EltSize = EltTy.getSizeInBits();
940               unsigned DstSize = DstTy.getSizeInBits();
941               if (!isPowerOf2_32(DstSize)) {
942                 // We're probably decomposing an odd sized store. Try to split
943                 // to the widest type. TODO: Account for alignment. As-is it
944                 // should be OK, since the new parts will be further legalized.
945                 unsigned FloorSize = PowerOf2Floor(DstSize);
946                 return std::make_pair(
947                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
948               }
949 
950               // Need to split because of alignment.
951               unsigned Align = Query.MMODescrs[0].AlignInBits;
952               if (EltSize > Align &&
953                   (EltSize / Align < DstTy.getNumElements())) {
954                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
955               }
956 
957               // May need relegalization for the scalars.
958               return std::make_pair(0, EltTy);
959             })
960         .minScalar(0, S32);
961 
962     if (IsStore)
963       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
964 
965     // TODO: Need a bitcast lower option?
966     Actions
967         .legalIf([=](const LegalityQuery &Query) {
968           const LLT Ty0 = Query.Types[0];
969           unsigned Size = Ty0.getSizeInBits();
970           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
971           unsigned Align = Query.MMODescrs[0].AlignInBits;
972 
973           // FIXME: Widening store from alignment not valid.
974           if (MemSize < Size)
975             MemSize = std::max(MemSize, Align);
976 
977           // No extending vector loads.
978           if (Size > MemSize && Ty0.isVector())
979             return false;
980 
981           switch (MemSize) {
982           case 8:
983           case 16:
984             return Size == 32;
985           case 32:
986           case 64:
987           case 128:
988             return true;
989           case 96:
990             return ST.hasDwordx3LoadStores();
991           case 256:
992           case 512:
993             return true;
994           default:
995             return false;
996           }
997         })
998         .widenScalarToNextPow2(0)
999         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1000   }
1001 
1002   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1003                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1004                                                   {S32, GlobalPtr, 16, 2 * 8},
1005                                                   {S32, LocalPtr, 8, 8},
1006                                                   {S32, LocalPtr, 16, 16},
1007                                                   {S32, PrivatePtr, 8, 8},
1008                                                   {S32, PrivatePtr, 16, 16},
1009                                                   {S32, ConstantPtr, 8, 8},
1010                                                   {S32, ConstantPtr, 16, 2 * 8}});
1011   if (ST.hasFlatAddressSpace()) {
1012     ExtLoads.legalForTypesWithMemDesc(
1013         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1014   }
1015 
1016   ExtLoads.clampScalar(0, S32, S32)
1017           .widenScalarToNextPow2(0)
1018           .unsupportedIfMemSizeNotPow2()
1019           .lower();
1020 
1021   auto &Atomics = getActionDefinitionsBuilder(
1022     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1023      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1024      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1025      G_ATOMICRMW_UMIN})
1026     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1027                {S64, GlobalPtr}, {S64, LocalPtr}});
1028   if (ST.hasFlatAddressSpace()) {
1029     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1030   }
1031 
1032   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1033     .legalFor({{S32, LocalPtr}});
1034 
1035   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1036   // demarshalling
1037   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1038     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1039                 {S32, FlatPtr}, {S64, FlatPtr}})
1040     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1041                {S32, RegionPtr}, {S64, RegionPtr}});
1042   // TODO: Pointer types, any 32-bit or 64-bit vector
1043 
1044   // Condition should be s32 for scalar, s1 for vector.
1045   getActionDefinitionsBuilder(G_SELECT)
1046     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1047           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1048           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1049     .clampScalar(0, S16, S64)
1050     .scalarize(1)
1051     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1052     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1053     .clampMaxNumElements(0, S32, 2)
1054     .clampMaxNumElements(0, LocalPtr, 2)
1055     .clampMaxNumElements(0, PrivatePtr, 2)
1056     .scalarize(0)
1057     .widenScalarToNextPow2(0)
1058     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1059 
1060   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1061   // be more flexible with the shift amount type.
1062   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1063     .legalFor({{S32, S32}, {S64, S32}});
1064   if (ST.has16BitInsts()) {
1065     if (ST.hasVOP3PInsts()) {
1066       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1067             .clampMaxNumElements(0, S16, 2);
1068     } else
1069       Shifts.legalFor({{S16, S16}});
1070 
1071     // TODO: Support 16-bit shift amounts for all types
1072     Shifts.widenScalarIf(
1073       [=](const LegalityQuery &Query) {
1074         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1075         // 32-bit amount.
1076         const LLT ValTy = Query.Types[0];
1077         const LLT AmountTy = Query.Types[1];
1078         return ValTy.getSizeInBits() <= 16 &&
1079                AmountTy.getSizeInBits() < 16;
1080       }, changeTo(1, S16));
1081     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1082     Shifts.clampScalar(1, S32, S32);
1083     Shifts.clampScalar(0, S16, S64);
1084     Shifts.widenScalarToNextPow2(0, 16);
1085   } else {
1086     // Make sure we legalize the shift amount type first, as the general
1087     // expansion for the shifted type will produce much worse code if it hasn't
1088     // been truncated already.
1089     Shifts.clampScalar(1, S32, S32);
1090     Shifts.clampScalar(0, S32, S64);
1091     Shifts.widenScalarToNextPow2(0, 32);
1092   }
1093   Shifts.scalarize(0);
1094 
1095   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1096     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1097     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1098     unsigned IdxTypeIdx = 2;
1099 
1100     getActionDefinitionsBuilder(Op)
1101       .customIf([=](const LegalityQuery &Query) {
1102           const LLT EltTy = Query.Types[EltTypeIdx];
1103           const LLT VecTy = Query.Types[VecTypeIdx];
1104           const LLT IdxTy = Query.Types[IdxTypeIdx];
1105           return (EltTy.getSizeInBits() == 16 ||
1106                   EltTy.getSizeInBits() % 32 == 0) &&
1107                  VecTy.getSizeInBits() % 32 == 0 &&
1108                  VecTy.getSizeInBits() <= 1024 &&
1109                  IdxTy.getSizeInBits() == 32;
1110         })
1111       .clampScalar(EltTypeIdx, S32, S64)
1112       .clampScalar(VecTypeIdx, S32, S64)
1113       .clampScalar(IdxTypeIdx, S32, S32);
1114   }
1115 
1116   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1117     .unsupportedIf([=](const LegalityQuery &Query) {
1118         const LLT &EltTy = Query.Types[1].getElementType();
1119         return Query.Types[0] != EltTy;
1120       });
1121 
1122   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1123     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1124     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1125 
1126     // FIXME: Doesn't handle extract of illegal sizes.
1127     getActionDefinitionsBuilder(Op)
1128       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1129       // FIXME: Multiples of 16 should not be legal.
1130       .legalIf([=](const LegalityQuery &Query) {
1131           const LLT BigTy = Query.Types[BigTyIdx];
1132           const LLT LitTy = Query.Types[LitTyIdx];
1133           return (BigTy.getSizeInBits() % 32 == 0) &&
1134                  (LitTy.getSizeInBits() % 16 == 0);
1135         })
1136       .widenScalarIf(
1137         [=](const LegalityQuery &Query) {
1138           const LLT BigTy = Query.Types[BigTyIdx];
1139           return (BigTy.getScalarSizeInBits() < 16);
1140         },
1141         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1142       .widenScalarIf(
1143         [=](const LegalityQuery &Query) {
1144           const LLT LitTy = Query.Types[LitTyIdx];
1145           return (LitTy.getScalarSizeInBits() < 16);
1146         },
1147         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1148       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1149       .widenScalarToNextPow2(BigTyIdx, 32);
1150 
1151   }
1152 
1153   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1154     .legalForCartesianProduct(AllS32Vectors, {S32})
1155     .legalForCartesianProduct(AllS64Vectors, {S64})
1156     .clampNumElements(0, V16S32, V32S32)
1157     .clampNumElements(0, V2S64, V16S64)
1158     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1159 
1160   if (ST.hasScalarPackInsts()) {
1161     BuildVector
1162       // FIXME: Should probably widen s1 vectors straight to s32
1163       .minScalarOrElt(0, S16)
1164       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1165       .minScalar(1, S32);
1166 
1167     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1168       .legalFor({V2S16, S32})
1169       .lower();
1170     BuildVector.minScalarOrElt(0, S32);
1171   } else {
1172     BuildVector.customFor({V2S16, S16});
1173     BuildVector.minScalarOrElt(0, S32);
1174 
1175     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1176       .customFor({V2S16, S32})
1177       .lower();
1178   }
1179 
1180   BuildVector.legalIf(isRegisterType(0));
1181 
1182   // FIXME: Clamp maximum size
1183   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1184     .legalIf(isRegisterType(0));
1185 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1188   if (ST.hasVOP3PInsts()) {
1189     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1190       .customFor({V2S16, V2S16})
1191       .lower();
1192   } else
1193     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1194 
1195   // Merge/Unmerge
1196   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1197     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1198     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1199 
1200     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1201       const LLT Ty = Query.Types[TypeIdx];
1202       if (Ty.isVector()) {
1203         const LLT &EltTy = Ty.getElementType();
1204         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1205           return true;
1206         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1207           return true;
1208       }
1209       return false;
1210     };
1211 
1212     auto &Builder = getActionDefinitionsBuilder(Op)
1213       .lowerFor({{S16, V2S16}})
1214       .lowerIf([=](const LegalityQuery &Query) {
1215           const LLT BigTy = Query.Types[BigTyIdx];
1216           return BigTy.getSizeInBits() == 32;
1217         })
1218       // Try to widen to s16 first for small types.
1219       // TODO: Only do this on targets with legal s16 shifts
1220       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1221       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1222       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1223       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1224                            elementTypeIs(1, S16)),
1225                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1229       .clampScalar(LitTyIdx, S32, S512)
1230       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1231       // Break up vectors with weird elements into scalars
1232       .fewerElementsIf(
1233         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1234         scalarize(0))
1235       .fewerElementsIf(
1236         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1237         scalarize(1))
1238       .clampScalar(BigTyIdx, S32, S1024);
1239 
1240     if (Op == G_MERGE_VALUES) {
1241       Builder.widenScalarIf(
1242         // TODO: Use 16-bit shifts if legal for 8-bit values?
1243         [=](const LegalityQuery &Query) {
1244           const LLT Ty = Query.Types[LitTyIdx];
1245           return Ty.getSizeInBits() < 32;
1246         },
1247         changeTo(LitTyIdx, S32));
1248     }
1249 
1250     Builder.widenScalarIf(
1251       [=](const LegalityQuery &Query) {
1252         const LLT Ty = Query.Types[BigTyIdx];
1253         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1254           Ty.getSizeInBits() % 16 != 0;
1255       },
1256       [=](const LegalityQuery &Query) {
1257         // Pick the next power of 2, or a multiple of 64 over 128.
1258         // Whichever is smaller.
1259         const LLT &Ty = Query.Types[BigTyIdx];
1260         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1261         if (NewSizeInBits >= 256) {
1262           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1263           if (RoundedTo < NewSizeInBits)
1264             NewSizeInBits = RoundedTo;
1265         }
1266         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1267       })
1268       .legalIf([=](const LegalityQuery &Query) {
1269           const LLT &BigTy = Query.Types[BigTyIdx];
1270           const LLT &LitTy = Query.Types[LitTyIdx];
1271 
1272           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1273             return false;
1274           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1275             return false;
1276 
1277           return BigTy.getSizeInBits() % 16 == 0 &&
1278                  LitTy.getSizeInBits() % 16 == 0 &&
1279                  BigTy.getSizeInBits() <= 1024;
1280         })
1281       // Any vectors left are the wrong size. Scalarize them.
1282       .scalarize(0)
1283       .scalarize(1);
1284   }
1285 
1286   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1287   // RegBankSelect.
1288   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1289     .legalFor({{S32}, {S64}});
1290 
1291   if (ST.hasVOP3PInsts()) {
1292     SextInReg.lowerFor({{V2S16}})
1293       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1294       // get more vector shift opportunities, since we'll get those when
1295       // expanded.
1296       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1297   } else if (ST.has16BitInsts()) {
1298     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1299   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1302     SextInReg.lowerFor({{S32}, {S64}});
1303   }
1304 
1305   SextInReg
1306     .scalarize(0)
1307     .clampScalar(0, S32, S64)
1308     .lower();
1309 
1310   getActionDefinitionsBuilder(G_FSHR)
1311     .legalFor({{S32, S32}})
1312     .scalarize(0)
1313     .lower();
1314 
1315   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1316     .legalFor({S64});
1317 
1318   getActionDefinitionsBuilder({
1319       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1320       G_FCOPYSIGN,
1321 
1322       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1323       G_READ_REGISTER,
1324       G_WRITE_REGISTER,
1325 
1326       G_SADDO, G_SSUBO,
1327 
1328        // TODO: Implement
1329       G_FMINIMUM, G_FMAXIMUM,
1330       G_FSHL
1331     }).lower();
1332 
1333   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1334         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1335         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1336     .unsupported();
1337 
1338   computeTables();
1339   verify(*ST.getInstrInfo());
1340 }
1341 
1342 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1343                                          MachineRegisterInfo &MRI,
1344                                          MachineIRBuilder &B,
1345                                          GISelChangeObserver &Observer) const {
1346   switch (MI.getOpcode()) {
1347   case TargetOpcode::G_ADDRSPACE_CAST:
1348     return legalizeAddrSpaceCast(MI, MRI, B);
1349   case TargetOpcode::G_FRINT:
1350     return legalizeFrint(MI, MRI, B);
1351   case TargetOpcode::G_FCEIL:
1352     return legalizeFceil(MI, MRI, B);
1353   case TargetOpcode::G_INTRINSIC_TRUNC:
1354     return legalizeIntrinsicTrunc(MI, MRI, B);
1355   case TargetOpcode::G_SITOFP:
1356     return legalizeITOFP(MI, MRI, B, true);
1357   case TargetOpcode::G_UITOFP:
1358     return legalizeITOFP(MI, MRI, B, false);
1359   case TargetOpcode::G_FPTOSI:
1360     return legalizeFPTOI(MI, MRI, B, true);
1361   case TargetOpcode::G_FPTOUI:
1362     return legalizeFPTOI(MI, MRI, B, false);
1363   case TargetOpcode::G_FMINNUM:
1364   case TargetOpcode::G_FMAXNUM:
1365   case TargetOpcode::G_FMINNUM_IEEE:
1366   case TargetOpcode::G_FMAXNUM_IEEE:
1367     return legalizeMinNumMaxNum(MI, MRI, B);
1368   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1369     return legalizeExtractVectorElt(MI, MRI, B);
1370   case TargetOpcode::G_INSERT_VECTOR_ELT:
1371     return legalizeInsertVectorElt(MI, MRI, B);
1372   case TargetOpcode::G_SHUFFLE_VECTOR:
1373     return legalizeShuffleVector(MI, MRI, B);
1374   case TargetOpcode::G_FSIN:
1375   case TargetOpcode::G_FCOS:
1376     return legalizeSinCos(MI, MRI, B);
1377   case TargetOpcode::G_GLOBAL_VALUE:
1378     return legalizeGlobalValue(MI, MRI, B);
1379   case TargetOpcode::G_LOAD:
1380     return legalizeLoad(MI, MRI, B, Observer);
1381   case TargetOpcode::G_FMAD:
1382     return legalizeFMad(MI, MRI, B);
1383   case TargetOpcode::G_FDIV:
1384     return legalizeFDIV(MI, MRI, B);
1385   case TargetOpcode::G_UDIV:
1386   case TargetOpcode::G_UREM:
1387     return legalizeUDIV_UREM(MI, MRI, B);
1388   case TargetOpcode::G_SDIV:
1389   case TargetOpcode::G_SREM:
1390     return legalizeSDIV_SREM(MI, MRI, B);
1391   case TargetOpcode::G_ATOMIC_CMPXCHG:
1392     return legalizeAtomicCmpXChg(MI, MRI, B);
1393   case TargetOpcode::G_FLOG:
1394     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1395   case TargetOpcode::G_FLOG10:
1396     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1397   case TargetOpcode::G_FEXP:
1398     return legalizeFExp(MI, B);
1399   case TargetOpcode::G_FPOW:
1400     return legalizeFPow(MI, B);
1401   case TargetOpcode::G_FFLOOR:
1402     return legalizeFFloor(MI, MRI, B);
1403   case TargetOpcode::G_BUILD_VECTOR:
1404     return legalizeBuildVector(MI, MRI, B);
1405   default:
1406     return false;
1407   }
1408 
1409   llvm_unreachable("expected switch to return");
1410 }
1411 
1412 Register AMDGPULegalizerInfo::getSegmentAperture(
1413   unsigned AS,
1414   MachineRegisterInfo &MRI,
1415   MachineIRBuilder &B) const {
1416   MachineFunction &MF = B.getMF();
1417   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1418   const LLT S32 = LLT::scalar(32);
1419 
1420   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1421 
1422   if (ST.hasApertureRegs()) {
1423     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1424     // getreg.
1425     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1426         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1427         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1428     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1429         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1430         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1431     unsigned Encoding =
1432         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1433         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1434         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1435 
1436     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437 
1438     B.buildInstr(AMDGPU::S_GETREG_B32)
1439       .addDef(GetReg)
1440       .addImm(Encoding);
1441     MRI.setType(GetReg, S32);
1442 
1443     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1444     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1445   }
1446 
1447   Register QueuePtr = MRI.createGenericVirtualRegister(
1448     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1449 
1450   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1451   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1452     return Register();
1453 
1454   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1455   // private_segment_aperture_base_hi.
1456   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1457 
1458   // TODO: can we be smarter about machine pointer info?
1459   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1460   MachineMemOperand *MMO = MF.getMachineMemOperand(
1461       PtrInfo,
1462       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1463           MachineMemOperand::MOInvariant,
1464       4, commonAlignment(Align(64), StructOffset));
1465 
1466   Register LoadAddr;
1467 
1468   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1469   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1470 }
1471 
1472 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1473   MachineInstr &MI, MachineRegisterInfo &MRI,
1474   MachineIRBuilder &B) const {
1475   MachineFunction &MF = B.getMF();
1476 
1477   B.setInstr(MI);
1478 
1479   const LLT S32 = LLT::scalar(32);
1480   Register Dst = MI.getOperand(0).getReg();
1481   Register Src = MI.getOperand(1).getReg();
1482 
1483   LLT DstTy = MRI.getType(Dst);
1484   LLT SrcTy = MRI.getType(Src);
1485   unsigned DestAS = DstTy.getAddressSpace();
1486   unsigned SrcAS = SrcTy.getAddressSpace();
1487 
1488   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1489   // vector element.
1490   assert(!DstTy.isVector());
1491 
1492   const AMDGPUTargetMachine &TM
1493     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1494 
1495   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1496   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1497     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1498     return true;
1499   }
1500 
1501   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1502     // Truncate.
1503     B.buildExtract(Dst, Src, 0);
1504     MI.eraseFromParent();
1505     return true;
1506   }
1507 
1508   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1509     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1510     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1511 
1512     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1513     // another. Merge operands are required to be the same type, but creating an
1514     // extra ptrtoint would be kind of pointless.
1515     auto HighAddr = B.buildConstant(
1516       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1517     B.buildMerge(Dst, {Src, HighAddr});
1518     MI.eraseFromParent();
1519     return true;
1520   }
1521 
1522   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1523     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1524            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1525     unsigned NullVal = TM.getNullPointerValue(DestAS);
1526 
1527     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1528     auto FlatNull = B.buildConstant(SrcTy, 0);
1529 
1530     // Extract low 32-bits of the pointer.
1531     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1532 
1533     auto CmpRes =
1534         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1535     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1536 
1537     MI.eraseFromParent();
1538     return true;
1539   }
1540 
1541   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1542     return false;
1543 
1544   if (!ST.hasFlatAddressSpace())
1545     return false;
1546 
1547   auto SegmentNull =
1548       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1549   auto FlatNull =
1550       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1551 
1552   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1553   if (!ApertureReg.isValid())
1554     return false;
1555 
1556   auto CmpRes =
1557       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1558 
1559   // Coerce the type of the low half of the result so we can use merge_values.
1560   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1561 
1562   // TODO: Should we allow mismatched types but matching sizes in merges to
1563   // avoid the ptrtoint?
1564   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1565   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1566 
1567   MI.eraseFromParent();
1568   return true;
1569 }
1570 
1571 bool AMDGPULegalizerInfo::legalizeFrint(
1572   MachineInstr &MI, MachineRegisterInfo &MRI,
1573   MachineIRBuilder &B) const {
1574   B.setInstr(MI);
1575 
1576   Register Src = MI.getOperand(1).getReg();
1577   LLT Ty = MRI.getType(Src);
1578   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1579 
1580   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1581   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1582 
1583   auto C1 = B.buildFConstant(Ty, C1Val);
1584   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1585 
1586   // TODO: Should this propagate fast-math-flags?
1587   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1588   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1589 
1590   auto C2 = B.buildFConstant(Ty, C2Val);
1591   auto Fabs = B.buildFAbs(Ty, Src);
1592 
1593   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1594   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1595   return true;
1596 }
1597 
1598 bool AMDGPULegalizerInfo::legalizeFceil(
1599   MachineInstr &MI, MachineRegisterInfo &MRI,
1600   MachineIRBuilder &B) const {
1601   B.setInstr(MI);
1602 
1603   const LLT S1 = LLT::scalar(1);
1604   const LLT S64 = LLT::scalar(64);
1605 
1606   Register Src = MI.getOperand(1).getReg();
1607   assert(MRI.getType(Src) == S64);
1608 
1609   // result = trunc(src)
1610   // if (src > 0.0 && src != result)
1611   //   result += 1.0
1612 
1613   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1614 
1615   const auto Zero = B.buildFConstant(S64, 0.0);
1616   const auto One = B.buildFConstant(S64, 1.0);
1617   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1618   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1619   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1620   auto Add = B.buildSelect(S64, And, One, Zero);
1621 
1622   // TODO: Should this propagate fast-math-flags?
1623   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1624   return true;
1625 }
1626 
1627 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1628                                               MachineIRBuilder &B) {
1629   const unsigned FractBits = 52;
1630   const unsigned ExpBits = 11;
1631   LLT S32 = LLT::scalar(32);
1632 
1633   auto Const0 = B.buildConstant(S32, FractBits - 32);
1634   auto Const1 = B.buildConstant(S32, ExpBits);
1635 
1636   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1637     .addUse(Const0.getReg(0))
1638     .addUse(Const1.getReg(0));
1639 
1640   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1641 }
1642 
1643 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1644   MachineInstr &MI, MachineRegisterInfo &MRI,
1645   MachineIRBuilder &B) const {
1646   B.setInstr(MI);
1647 
1648   const LLT S1 = LLT::scalar(1);
1649   const LLT S32 = LLT::scalar(32);
1650   const LLT S64 = LLT::scalar(64);
1651 
1652   Register Src = MI.getOperand(1).getReg();
1653   assert(MRI.getType(Src) == S64);
1654 
1655   // TODO: Should this use extract since the low half is unused?
1656   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1657   Register Hi = Unmerge.getReg(1);
1658 
1659   // Extract the upper half, since this is where we will find the sign and
1660   // exponent.
1661   auto Exp = extractF64Exponent(Hi, B);
1662 
1663   const unsigned FractBits = 52;
1664 
1665   // Extract the sign bit.
1666   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1667   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1668 
1669   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1670 
1671   const auto Zero32 = B.buildConstant(S32, 0);
1672 
1673   // Extend back to 64-bits.
1674   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1675 
1676   auto Shr = B.buildAShr(S64, FractMask, Exp);
1677   auto Not = B.buildNot(S64, Shr);
1678   auto Tmp0 = B.buildAnd(S64, Src, Not);
1679   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1680 
1681   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1682   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1683 
1684   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1685   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1686   return true;
1687 }
1688 
1689 bool AMDGPULegalizerInfo::legalizeITOFP(
1690   MachineInstr &MI, MachineRegisterInfo &MRI,
1691   MachineIRBuilder &B, bool Signed) const {
1692   B.setInstr(MI);
1693 
1694   Register Dst = MI.getOperand(0).getReg();
1695   Register Src = MI.getOperand(1).getReg();
1696 
1697   const LLT S64 = LLT::scalar(64);
1698   const LLT S32 = LLT::scalar(32);
1699 
1700   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1701 
1702   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1703 
1704   auto CvtHi = Signed ?
1705     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1706     B.buildUITOFP(S64, Unmerge.getReg(1));
1707 
1708   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1709 
1710   auto ThirtyTwo = B.buildConstant(S32, 32);
1711   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1712     .addUse(CvtHi.getReg(0))
1713     .addUse(ThirtyTwo.getReg(0));
1714 
1715   // TODO: Should this propagate fast-math-flags?
1716   B.buildFAdd(Dst, LdExp, CvtLo);
1717   MI.eraseFromParent();
1718   return true;
1719 }
1720 
1721 // TODO: Copied from DAG implementation. Verify logic and document how this
1722 // actually works.
1723 bool AMDGPULegalizerInfo::legalizeFPTOI(
1724   MachineInstr &MI, MachineRegisterInfo &MRI,
1725   MachineIRBuilder &B, bool Signed) const {
1726   B.setInstr(MI);
1727 
1728   Register Dst = MI.getOperand(0).getReg();
1729   Register Src = MI.getOperand(1).getReg();
1730 
1731   const LLT S64 = LLT::scalar(64);
1732   const LLT S32 = LLT::scalar(32);
1733 
1734   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1735 
1736   unsigned Flags = MI.getFlags();
1737 
1738   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1739   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1740   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1741 
1742   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1743   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1744   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1745 
1746   auto Hi = Signed ?
1747     B.buildFPTOSI(S32, FloorMul) :
1748     B.buildFPTOUI(S32, FloorMul);
1749   auto Lo = B.buildFPTOUI(S32, Fma);
1750 
1751   B.buildMerge(Dst, { Lo, Hi });
1752   MI.eraseFromParent();
1753 
1754   return true;
1755 }
1756 
1757 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1758   MachineInstr &MI, MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1762 
1763   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1764                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1765 
1766   // With ieee_mode disabled, the instructions have the correct behavior
1767   // already for G_FMINNUM/G_FMAXNUM
1768   if (!MFI->getMode().IEEE)
1769     return !IsIEEEOp;
1770 
1771   if (IsIEEEOp)
1772     return true;
1773 
1774   MachineIRBuilder HelperBuilder(MI);
1775   GISelObserverWrapper DummyObserver;
1776   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1777   HelperBuilder.setInstr(MI);
1778   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784   // TODO: Should move some of this into LegalizerHelper.
1785 
1786   // TODO: Promote dynamic indexing of s16 to s32
1787 
1788   // FIXME: Artifact combiner probably should have replaced the truncated
1789   // constant before this, so we shouldn't need
1790   // getConstantVRegValWithLookThrough.
1791   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1792     MI.getOperand(2).getReg(), MRI);
1793   if (!IdxVal) // Dynamic case will be selected to register indexing.
1794     return true;
1795 
1796   Register Dst = MI.getOperand(0).getReg();
1797   Register Vec = MI.getOperand(1).getReg();
1798 
1799   LLT VecTy = MRI.getType(Vec);
1800   LLT EltTy = VecTy.getElementType();
1801   assert(EltTy == MRI.getType(Dst));
1802 
1803   B.setInstr(MI);
1804 
1805   if (IdxVal->Value < VecTy.getNumElements())
1806     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1807   else
1808     B.buildUndef(Dst);
1809 
1810   MI.eraseFromParent();
1811   return true;
1812 }
1813 
1814 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1815   MachineInstr &MI, MachineRegisterInfo &MRI,
1816   MachineIRBuilder &B) const {
1817   // TODO: Should move some of this into LegalizerHelper.
1818 
1819   // TODO: Promote dynamic indexing of s16 to s32
1820 
1821   // FIXME: Artifact combiner probably should have replaced the truncated
1822   // constant before this, so we shouldn't need
1823   // getConstantVRegValWithLookThrough.
1824   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1825     MI.getOperand(3).getReg(), MRI);
1826   if (!IdxVal) // Dynamic case will be selected to register indexing.
1827     return true;
1828 
1829   Register Dst = MI.getOperand(0).getReg();
1830   Register Vec = MI.getOperand(1).getReg();
1831   Register Ins = MI.getOperand(2).getReg();
1832 
1833   LLT VecTy = MRI.getType(Vec);
1834   LLT EltTy = VecTy.getElementType();
1835   assert(EltTy == MRI.getType(Ins));
1836 
1837   B.setInstr(MI);
1838 
1839   if (IdxVal->Value < VecTy.getNumElements())
1840     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1841   else
1842     B.buildUndef(Dst);
1843 
1844   MI.eraseFromParent();
1845   return true;
1846 }
1847 
1848 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B) const {
1851   const LLT V2S16 = LLT::vector(2, 16);
1852 
1853   Register Dst = MI.getOperand(0).getReg();
1854   Register Src0 = MI.getOperand(1).getReg();
1855   LLT DstTy = MRI.getType(Dst);
1856   LLT SrcTy = MRI.getType(Src0);
1857 
1858   if (SrcTy == V2S16 && DstTy == V2S16 &&
1859       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1860     return true;
1861 
1862   MachineIRBuilder HelperBuilder(MI);
1863   GISelObserverWrapper DummyObserver;
1864   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1865   HelperBuilder.setInstr(MI);
1866   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1867 }
1868 
1869 bool AMDGPULegalizerInfo::legalizeSinCos(
1870   MachineInstr &MI, MachineRegisterInfo &MRI,
1871   MachineIRBuilder &B) const {
1872   B.setInstr(MI);
1873 
1874   Register DstReg = MI.getOperand(0).getReg();
1875   Register SrcReg = MI.getOperand(1).getReg();
1876   LLT Ty = MRI.getType(DstReg);
1877   unsigned Flags = MI.getFlags();
1878 
1879   Register TrigVal;
1880   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1881   if (ST.hasTrigReducedRange()) {
1882     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1883     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1884       .addUse(MulVal.getReg(0))
1885       .setMIFlags(Flags).getReg(0);
1886   } else
1887     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1888 
1889   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1890     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1891   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1892     .addUse(TrigVal)
1893     .setMIFlags(Flags);
1894   MI.eraseFromParent();
1895   return true;
1896 }
1897 
1898 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1899   Register DstReg, LLT PtrTy,
1900   MachineIRBuilder &B, const GlobalValue *GV,
1901   unsigned Offset, unsigned GAFlags) const {
1902   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1903   // to the following code sequence:
1904   //
1905   // For constant address space:
1906   //   s_getpc_b64 s[0:1]
1907   //   s_add_u32 s0, s0, $symbol
1908   //   s_addc_u32 s1, s1, 0
1909   //
1910   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1911   //   a fixup or relocation is emitted to replace $symbol with a literal
1912   //   constant, which is a pc-relative offset from the encoding of the $symbol
1913   //   operand to the global variable.
1914   //
1915   // For global address space:
1916   //   s_getpc_b64 s[0:1]
1917   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1918   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1919   //
1920   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1921   //   fixups or relocations are emitted to replace $symbol@*@lo and
1922   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1923   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1924   //   operand to the global variable.
1925   //
1926   // What we want here is an offset from the value returned by s_getpc
1927   // (which is the address of the s_add_u32 instruction) to the global
1928   // variable, but since the encoding of $symbol starts 4 bytes after the start
1929   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1930   // small. This requires us to add 4 to the global variable offset in order to
1931   // compute the correct address.
1932 
1933   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1934 
1935   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1936     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1937 
1938   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1939     .addDef(PCReg);
1940 
1941   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1942   if (GAFlags == SIInstrInfo::MO_NONE)
1943     MIB.addImm(0);
1944   else
1945     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1946 
1947   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1948 
1949   if (PtrTy.getSizeInBits() == 32)
1950     B.buildExtract(DstReg, PCReg, 0);
1951   return true;
1952  }
1953 
1954 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1955   MachineInstr &MI, MachineRegisterInfo &MRI,
1956   MachineIRBuilder &B) const {
1957   Register DstReg = MI.getOperand(0).getReg();
1958   LLT Ty = MRI.getType(DstReg);
1959   unsigned AS = Ty.getAddressSpace();
1960 
1961   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1962   MachineFunction &MF = B.getMF();
1963   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1964   B.setInstr(MI);
1965 
1966   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1967     if (!MFI->isEntryFunction()) {
1968       const Function &Fn = MF.getFunction();
1969       DiagnosticInfoUnsupported BadLDSDecl(
1970         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1971         DS_Warning);
1972       Fn.getContext().diagnose(BadLDSDecl);
1973 
1974       // We currently don't have a way to correctly allocate LDS objects that
1975       // aren't directly associated with a kernel. We do force inlining of
1976       // functions that use local objects. However, if these dead functions are
1977       // not eliminated, we don't want a compile time error. Just emit a warning
1978       // and a trap, since there should be no callable path here.
1979       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1980       B.buildUndef(DstReg);
1981       MI.eraseFromParent();
1982       return true;
1983     }
1984 
1985     // TODO: We could emit code to handle the initialization somewhere.
1986     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1987       const SITargetLowering *TLI = ST.getTargetLowering();
1988       if (!TLI->shouldUseLDSConstAddress(GV)) {
1989         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1990         return true; // Leave in place;
1991       }
1992 
1993       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1994       MI.eraseFromParent();
1995       return true;
1996     }
1997 
1998     const Function &Fn = MF.getFunction();
1999     DiagnosticInfoUnsupported BadInit(
2000       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2001     Fn.getContext().diagnose(BadInit);
2002     return true;
2003   }
2004 
2005   const SITargetLowering *TLI = ST.getTargetLowering();
2006 
2007   if (TLI->shouldEmitFixup(GV)) {
2008     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2009     MI.eraseFromParent();
2010     return true;
2011   }
2012 
2013   if (TLI->shouldEmitPCReloc(GV)) {
2014     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2015     MI.eraseFromParent();
2016     return true;
2017   }
2018 
2019   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2020   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2021 
2022   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2023       MachinePointerInfo::getGOT(MF),
2024       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2025           MachineMemOperand::MOInvariant,
2026       8 /*Size*/, Align(8));
2027 
2028   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2029 
2030   if (Ty.getSizeInBits() == 32) {
2031     // Truncate if this is a 32-bit constant adrdess.
2032     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2033     B.buildExtract(DstReg, Load, 0);
2034   } else
2035     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2036 
2037   MI.eraseFromParent();
2038   return true;
2039 }
2040 
2041 bool AMDGPULegalizerInfo::legalizeLoad(
2042   MachineInstr &MI, MachineRegisterInfo &MRI,
2043   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2044   B.setInstr(MI);
2045   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2046   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2047   Observer.changingInstr(MI);
2048   MI.getOperand(1).setReg(Cast.getReg(0));
2049   Observer.changedInstr(MI);
2050   return true;
2051 }
2052 
2053 bool AMDGPULegalizerInfo::legalizeFMad(
2054   MachineInstr &MI, MachineRegisterInfo &MRI,
2055   MachineIRBuilder &B) const {
2056   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2057   assert(Ty.isScalar());
2058 
2059   MachineFunction &MF = B.getMF();
2060   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062   // TODO: Always legal with future ftz flag.
2063   // FIXME: Do we need just output?
2064   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2065     return true;
2066   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2067     return true;
2068 
2069   MachineIRBuilder HelperBuilder(MI);
2070   GISelObserverWrapper DummyObserver;
2071   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2072   HelperBuilder.setInstr(MI);
2073   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2077   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2078   Register DstReg = MI.getOperand(0).getReg();
2079   Register PtrReg = MI.getOperand(1).getReg();
2080   Register CmpVal = MI.getOperand(2).getReg();
2081   Register NewVal = MI.getOperand(3).getReg();
2082 
2083   assert(SITargetLowering::isFlatGlobalAddrSpace(
2084            MRI.getType(PtrReg).getAddressSpace()) &&
2085          "this should not have been custom lowered");
2086 
2087   LLT ValTy = MRI.getType(CmpVal);
2088   LLT VecTy = LLT::vector(2, ValTy);
2089 
2090   B.setInstr(MI);
2091   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2092 
2093   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2094     .addDef(DstReg)
2095     .addUse(PtrReg)
2096     .addUse(PackedVal)
2097     .setMemRefs(MI.memoperands());
2098 
2099   MI.eraseFromParent();
2100   return true;
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeFlog(
2104   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2105   Register Dst = MI.getOperand(0).getReg();
2106   Register Src = MI.getOperand(1).getReg();
2107   LLT Ty = B.getMRI()->getType(Dst);
2108   unsigned Flags = MI.getFlags();
2109   B.setInstr(MI);
2110 
2111   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2112   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2113 
2114   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2115   MI.eraseFromParent();
2116   return true;
2117 }
2118 
2119 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2120                                        MachineIRBuilder &B) const {
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Src = MI.getOperand(1).getReg();
2123   unsigned Flags = MI.getFlags();
2124   LLT Ty = B.getMRI()->getType(Dst);
2125   B.setInstr(MI);
2126 
2127   auto K = B.buildFConstant(Ty, numbers::log2e);
2128   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2129   B.buildFExp2(Dst, Mul, Flags);
2130   MI.eraseFromParent();
2131   return true;
2132 }
2133 
2134 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2135                                        MachineIRBuilder &B) const {
2136   Register Dst = MI.getOperand(0).getReg();
2137   Register Src0 = MI.getOperand(1).getReg();
2138   Register Src1 = MI.getOperand(2).getReg();
2139   unsigned Flags = MI.getFlags();
2140   LLT Ty = B.getMRI()->getType(Dst);
2141   B.setInstr(MI);
2142   const LLT S16 = LLT::scalar(16);
2143   const LLT S32 = LLT::scalar(32);
2144 
2145   if (Ty == S32) {
2146     auto Log = B.buildFLog2(S32, Src0, Flags);
2147     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2148       .addUse(Log.getReg(0))
2149       .addUse(Src1)
2150       .setMIFlags(Flags);
2151     B.buildFExp2(Dst, Mul, Flags);
2152   } else if (Ty == S16) {
2153     // There's no f16 fmul_legacy, so we need to convert for it.
2154     auto Log = B.buildFLog2(S16, Src0, Flags);
2155     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2156     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2157     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2158       .addUse(Ext0.getReg(0))
2159       .addUse(Ext1.getReg(0))
2160       .setMIFlags(Flags);
2161 
2162     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2163   } else
2164     return false;
2165 
2166   MI.eraseFromParent();
2167   return true;
2168 }
2169 
2170 // Find a source register, ignoring any possible source modifiers.
2171 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2172   Register ModSrc = OrigSrc;
2173   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2174     ModSrc = SrcFNeg->getOperand(1).getReg();
2175     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2176       ModSrc = SrcFAbs->getOperand(1).getReg();
2177   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2178     ModSrc = SrcFAbs->getOperand(1).getReg();
2179   return ModSrc;
2180 }
2181 
2182 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2183                                          MachineRegisterInfo &MRI,
2184                                          MachineIRBuilder &B) const {
2185   B.setInstr(MI);
2186 
2187   const LLT S1 = LLT::scalar(1);
2188   const LLT S64 = LLT::scalar(64);
2189   Register Dst = MI.getOperand(0).getReg();
2190   Register OrigSrc = MI.getOperand(1).getReg();
2191   unsigned Flags = MI.getFlags();
2192   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2193          "this should not have been custom lowered");
2194 
2195   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2196   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2197   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2198   // V_FRACT bug is:
2199   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2200   //
2201   // Convert floor(x) to (x - fract(x))
2202 
2203   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2204     .addUse(OrigSrc)
2205     .setMIFlags(Flags);
2206 
2207   // Give source modifier matching some assistance before obscuring a foldable
2208   // pattern.
2209 
2210   // TODO: We can avoid the neg on the fract? The input sign to fract
2211   // shouldn't matter?
2212   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2213 
2214   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2215 
2216   Register Min = MRI.createGenericVirtualRegister(S64);
2217 
2218   // We don't need to concern ourselves with the snan handling difference, so
2219   // use the one which will directly select.
2220   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2221   if (MFI->getMode().IEEE)
2222     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2223   else
2224     B.buildFMinNum(Min, Fract, Const, Flags);
2225 
2226   Register CorrectedFract = Min;
2227   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2228     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2229     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2230   }
2231 
2232   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2233   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2234 
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 // Turn an illegal packed v2s16 build vector into bit operations.
2240 // TODO: This should probably be a bitcast action in LegalizerHelper.
2241 bool AMDGPULegalizerInfo::legalizeBuildVector(
2242   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2243   Register Dst = MI.getOperand(0).getReg();
2244   const LLT S32 = LLT::scalar(32);
2245   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2246 
2247   Register Src0 = MI.getOperand(1).getReg();
2248   Register Src1 = MI.getOperand(2).getReg();
2249   assert(MRI.getType(Src0) == LLT::scalar(16));
2250 
2251   B.setInstr(MI);
2252   auto Merge = B.buildMerge(S32, {Src0, Src1});
2253   B.buildBitcast(Dst, Merge);
2254 
2255   MI.eraseFromParent();
2256   return true;
2257 }
2258 
2259 // Return the use branch instruction, otherwise null if the usage is invalid.
2260 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2261                                        MachineRegisterInfo &MRI,
2262                                        MachineInstr *&Br,
2263                                        MachineBasicBlock *&UncondBrTarget) {
2264   Register CondDef = MI.getOperand(0).getReg();
2265   if (!MRI.hasOneNonDBGUse(CondDef))
2266     return nullptr;
2267 
2268   MachineBasicBlock *Parent = MI.getParent();
2269   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2270   if (UseMI.getParent() != Parent ||
2271       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2272     return nullptr;
2273 
2274   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2275   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2276   if (Next == Parent->end()) {
2277     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2278     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2279       return nullptr;
2280     UncondBrTarget = &*NextMBB;
2281   } else {
2282     if (Next->getOpcode() != AMDGPU::G_BR)
2283       return nullptr;
2284     Br = &*Next;
2285     UncondBrTarget = Br->getOperand(0).getMBB();
2286   }
2287 
2288   return &UseMI;
2289 }
2290 
2291 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2292                                                MachineRegisterInfo &MRI,
2293                                                Register LiveIn,
2294                                                Register PhyReg) const {
2295   assert(PhyReg.isPhysical() && "Physical register expected");
2296 
2297   // Insert the live-in copy, if required, by defining destination virtual
2298   // register.
2299   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2300   if (!MRI.getVRegDef(LiveIn)) {
2301     // FIXME: Should have scoped insert pt
2302     MachineBasicBlock &OrigInsBB = B.getMBB();
2303     auto OrigInsPt = B.getInsertPt();
2304 
2305     MachineBasicBlock &EntryMBB = B.getMF().front();
2306     EntryMBB.addLiveIn(PhyReg);
2307     B.setInsertPt(EntryMBB, EntryMBB.begin());
2308     B.buildCopy(LiveIn, PhyReg);
2309 
2310     B.setInsertPt(OrigInsBB, OrigInsPt);
2311   }
2312 
2313   return LiveIn;
2314 }
2315 
2316 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2317                                                 MachineRegisterInfo &MRI,
2318                                                 Register PhyReg, LLT Ty,
2319                                                 bool InsertLiveInCopy) const {
2320   assert(PhyReg.isPhysical() && "Physical register expected");
2321 
2322   // Get or create virtual live-in regester
2323   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2324   if (!LiveIn) {
2325     LiveIn = MRI.createGenericVirtualRegister(Ty);
2326     MRI.addLiveIn(PhyReg, LiveIn);
2327   }
2328 
2329   // When the actual true copy required is from virtual register to physical
2330   // register (to be inserted later), live-in copy insertion from physical
2331   // to register virtual register is not required
2332   if (!InsertLiveInCopy)
2333     return LiveIn;
2334 
2335   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2336 }
2337 
2338 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2339     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2340   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2341   const ArgDescriptor *Arg;
2342   const TargetRegisterClass *RC;
2343   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2344   if (!Arg) {
2345     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2346     return nullptr;
2347   }
2348   return Arg;
2349 }
2350 
2351 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2352                                          const ArgDescriptor *Arg) const {
2353   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2354     return false; // TODO: Handle these
2355 
2356   Register SrcReg = Arg->getRegister();
2357   assert(SrcReg.isPhysical() && "Physical register expected");
2358   assert(DstReg.isVirtual() && "Virtual register expected");
2359 
2360   MachineRegisterInfo &MRI = *B.getMRI();
2361 
2362   LLT Ty = MRI.getType(DstReg);
2363   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2364 
2365   if (Arg->isMasked()) {
2366     // TODO: Should we try to emit this once in the entry block?
2367     const LLT S32 = LLT::scalar(32);
2368     const unsigned Mask = Arg->getMask();
2369     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2370 
2371     Register AndMaskSrc = LiveIn;
2372 
2373     if (Shift != 0) {
2374       auto ShiftAmt = B.buildConstant(S32, Shift);
2375       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2376     }
2377 
2378     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2379   } else {
2380     B.buildCopy(DstReg, LiveIn);
2381   }
2382 
2383   return true;
2384 }
2385 
2386 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2387     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2388     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2389   B.setInstr(MI);
2390 
2391   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2392   if (!Arg)
2393     return false;
2394 
2395   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2396     return false;
2397 
2398   MI.eraseFromParent();
2399   return true;
2400 }
2401 
2402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2403                                        MachineRegisterInfo &MRI,
2404                                        MachineIRBuilder &B) const {
2405   B.setInstr(MI);
2406   Register Dst = MI.getOperand(0).getReg();
2407   LLT DstTy = MRI.getType(Dst);
2408   LLT S16 = LLT::scalar(16);
2409   LLT S32 = LLT::scalar(32);
2410   LLT S64 = LLT::scalar(64);
2411 
2412   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2413     return true;
2414 
2415   if (DstTy == S16)
2416     return legalizeFDIV16(MI, MRI, B);
2417   if (DstTy == S32)
2418     return legalizeFDIV32(MI, MRI, B);
2419   if (DstTy == S64)
2420     return legalizeFDIV64(MI, MRI, B);
2421 
2422   return false;
2423 }
2424 
2425 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2426   const LLT S32 = LLT::scalar(32);
2427 
2428   auto Cvt0 = B.buildUITOFP(S32, Src);
2429   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2430   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2431   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2432   return B.buildFPTOUI(S32, Mul).getReg(0);
2433 }
2434 
2435 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2436                                                   Register DstReg,
2437                                                   Register Num,
2438                                                   Register Den,
2439                                                   bool IsRem) const {
2440   const LLT S1 = LLT::scalar(1);
2441   const LLT S32 = LLT::scalar(32);
2442 
2443   // RCP =  URECIP(Den) = 2^32 / Den + e
2444   // e is rounding error.
2445   auto RCP = buildDivRCP(B, Den);
2446 
2447   // RCP_LO = mul(RCP, Den)
2448   auto RCP_LO = B.buildMul(S32, RCP, Den);
2449 
2450   // RCP_HI = mulhu (RCP, Den) */
2451   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2452 
2453   // NEG_RCP_LO = -RCP_LO
2454   auto Zero = B.buildConstant(S32, 0);
2455   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2456 
2457   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2458   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2459   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2460 
2461   // Calculate the rounding error from the URECIP instruction
2462   // E = mulhu(ABS_RCP_LO, RCP)
2463   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2464 
2465   // RCP_A_E = RCP + E
2466   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2467 
2468   // RCP_S_E = RCP - E
2469   auto RCP_S_E = B.buildSub(S32, RCP, E);
2470 
2471   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2472   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2473 
2474   // Quotient = mulhu(Tmp0, Num)stmp
2475   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2476 
2477   // Num_S_Remainder = Quotient * Den
2478   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2479 
2480   // Remainder = Num - Num_S_Remainder
2481   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2482 
2483   // Remainder_GE_Den = Remainder >= Den
2484   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2485 
2486   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2487   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2488                                        Num, Num_S_Remainder);
2489 
2490   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2491   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2492 
2493   // Calculate Division result:
2494 
2495   // Quotient_A_One = Quotient + 1
2496   auto One = B.buildConstant(S32, 1);
2497   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2498 
2499   // Quotient_S_One = Quotient - 1
2500   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2501 
2502   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2503   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2504 
2505   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2506   if (IsRem) {
2507     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2508 
2509     // Calculate Rem result:
2510     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2511 
2512     // Remainder_A_Den = Remainder + Den
2513     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2514 
2515     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2516     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2517 
2518     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2519     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2520   } else {
2521     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2522   }
2523 }
2524 
2525 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2526                                               MachineRegisterInfo &MRI,
2527                                               MachineIRBuilder &B) const {
2528   B.setInstr(MI);
2529   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2530   Register DstReg = MI.getOperand(0).getReg();
2531   Register Num = MI.getOperand(1).getReg();
2532   Register Den = MI.getOperand(2).getReg();
2533   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2534   MI.eraseFromParent();
2535   return true;
2536 }
2537 
2538 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2539 //
2540 // Return lo, hi of result
2541 //
2542 // %cvt.lo = G_UITOFP Val.lo
2543 // %cvt.hi = G_UITOFP Val.hi
2544 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2545 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2546 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2547 // %mul2 = G_FMUL %mul1, 2**(-32)
2548 // %trunc = G_INTRINSIC_TRUNC %mul2
2549 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2550 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2551 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2552                                                        Register Val) {
2553   const LLT S32 = LLT::scalar(32);
2554   auto Unmerge = B.buildUnmerge(S32, Val);
2555 
2556   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2557   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2558 
2559   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2560                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2561 
2562   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2563   auto Mul1 =
2564       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2565 
2566   // 2**(-32)
2567   auto Mul2 =
2568       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2569   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2570 
2571   // -(2**32)
2572   auto Mad2 = B.buildFMAD(S32, Trunc,
2573                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2574 
2575   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2576   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2577 
2578   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2579 }
2580 
2581 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2582                                               MachineRegisterInfo &MRI,
2583                                               MachineIRBuilder &B) const {
2584   B.setInstr(MI);
2585 
2586   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2587   const LLT S32 = LLT::scalar(32);
2588   const LLT S64 = LLT::scalar(64);
2589   const LLT S1 = LLT::scalar(1);
2590   Register Numer = MI.getOperand(1).getReg();
2591   Register Denom = MI.getOperand(2).getReg();
2592   Register RcpLo, RcpHi;
2593 
2594   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2595 
2596   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2597 
2598   auto Zero64 = B.buildConstant(S64, 0);
2599   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2600 
2601   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2602   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2603 
2604   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2605   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2606   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2607 
2608   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2609   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2610   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2611   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2612 
2613   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2614   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2615   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2616   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2617   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2618 
2619   auto Zero32 = B.buildConstant(S32, 0);
2620   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2621   auto Add2_HiC =
2622       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2623   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2624   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2625 
2626   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2627   Register NumerLo = UnmergeNumer.getReg(0);
2628   Register NumerHi = UnmergeNumer.getReg(1);
2629 
2630   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2631   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2632   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2633   Register Mul3_Lo = UnmergeMul3.getReg(0);
2634   Register Mul3_Hi = UnmergeMul3.getReg(1);
2635   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2636   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2637   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2638   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2639 
2640   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2641   Register DenomLo = UnmergeDenom.getReg(0);
2642   Register DenomHi = UnmergeDenom.getReg(1);
2643 
2644   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2645   auto C1 = B.buildSExt(S32, CmpHi);
2646 
2647   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2648   auto C2 = B.buildSExt(S32, CmpLo);
2649 
2650   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2651   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2652 
2653   // TODO: Here and below portions of the code can be enclosed into if/endif.
2654   // Currently control flow is unconditional and we have 4 selects after
2655   // potential endif to substitute PHIs.
2656 
2657   // if C3 != 0 ...
2658   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2659   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2660   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2661   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2662 
2663   auto One64 = B.buildConstant(S64, 1);
2664   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2665 
2666   auto C4 =
2667       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2668   auto C5 =
2669       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2670   auto C6 = B.buildSelect(
2671       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2672 
2673   // if (C6 != 0)
2674   auto Add4 = B.buildAdd(S64, Add3, One64);
2675   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2676 
2677   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2678   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2679   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2680 
2681   // endif C6
2682   // endif C3
2683 
2684   if (IsDiv) {
2685     auto Sel1 = B.buildSelect(
2686         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2687     B.buildSelect(MI.getOperand(0),
2688                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2689   } else {
2690     auto Sel2 = B.buildSelect(
2691         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2692     B.buildSelect(MI.getOperand(0),
2693                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2694   }
2695 
2696   MI.eraseFromParent();
2697   return true;
2698 }
2699 
2700 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2701                                             MachineRegisterInfo &MRI,
2702                                             MachineIRBuilder &B) const {
2703   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2704   if (Ty == LLT::scalar(32))
2705     return legalizeUDIV_UREM32(MI, MRI, B);
2706   if (Ty == LLT::scalar(64))
2707     return legalizeUDIV_UREM64(MI, MRI, B);
2708   return false;
2709 }
2710 
2711 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2712                                               MachineRegisterInfo &MRI,
2713                                               MachineIRBuilder &B) const {
2714   B.setInstr(MI);
2715   const LLT S32 = LLT::scalar(32);
2716 
2717   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2718   Register DstReg = MI.getOperand(0).getReg();
2719   Register LHS = MI.getOperand(1).getReg();
2720   Register RHS = MI.getOperand(2).getReg();
2721 
2722   auto ThirtyOne = B.buildConstant(S32, 31);
2723   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2724   auto RHSign = B.buildAShr(S32, LHS, ThirtyOne);
2725 
2726   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2727   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2728 
2729   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2730   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2731 
2732   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2733   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2734 
2735   if (IsRem) {
2736     auto RSign = LHSign; // Remainder sign is the same as LHS
2737     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2738     B.buildSub(DstReg, UDivRem, RSign);
2739   } else {
2740     auto DSign = B.buildXor(S32, LHSign, RHSign);
2741     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2742     B.buildSub(DstReg, UDivRem, DSign);
2743   }
2744 
2745   MI.eraseFromParent();
2746   return true;
2747 }
2748 
2749 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2750                                             MachineRegisterInfo &MRI,
2751                                             MachineIRBuilder &B) const {
2752   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2753     return legalizeSDIV_SREM32(MI, MRI, B);
2754   return false;
2755 }
2756 
2757 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2758                                                  MachineRegisterInfo &MRI,
2759                                                  MachineIRBuilder &B) const {
2760   Register Res = MI.getOperand(0).getReg();
2761   Register LHS = MI.getOperand(1).getReg();
2762   Register RHS = MI.getOperand(2).getReg();
2763 
2764   uint16_t Flags = MI.getFlags();
2765 
2766   LLT ResTy = MRI.getType(Res);
2767   LLT S32 = LLT::scalar(32);
2768   LLT S64 = LLT::scalar(64);
2769 
2770   const MachineFunction &MF = B.getMF();
2771   bool Unsafe =
2772     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2773 
2774   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2775     return false;
2776 
2777   if (!Unsafe && ResTy == S32 &&
2778       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2779     return false;
2780 
2781   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2782     // 1 / x -> RCP(x)
2783     if (CLHS->isExactlyValue(1.0)) {
2784       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2785         .addUse(RHS)
2786         .setMIFlags(Flags);
2787 
2788       MI.eraseFromParent();
2789       return true;
2790     }
2791 
2792     // -1 / x -> RCP( FNEG(x) )
2793     if (CLHS->isExactlyValue(-1.0)) {
2794       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2795       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2796         .addUse(FNeg.getReg(0))
2797         .setMIFlags(Flags);
2798 
2799       MI.eraseFromParent();
2800       return true;
2801     }
2802   }
2803 
2804   // x / y -> x * (1.0 / y)
2805   if (Unsafe) {
2806     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2807       .addUse(RHS)
2808       .setMIFlags(Flags);
2809     B.buildFMul(Res, LHS, RCP, Flags);
2810 
2811     MI.eraseFromParent();
2812     return true;
2813   }
2814 
2815   return false;
2816 }
2817 
2818 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2819                                          MachineRegisterInfo &MRI,
2820                                          MachineIRBuilder &B) const {
2821   B.setInstr(MI);
2822   Register Res = MI.getOperand(0).getReg();
2823   Register LHS = MI.getOperand(1).getReg();
2824   Register RHS = MI.getOperand(2).getReg();
2825 
2826   uint16_t Flags = MI.getFlags();
2827 
2828   LLT S16 = LLT::scalar(16);
2829   LLT S32 = LLT::scalar(32);
2830 
2831   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2832   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2833 
2834   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2835     .addUse(RHSExt.getReg(0))
2836     .setMIFlags(Flags);
2837 
2838   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2839   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2840 
2841   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2842     .addUse(RDst.getReg(0))
2843     .addUse(RHS)
2844     .addUse(LHS)
2845     .setMIFlags(Flags);
2846 
2847   MI.eraseFromParent();
2848   return true;
2849 }
2850 
2851 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2852 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2853 static void toggleSPDenormMode(bool Enable,
2854                                MachineIRBuilder &B,
2855                                const GCNSubtarget &ST,
2856                                AMDGPU::SIModeRegisterDefaults Mode) {
2857   // Set SP denorm mode to this value.
2858   unsigned SPDenormMode =
2859     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2860 
2861   if (ST.hasDenormModeInst()) {
2862     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2863     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2864 
2865     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2866     B.buildInstr(AMDGPU::S_DENORM_MODE)
2867       .addImm(NewDenormModeValue);
2868 
2869   } else {
2870     // Select FP32 bit field in mode register.
2871     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2872                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2873                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2874 
2875     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2876       .addImm(SPDenormMode)
2877       .addImm(SPDenormModeBitField);
2878   }
2879 }
2880 
2881 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2882                                          MachineRegisterInfo &MRI,
2883                                          MachineIRBuilder &B) const {
2884   B.setInstr(MI);
2885   Register Res = MI.getOperand(0).getReg();
2886   Register LHS = MI.getOperand(1).getReg();
2887   Register RHS = MI.getOperand(2).getReg();
2888   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2889   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2890 
2891   uint16_t Flags = MI.getFlags();
2892 
2893   LLT S32 = LLT::scalar(32);
2894   LLT S1 = LLT::scalar(1);
2895 
2896   auto One = B.buildFConstant(S32, 1.0f);
2897 
2898   auto DenominatorScaled =
2899     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2900       .addUse(LHS)
2901       .addUse(RHS)
2902       .addImm(0)
2903       .setMIFlags(Flags);
2904   auto NumeratorScaled =
2905     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2906       .addUse(LHS)
2907       .addUse(RHS)
2908       .addImm(1)
2909       .setMIFlags(Flags);
2910 
2911   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2912     .addUse(DenominatorScaled.getReg(0))
2913     .setMIFlags(Flags);
2914   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2915 
2916   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2917   // aren't modeled as reading it.
2918   if (!Mode.allFP32Denormals())
2919     toggleSPDenormMode(true, B, ST, Mode);
2920 
2921   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2922   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2923   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2924   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2925   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2926   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2927 
2928   if (!Mode.allFP32Denormals())
2929     toggleSPDenormMode(false, B, ST, Mode);
2930 
2931   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2932     .addUse(Fma4.getReg(0))
2933     .addUse(Fma1.getReg(0))
2934     .addUse(Fma3.getReg(0))
2935     .addUse(NumeratorScaled.getReg(1))
2936     .setMIFlags(Flags);
2937 
2938   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2939     .addUse(Fmas.getReg(0))
2940     .addUse(RHS)
2941     .addUse(LHS)
2942     .setMIFlags(Flags);
2943 
2944   MI.eraseFromParent();
2945   return true;
2946 }
2947 
2948 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2949                                          MachineRegisterInfo &MRI,
2950                                          MachineIRBuilder &B) const {
2951   B.setInstr(MI);
2952   Register Res = MI.getOperand(0).getReg();
2953   Register LHS = MI.getOperand(1).getReg();
2954   Register RHS = MI.getOperand(2).getReg();
2955 
2956   uint16_t Flags = MI.getFlags();
2957 
2958   LLT S64 = LLT::scalar(64);
2959   LLT S1 = LLT::scalar(1);
2960 
2961   auto One = B.buildFConstant(S64, 1.0);
2962 
2963   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2964     .addUse(LHS)
2965     .addUse(RHS)
2966     .addImm(0)
2967     .setMIFlags(Flags);
2968 
2969   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2970 
2971   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2972     .addUse(DivScale0.getReg(0))
2973     .setMIFlags(Flags);
2974 
2975   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2976   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2977   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2978 
2979   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2980     .addUse(LHS)
2981     .addUse(RHS)
2982     .addImm(1)
2983     .setMIFlags(Flags);
2984 
2985   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2986   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2987   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2988 
2989   Register Scale;
2990   if (!ST.hasUsableDivScaleConditionOutput()) {
2991     // Workaround a hardware bug on SI where the condition output from div_scale
2992     // is not usable.
2993 
2994     LLT S32 = LLT::scalar(32);
2995 
2996     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2997     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2998     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2999     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3000 
3001     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3002                               Scale1Unmerge.getReg(1));
3003     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3004                               Scale0Unmerge.getReg(1));
3005     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3006   } else {
3007     Scale = DivScale1.getReg(1);
3008   }
3009 
3010   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3011     .addUse(Fma4.getReg(0))
3012     .addUse(Fma3.getReg(0))
3013     .addUse(Mul.getReg(0))
3014     .addUse(Scale)
3015     .setMIFlags(Flags);
3016 
3017   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3018     .addUse(Fmas.getReg(0))
3019     .addUse(RHS)
3020     .addUse(LHS)
3021     .setMIFlags(Flags);
3022 
3023   MI.eraseFromParent();
3024   return true;
3025 }
3026 
3027 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3028                                                  MachineRegisterInfo &MRI,
3029                                                  MachineIRBuilder &B) const {
3030   B.setInstr(MI);
3031   Register Res = MI.getOperand(0).getReg();
3032   Register LHS = MI.getOperand(2).getReg();
3033   Register RHS = MI.getOperand(3).getReg();
3034   uint16_t Flags = MI.getFlags();
3035 
3036   LLT S32 = LLT::scalar(32);
3037   LLT S1 = LLT::scalar(1);
3038 
3039   auto Abs = B.buildFAbs(S32, RHS, Flags);
3040   const APFloat C0Val(1.0f);
3041 
3042   auto C0 = B.buildConstant(S32, 0x6f800000);
3043   auto C1 = B.buildConstant(S32, 0x2f800000);
3044   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3045 
3046   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3047   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3048 
3049   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3050 
3051   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3052     .addUse(Mul0.getReg(0))
3053     .setMIFlags(Flags);
3054 
3055   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3056 
3057   B.buildFMul(Res, Sel, Mul1, Flags);
3058 
3059   MI.eraseFromParent();
3060   return true;
3061 }
3062 
3063 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3064                                                  MachineRegisterInfo &MRI,
3065                                                  MachineIRBuilder &B) const {
3066   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3067   if (!MFI->isEntryFunction()) {
3068     return legalizePreloadedArgIntrin(MI, MRI, B,
3069                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3070   }
3071 
3072   B.setInstr(MI);
3073 
3074   uint64_t Offset =
3075     ST.getTargetLowering()->getImplicitParameterOffset(
3076       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3077   Register DstReg = MI.getOperand(0).getReg();
3078   LLT DstTy = MRI.getType(DstReg);
3079   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3080 
3081   const ArgDescriptor *Arg;
3082   const TargetRegisterClass *RC;
3083   std::tie(Arg, RC)
3084     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3085   if (!Arg)
3086     return false;
3087 
3088   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3089   if (!loadInputValue(KernargPtrReg, B, Arg))
3090     return false;
3091 
3092   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3093   MI.eraseFromParent();
3094   return true;
3095 }
3096 
3097 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3098                                               MachineRegisterInfo &MRI,
3099                                               MachineIRBuilder &B,
3100                                               unsigned AddrSpace) const {
3101   B.setInstr(MI);
3102   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3103   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3104   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3105   MI.eraseFromParent();
3106   return true;
3107 }
3108 
3109 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3110 // offset (the offset that is included in bounds checking and swizzling, to be
3111 // split between the instruction's voffset and immoffset fields) and soffset
3112 // (the offset that is excluded from bounds checking and swizzling, to go in
3113 // the instruction's soffset field).  This function takes the first kind of
3114 // offset and figures out how to split it between voffset and immoffset.
3115 std::tuple<Register, unsigned, unsigned>
3116 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3117                                         Register OrigOffset) const {
3118   const unsigned MaxImm = 4095;
3119   Register BaseReg;
3120   unsigned TotalConstOffset;
3121   MachineInstr *OffsetDef;
3122   const LLT S32 = LLT::scalar(32);
3123 
3124   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3125     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3126 
3127   unsigned ImmOffset = TotalConstOffset;
3128 
3129   // If the immediate value is too big for the immoffset field, put the value
3130   // and -4096 into the immoffset field so that the value that is copied/added
3131   // for the voffset field is a multiple of 4096, and it stands more chance
3132   // of being CSEd with the copy/add for another similar load/store.
3133   // However, do not do that rounding down to a multiple of 4096 if that is a
3134   // negative number, as it appears to be illegal to have a negative offset
3135   // in the vgpr, even if adding the immediate offset makes it positive.
3136   unsigned Overflow = ImmOffset & ~MaxImm;
3137   ImmOffset -= Overflow;
3138   if ((int32_t)Overflow < 0) {
3139     Overflow += ImmOffset;
3140     ImmOffset = 0;
3141   }
3142 
3143   if (Overflow != 0) {
3144     if (!BaseReg) {
3145       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3146     } else {
3147       auto OverflowVal = B.buildConstant(S32, Overflow);
3148       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3149     }
3150   }
3151 
3152   if (!BaseReg)
3153     BaseReg = B.buildConstant(S32, 0).getReg(0);
3154 
3155   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3156 }
3157 
3158 /// Handle register layout difference for f16 images for some subtargets.
3159 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3160                                              MachineRegisterInfo &MRI,
3161                                              Register Reg) const {
3162   if (!ST.hasUnpackedD16VMem())
3163     return Reg;
3164 
3165   const LLT S16 = LLT::scalar(16);
3166   const LLT S32 = LLT::scalar(32);
3167   LLT StoreVT = MRI.getType(Reg);
3168   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3169 
3170   auto Unmerge = B.buildUnmerge(S16, Reg);
3171 
3172   SmallVector<Register, 4> WideRegs;
3173   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3174     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3175 
3176   int NumElts = StoreVT.getNumElements();
3177 
3178   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3179 }
3180 
3181 Register AMDGPULegalizerInfo::fixStoreSourceType(
3182   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3183   MachineRegisterInfo *MRI = B.getMRI();
3184   LLT Ty = MRI->getType(VData);
3185 
3186   const LLT S16 = LLT::scalar(16);
3187 
3188   // Fixup illegal register types for i8 stores.
3189   if (Ty == LLT::scalar(8) || Ty == S16) {
3190     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3191     return AnyExt;
3192   }
3193 
3194   if (Ty.isVector()) {
3195     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3196       if (IsFormat)
3197         return handleD16VData(B, *MRI, VData);
3198     }
3199   }
3200 
3201   return VData;
3202 }
3203 
3204 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3205                                               MachineRegisterInfo &MRI,
3206                                               MachineIRBuilder &B,
3207                                               bool IsTyped,
3208                                               bool IsFormat) const {
3209   B.setInstr(MI);
3210 
3211   Register VData = MI.getOperand(1).getReg();
3212   LLT Ty = MRI.getType(VData);
3213   LLT EltTy = Ty.getScalarType();
3214   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3215   const LLT S32 = LLT::scalar(32);
3216 
3217   VData = fixStoreSourceType(B, VData, IsFormat);
3218   Register RSrc = MI.getOperand(2).getReg();
3219 
3220   MachineMemOperand *MMO = *MI.memoperands_begin();
3221   const int MemSize = MMO->getSize();
3222 
3223   unsigned ImmOffset;
3224   unsigned TotalOffset;
3225 
3226   // The typed intrinsics add an immediate after the registers.
3227   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3228 
3229   // The struct intrinsic variants add one additional operand over raw.
3230   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3231   Register VIndex;
3232   int OpOffset = 0;
3233   if (HasVIndex) {
3234     VIndex = MI.getOperand(3).getReg();
3235     OpOffset = 1;
3236   }
3237 
3238   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3239   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3240 
3241   unsigned Format = 0;
3242   if (IsTyped) {
3243     Format = MI.getOperand(5 + OpOffset).getImm();
3244     ++OpOffset;
3245   }
3246 
3247   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3248 
3249   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3250   if (TotalOffset != 0)
3251     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3252 
3253   unsigned Opc;
3254   if (IsTyped) {
3255     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3256                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3257   } else if (IsFormat) {
3258     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3259                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3260   } else {
3261     switch (MemSize) {
3262     case 1:
3263       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3264       break;
3265     case 2:
3266       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3267       break;
3268     default:
3269       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3270       break;
3271     }
3272   }
3273 
3274   if (!VIndex)
3275     VIndex = B.buildConstant(S32, 0).getReg(0);
3276 
3277   auto MIB = B.buildInstr(Opc)
3278     .addUse(VData)              // vdata
3279     .addUse(RSrc)               // rsrc
3280     .addUse(VIndex)             // vindex
3281     .addUse(VOffset)            // voffset
3282     .addUse(SOffset)            // soffset
3283     .addImm(ImmOffset);         // offset(imm)
3284 
3285   if (IsTyped)
3286     MIB.addImm(Format);
3287 
3288   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3289      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3290      .addMemOperand(MMO);
3291 
3292   MI.eraseFromParent();
3293   return true;
3294 }
3295 
3296 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3297                                              MachineRegisterInfo &MRI,
3298                                              MachineIRBuilder &B,
3299                                              bool IsFormat,
3300                                              bool IsTyped) const {
3301   B.setInstr(MI);
3302 
3303   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3304   MachineMemOperand *MMO = *MI.memoperands_begin();
3305   const int MemSize = MMO->getSize();
3306   const LLT S32 = LLT::scalar(32);
3307 
3308   Register Dst = MI.getOperand(0).getReg();
3309   Register RSrc = MI.getOperand(2).getReg();
3310 
3311   // The typed intrinsics add an immediate after the registers.
3312   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3313 
3314   // The struct intrinsic variants add one additional operand over raw.
3315   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3316   Register VIndex;
3317   int OpOffset = 0;
3318   if (HasVIndex) {
3319     VIndex = MI.getOperand(3).getReg();
3320     OpOffset = 1;
3321   }
3322 
3323   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3324   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3325 
3326   unsigned Format = 0;
3327   if (IsTyped) {
3328     Format = MI.getOperand(5 + OpOffset).getImm();
3329     ++OpOffset;
3330   }
3331 
3332   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3333   unsigned ImmOffset;
3334   unsigned TotalOffset;
3335 
3336   LLT Ty = MRI.getType(Dst);
3337   LLT EltTy = Ty.getScalarType();
3338   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3339   const bool Unpacked = ST.hasUnpackedD16VMem();
3340 
3341   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3342   if (TotalOffset != 0)
3343     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3344 
3345   unsigned Opc;
3346 
3347   if (IsTyped) {
3348     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3349                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3350   } else if (IsFormat) {
3351     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3352                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3353   } else {
3354     switch (MemSize) {
3355     case 1:
3356       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3357       break;
3358     case 2:
3359       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3360       break;
3361     default:
3362       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3363       break;
3364     }
3365   }
3366 
3367   Register LoadDstReg;
3368 
3369   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3370   LLT UnpackedTy = Ty.changeElementSize(32);
3371 
3372   if (IsExtLoad)
3373     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3374   else if (Unpacked && IsD16 && Ty.isVector())
3375     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3376   else
3377     LoadDstReg = Dst;
3378 
3379   if (!VIndex)
3380     VIndex = B.buildConstant(S32, 0).getReg(0);
3381 
3382   auto MIB = B.buildInstr(Opc)
3383     .addDef(LoadDstReg)         // vdata
3384     .addUse(RSrc)               // rsrc
3385     .addUse(VIndex)             // vindex
3386     .addUse(VOffset)            // voffset
3387     .addUse(SOffset)            // soffset
3388     .addImm(ImmOffset);         // offset(imm)
3389 
3390   if (IsTyped)
3391     MIB.addImm(Format);
3392 
3393   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3394      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3395      .addMemOperand(MMO);
3396 
3397   if (LoadDstReg != Dst) {
3398     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3399 
3400     // Widen result for extending loads was widened.
3401     if (IsExtLoad)
3402       B.buildTrunc(Dst, LoadDstReg);
3403     else {
3404       // Repack to original 16-bit vector result
3405       // FIXME: G_TRUNC should work, but legalization currently fails
3406       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3407       SmallVector<Register, 4> Repack;
3408       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3409         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3410       B.buildMerge(Dst, Repack);
3411     }
3412   }
3413 
3414   MI.eraseFromParent();
3415   return true;
3416 }
3417 
3418 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3419                                                MachineIRBuilder &B,
3420                                                bool IsInc) const {
3421   B.setInstr(MI);
3422   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3423                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3424   B.buildInstr(Opc)
3425     .addDef(MI.getOperand(0).getReg())
3426     .addUse(MI.getOperand(2).getReg())
3427     .addUse(MI.getOperand(3).getReg())
3428     .cloneMemRefs(MI);
3429   MI.eraseFromParent();
3430   return true;
3431 }
3432 
3433 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3434   switch (IntrID) {
3435   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3436   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3437     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3438   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3439   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3440     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3441   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3442   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3443     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3444   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3445   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3446     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3447   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3448   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3449     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3450   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3451   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3452     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3453   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3454   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3455     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3456   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3457   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3458     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3459   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3460   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3461     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3462   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3463   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3464     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3465   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3466   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3467     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3468   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3469   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3470     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3471   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3472   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3473     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3474   default:
3475     llvm_unreachable("unhandled atomic opcode");
3476   }
3477 }
3478 
3479 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3480                                                MachineIRBuilder &B,
3481                                                Intrinsic::ID IID) const {
3482   B.setInstr(MI);
3483 
3484   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3485                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3486 
3487   Register Dst = MI.getOperand(0).getReg();
3488   Register VData = MI.getOperand(2).getReg();
3489 
3490   Register CmpVal;
3491   int OpOffset = 0;
3492 
3493   if (IsCmpSwap) {
3494     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3495     ++OpOffset;
3496   }
3497 
3498   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3499   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3500 
3501   // The struct intrinsic variants add one additional operand over raw.
3502   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3503   Register VIndex;
3504   if (HasVIndex) {
3505     VIndex = MI.getOperand(4 + OpOffset).getReg();
3506     ++OpOffset;
3507   }
3508 
3509   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3510   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3511   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3512 
3513   MachineMemOperand *MMO = *MI.memoperands_begin();
3514 
3515   unsigned ImmOffset;
3516   unsigned TotalOffset;
3517   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3518   if (TotalOffset != 0)
3519     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3520 
3521   if (!VIndex)
3522     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3523 
3524   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3525     .addDef(Dst)
3526     .addUse(VData); // vdata
3527 
3528   if (IsCmpSwap)
3529     MIB.addReg(CmpVal);
3530 
3531   MIB.addUse(RSrc)               // rsrc
3532      .addUse(VIndex)             // vindex
3533      .addUse(VOffset)            // voffset
3534      .addUse(SOffset)            // soffset
3535      .addImm(ImmOffset)          // offset(imm)
3536      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3537      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3538      .addMemOperand(MMO);
3539 
3540   MI.eraseFromParent();
3541   return true;
3542 }
3543 
3544 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3545 /// vector with s16 typed elements.
3546 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3547                                         SmallVectorImpl<Register> &PackedAddrs,
3548                                         int AddrIdx, int DimIdx, int NumVAddrs,
3549                                         int NumGradients) {
3550   const LLT S16 = LLT::scalar(16);
3551   const LLT V2S16 = LLT::vector(2, 16);
3552 
3553   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3554     MachineOperand &SrcOp = MI.getOperand(I);
3555     if (!SrcOp.isReg())
3556       continue; // _L to _LZ may have eliminated this.
3557 
3558     Register AddrReg = SrcOp.getReg();
3559 
3560     if (I < DimIdx) {
3561       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3562       PackedAddrs.push_back(AddrReg);
3563     } else {
3564       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3565       // derivatives dx/dh and dx/dv are packed with undef.
3566       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3567           ((NumGradients / 2) % 2 == 1 &&
3568            (I == DimIdx + (NumGradients / 2) - 1 ||
3569             I == DimIdx + NumGradients - 1)) ||
3570           // Check for _L to _LZ optimization
3571           !MI.getOperand(I + 1).isReg()) {
3572         PackedAddrs.push_back(
3573             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3574                 .getReg(0));
3575       } else {
3576         PackedAddrs.push_back(
3577             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3578                 .getReg(0));
3579         ++I;
3580       }
3581     }
3582   }
3583 }
3584 
3585 /// Convert from separate vaddr components to a single vector address register,
3586 /// and replace the remaining operands with $noreg.
3587 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3588                                      int DimIdx, int NumVAddrs) {
3589   const LLT S32 = LLT::scalar(32);
3590 
3591   SmallVector<Register, 8> AddrRegs;
3592   for (int I = 0; I != NumVAddrs; ++I) {
3593     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3594     if (SrcOp.isReg()) {
3595       AddrRegs.push_back(SrcOp.getReg());
3596       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3597     }
3598   }
3599 
3600   int NumAddrRegs = AddrRegs.size();
3601   if (NumAddrRegs != 1) {
3602     // Round up to 8 elements for v5-v7
3603     // FIXME: Missing intermediate sized register classes and instructions.
3604     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3605       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3606       auto Undef = B.buildUndef(S32);
3607       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3608       NumAddrRegs = RoundedNumRegs;
3609     }
3610 
3611     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3612     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3613   }
3614 
3615   for (int I = 1; I != NumVAddrs; ++I) {
3616     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3617     if (SrcOp.isReg())
3618       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3619   }
3620 }
3621 
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
///
/// The instruction is modified in place (its descriptor is switched to
/// G_AMDGPU_INTRIN_IMAGE_LOAD / G_AMDGPU_INTRIN_IMAGE_STORE); the only case
/// where it is erased is a non-TFE load with a zero dmask, which is replaced
/// by an undef.
///
/// \returns false when the operation cannot be handled here: vector atomic
/// data, 16-bit addresses without subtarget support, a result type with fewer
/// elements than the dmask requires, more than 4 elements or dmask lanes, or
/// a TFE status result that is not s32. Returns true otherwise.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  // Two explicit defs means the intrinsic also returns the TFE status value.
  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  // Atomics have no dmask operand; -1 marks the index as invalid for them.
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Number of result components selected by the dmask (always 4 for gather4).
  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  // From here on MI is edited in place. The scope-exit object reports the
  // change to the observer on every return path below.
  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  // Address count used only for the NSA decision below; unlike NumVAddrs it
  // is also decremented when a zero mip 'lod' operand is replaced by an
  // immediate.
  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      // Both a zero and a negative constant lod take the _lz path.
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      // Without NSA all packed dwords are concatenated into one vector
      // register.
      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Store the packed registers in the leading address operands and set
      // the remaining register operands to $noreg.
      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }


  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    // Only vectors of s16 need their data operand repacked.
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  // Everything below deals with the result register(s) of loads.
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // The result type trimmed to the components the dmask enables (at least 1).
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  // Every slot starts out as Dst1Reg; the data slots are overwritten below, so
  // with TFE only the final slot keeps Dst1Reg and the unmerge defines it.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      // Unpacked layout: each s32 holds one component; truncate it to s16.
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of a single undef value of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    // Scalar pieces: one register per result element.
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  // <3 x s16> is not a whole number of <2 x s16> pieces, so pad to
  // <6 x s16> and unmerge that into two <3 x s16> halves; the second half is
  // defined into a fresh register that is not used further here.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
3991 
3992 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3993   MachineInstr &MI, MachineIRBuilder &B,
3994   GISelChangeObserver &Observer) const {
3995   Register Dst = MI.getOperand(0).getReg();
3996   LLT Ty = B.getMRI()->getType(Dst);
3997   unsigned Size = Ty.getSizeInBits();
3998   MachineFunction &MF = B.getMF();
3999 
4000   Observer.changingInstr(MI);
4001 
4002   // FIXME: We don't really need this intermediate instruction. The intrinsic
4003   // should be fixed to have a memory operand. Since it's readnone, we're not
4004   // allowed to add one.
4005   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4006   MI.RemoveOperand(1); // Remove intrinsic ID
4007 
4008   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4009   // TODO: Should this use datalayout alignment?
4010   const unsigned MemSize = (Size + 7) / 8;
4011   const Align MemAlign(4);
4012   MachineMemOperand *MMO = MF.getMachineMemOperand(
4013       MachinePointerInfo(),
4014       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4015           MachineMemOperand::MOInvariant,
4016       MemSize, MemAlign);
4017   MI.addMemOperand(MF, MMO);
4018 
4019   // There are no 96-bit result scalar loads, but widening to 128-bit should
4020   // always be legal. We may need to restore this to a 96-bit result if it turns
4021   // out this needs to be converted to a vector load during RegBankSelect.
4022   if (!isPowerOf2_32(Size)) {
4023     LegalizerHelper Helper(MF, *this, Observer, B);
4024     B.setInstr(MI);
4025 
4026     if (Ty.isVector())
4027       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4028     else
4029       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4030   }
4031 
4032   Observer.changedInstr(MI);
4033   return true;
4034 }
4035 
4036 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4037                                                 MachineRegisterInfo &MRI,
4038                                                 MachineIRBuilder &B) const {
4039   B.setInstr(MI);
4040 
4041   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4042   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4043       !ST.isTrapHandlerEnabled()) {
4044     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4045   } else {
4046     // Pass queue pointer to trap handler as input, and insert trap instruction
4047     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4048     const ArgDescriptor *Arg =
4049         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4050     if (!Arg)
4051       return false;
4052     MachineRegisterInfo &MRI = *B.getMRI();
4053     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4054     Register LiveIn = getLiveInRegister(
4055         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4056         /*InsertLiveInCopy=*/false);
4057     if (!loadInputValue(LiveIn, B, Arg))
4058       return false;
4059     B.buildCopy(SGPR01, LiveIn);
4060     B.buildInstr(AMDGPU::S_TRAP)
4061         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4062         .addReg(SGPR01, RegState::Implicit);
4063   }
4064 
4065   MI.eraseFromParent();
4066   return true;
4067 }
4068 
4069 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4070     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4071   B.setInstr(MI);
4072 
4073   // Is non-HSA path or trap-handler disabled? then, report a warning
4074   // accordingly
4075   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4076       !ST.isTrapHandlerEnabled()) {
4077     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4078                                      "debugtrap handler not supported",
4079                                      MI.getDebugLoc(), DS_Warning);
4080     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4081     Ctx.diagnose(NoTrap);
4082   } else {
4083     // Insert debug-trap instruction
4084     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4085   }
4086 
4087   MI.eraseFromParent();
4088   return true;
4089 }
4090 
// Custom legalization entry point for intrinsic instructions.
//
// Switches on the intrinsic ID of \p MI and either rewrites the instruction
// directly (control-flow intrinsics, kernarg_segment_ptr outside kernels,
// wavefrontsize) or forwards to a dedicated legalize* helper, returning that
// helper's result. Intrinsics with no case here and no image-dim intrinsic
// info are left untouched and reported as success (true).
//
// \p B is the builder used to emit replacement instructions; \p Observer is
// forwarded only to the helpers that take one (s_buffer_load and the image
// intrinsics).
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    // Br and UncondBrTarget are out-parameters filled in by verifyCFIntrinsic.
    // NOTE(review): the helper is defined outside this view; presumably Br is
    // the unconditional G_BR following the G_BRCOND (null on fallthrough) and
    // UncondBrTarget its destination block — confirm against the helper.
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      // Emit the pseudo at the position of the conditional branch, which is
      // erased below.
      B.setInstr(*BrCond);
      // Operand 1 of the intrinsic becomes the pseudo's def and operand 3 its
      // use.
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      // The pseudo branches to the former unconditional target; the
      // unconditional branch is pointed at the former conditional target
      // further down, i.e. the two targets are swapped.
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        // SI_ELSE additionally takes a trailing immediate, always 0 here.
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // Both the def and the use are constrained to the wave mask register
      // class before the original instructions are removed.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    // verifyCFIntrinsic did not return a conditional branch: report failure.
    return false;
  }
  case Intrinsic::amdgcn_loop: {
    // Same scheme as amdgcn_if/amdgcn_else above, but SI_LOOP has no def and
    // takes operand 2 of the intrinsic as its only register use.
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      // Retarget the existing unconditional branch, or create one if there
      // was none.
      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      // Reg was read from MI before the erase, so using it afterwards is fine.
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // The following intrinsics are each forwarded to legalizePreloadedArgIntrin
  // with the matching AMDGPUFunctionArgInfo value.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  // Address-space queries share one helper, parameterized by address space.
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Replace the intrinsic with a constant taken from the subtarget.
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  // Buffer stores and loads: raw and struct variants share a helper call.
  // NOTE(review): judging from which flag the *_format and tbuffer cases set,
  // the two trailing bools appear to be (typed, format) for
  // legalizeBufferStore but (format, typed) for legalizeBufferLoad — confirm
  // against the helper declarations before changing any of these calls.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All raw/struct buffer atomics go through one helper, which receives the
  // intrinsic ID to tell them apart.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  // The trailing bool is true for the increment and false for the decrement.
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // Image-dimension intrinsics are found by table lookup rather than by
    // individual case labels. Anything else is left as-is and reported as
    // success.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  // Every switch path above returns, so this is only a fallback.
  return true;
}
4291