1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
// Round the number of vector elements up to the next power of two.
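// E.g. a <3 x s16> becomes <4 x s16> and a <5 x s32> becomes <8 x s32>.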
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
// Round the scalar size in bits up to the next power of two.
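// E.g. s48 becomes s64 and s20 becomes s32.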
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
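// The type is no wider than MaxSize bits and its scalar (element) size is a
// multiple of 32 bits.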
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
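// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. <3 x s16>.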
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
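// Vectors of 16-bit elements with more than two elements, e.g. <4 x s16>.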
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
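// Mutation that pads the vector with one extra element, e.g. <3 x s16> becomes
// <4 x s16>.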
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
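// Mutation that splits the vector into pieces of at most 64 bits, e.g. <4 x s32>
// becomes <2 x s32>.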
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
// Increase the number of vector elements so the total size reaches at least the
// next multiple of 32 bits.
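// E.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits).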
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
// Scalars that are a multiple of 32 bits up to 1024 bits, and vectors of 32-,
// 64-, 128-, or 256-bit elements or an even number of 16-bit elements (multiples
// of v2s16).
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
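// Scalars wider than 32 bits where the memory access is narrower than the
// register type, i.e. a wide truncating store.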
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   :  ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
375     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378     .clampScalarOrElt(0, S32, S1024)
379     .legalIf(isMultiple32(0))
380     .widenScalarToNextPow2(0, 32)
381     .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to infinite loop
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 sizes before clamping is a workaround for a
      // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 sizes before clamping is a workaround for a
      // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
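  // Whether a memory access must be split: vector extloads, accesses wider than
  // the address space limit, register counts the hardware cannot handle in one
  // access, or under-aligned accesses the target does not permit.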
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird sized loads that don't evenly divide into the access sizes
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
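  // Widen an odd-sized load result to the next power of two when the alignment
  // already covers the rounded-up size and the result still fits the address
  // space limit.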
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
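              // The remaining reason to split is insufficient alignment; break
              // the access down to an alignment-sized piece.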
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1070     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1071     .scalarize(1)
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S32}, {S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts
1091     Shifts.clampScalar(1, S32, S32);
1092     Shifts.clampScalar(0, S16, S64);
1093     Shifts.widenScalarToNextPow2(0, 16);
1094   } else {
1095     // Make sure we legalize the shift amount type first, as the general
1096     // expansion for the shifted type will produce much worse code if it hasn't
1097     // been truncated already.
1098     Shifts.clampScalar(1, S32, S32);
1099     Shifts.clampScalar(0, S32, S64);
1100     Shifts.widenScalarToNextPow2(0, 32);
1101   }
1102   Shifts.scalarize(0);
1103 
1104   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1105     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1106     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1107     unsigned IdxTypeIdx = 2;
1108 
1109     getActionDefinitionsBuilder(Op)
1110       .customIf([=](const LegalityQuery &Query) {
1111           const LLT EltTy = Query.Types[EltTypeIdx];
1112           const LLT VecTy = Query.Types[VecTypeIdx];
1113           const LLT IdxTy = Query.Types[IdxTypeIdx];
1114           return (EltTy.getSizeInBits() == 16 ||
1115                   EltTy.getSizeInBits() % 32 == 0) &&
1116                  VecTy.getSizeInBits() % 32 == 0 &&
1117                  VecTy.getSizeInBits() <= 1024 &&
1118                  IdxTy.getSizeInBits() == 32;
1119         })
1120       .clampScalar(EltTypeIdx, S32, S64)
1121       .clampScalar(VecTypeIdx, S32, S64)
1122       .clampScalar(IdxTypeIdx, S32, S32);
1123   }
1124 
1125   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1126     .unsupportedIf([=](const LegalityQuery &Query) {
1127         const LLT &EltTy = Query.Types[1].getElementType();
1128         return Query.Types[0] != EltTy;
1129       });
1130 
1131   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1132     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1133     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1134 
1135     // FIXME: Doesn't handle extract of illegal sizes.
1136     getActionDefinitionsBuilder(Op)
1137       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1138       // FIXME: Multiples of 16 should not be legal.
1139       .legalIf([=](const LegalityQuery &Query) {
1140           const LLT BigTy = Query.Types[BigTyIdx];
1141           const LLT LitTy = Query.Types[LitTyIdx];
1142           return (BigTy.getSizeInBits() % 32 == 0) &&
1143                  (LitTy.getSizeInBits() % 16 == 0);
1144         })
1145       .widenScalarIf(
1146         [=](const LegalityQuery &Query) {
1147           const LLT BigTy = Query.Types[BigTyIdx];
1148           return (BigTy.getScalarSizeInBits() < 16);
1149         },
1150         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1151       .widenScalarIf(
1152         [=](const LegalityQuery &Query) {
1153           const LLT LitTy = Query.Types[LitTyIdx];
1154           return (LitTy.getScalarSizeInBits() < 16);
1155         },
1156         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1157       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1158       .widenScalarToNextPow2(BigTyIdx, 32);
1159 
1160   }
1161 
1162   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163     .legalForCartesianProduct(AllS32Vectors, {S32})
1164     .legalForCartesianProduct(AllS64Vectors, {S64})
1165     .clampNumElements(0, V16S32, V32S32)
1166     .clampNumElements(0, V2S64, V16S64)
1167     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1168 
1169   if (ST.hasScalarPackInsts()) {
1170     BuildVector
1171       // FIXME: Should probably widen s1 vectors straight to s32
1172       .minScalarOrElt(0, S16)
1173       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1174       .minScalar(1, S32);
1175 
1176     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1177       .legalFor({V2S16, S32})
1178       .lower();
1179     BuildVector.minScalarOrElt(0, S32);
1180   } else {
1181     BuildVector.customFor({V2S16, S16});
1182     BuildVector.minScalarOrElt(0, S32);
1183 
1184     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1185       .customFor({V2S16, S32})
1186       .lower();
1187   }
1188 
1189   BuildVector.legalIf(isRegisterType(0));
1190 
1191   // FIXME: Clamp maximum size
1192   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1193     .legalIf(isRegisterType(0));
1194 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1197   if (ST.hasVOP3PInsts()) {
1198     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1199       .customFor({V2S16, V2S16})
1200       .lower();
1201   } else
1202     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1203 
1204   // Merge/Unmerge
1205   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1206     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1207     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1208 
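    // Vector element types smaller than 8 bits, larger than 512 bits, or not a
    // power of two have to be broken up.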
1209     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1210       const LLT Ty = Query.Types[TypeIdx];
1211       if (Ty.isVector()) {
1212         const LLT &EltTy = Ty.getElementType();
1213         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1214           return true;
1215         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1216           return true;
1217       }
1218       return false;
1219     };
1220 
1221     auto &Builder = getActionDefinitionsBuilder(Op)
1222       // Try to widen to s16 first for small types.
1223       // TODO: Only do this on targets with legal s16 shifts
1224       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1225 
1226       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1227       .lowerFor({{S16, V2S16}})
1228       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1229       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1230                            elementTypeIs(1, S16)),
1231                        changeTo(1, V2S16))
1232       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1233       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1234       // valid.
1235       .clampScalar(LitTyIdx, S32, S512)
1236       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1237       // Break up vectors with weird elements into scalars
1238       .fewerElementsIf(
1239         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1240         scalarize(0))
1241       .fewerElementsIf(
1242         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1243         scalarize(1))
1244       .clampScalar(BigTyIdx, S32, S1024);
1245 
1246     if (Op == G_MERGE_VALUES) {
1247       Builder.widenScalarIf(
1248         // TODO: Use 16-bit shifts if legal for 8-bit values?
1249         [=](const LegalityQuery &Query) {
1250           const LLT Ty = Query.Types[LitTyIdx];
1251           return Ty.getSizeInBits() < 32;
1252         },
1253         changeTo(LitTyIdx, S32));
1254     }
1255 
1256     Builder.widenScalarIf(
1257       [=](const LegalityQuery &Query) {
1258         const LLT Ty = Query.Types[BigTyIdx];
1259         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1260           Ty.getSizeInBits() % 16 != 0;
1261       },
1262       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 if the size is over 128
        // bits, whichever is smaller.
1265         const LLT &Ty = Query.Types[BigTyIdx];
1266         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1267         if (NewSizeInBits >= 256) {
1268           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1269           if (RoundedTo < NewSizeInBits)
1270             NewSizeInBits = RoundedTo;
1271         }
1272         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1273       })
1274       .legalIf([=](const LegalityQuery &Query) {
1275           const LLT &BigTy = Query.Types[BigTyIdx];
1276           const LLT &LitTy = Query.Types[LitTyIdx];
1277 
1278           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1279             return false;
1280           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1281             return false;
1282 
1283           return BigTy.getSizeInBits() % 16 == 0 &&
1284                  LitTy.getSizeInBits() % 16 == 0 &&
1285                  BigTy.getSizeInBits() <= 1024;
1286         })
1287       // Any vectors left are the wrong size. Scalarize them.
1288       .scalarize(0)
1289       .scalarize(1);
1290   }
1291 
1292   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1293   // RegBankSelect.
1294   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1295     .legalFor({{S32}, {S64}});
1296 
1297   if (ST.hasVOP3PInsts()) {
1298     SextInReg.lowerFor({{V2S16}})
1299       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1300       // get more vector shift opportunities, since we'll get those when
1301       // expanded.
1302       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1303   } else if (ST.has16BitInsts()) {
1304     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1305   } else {
1306     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1308     SextInReg.lowerFor({{S32}, {S64}});
1309   }
1310 
1311   SextInReg
1312     .scalarize(0)
1313     .clampScalar(0, S32, S64)
1314     .lower();
1315 
1316   getActionDefinitionsBuilder(G_FSHR)
1317     .legalFor({{S32, S32}})
1318     .scalarize(0)
1319     .lower();
1320 
1321   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1322     .legalFor({S64});
1323 
1324   getActionDefinitionsBuilder({
1325       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1326       G_FCOPYSIGN,
1327 
1328       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1329       G_READ_REGISTER,
1330       G_WRITE_REGISTER,
1331 
1332       G_SADDO, G_SSUBO,
1333 
1334        // TODO: Implement
1335       G_FMINIMUM, G_FMAXIMUM,
1336       G_FSHL
1337     }).lower();
1338 
1339   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1340         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1341         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1342     .unsupported();
1343 
1344   computeTables();
1345   verify(*ST.getInstrInfo());
1346 }
1347 
1348 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1349                                          MachineRegisterInfo &MRI,
1350                                          MachineIRBuilder &B,
1351                                          GISelChangeObserver &Observer) const {
1352   switch (MI.getOpcode()) {
1353   case TargetOpcode::G_ADDRSPACE_CAST:
1354     return legalizeAddrSpaceCast(MI, MRI, B);
1355   case TargetOpcode::G_FRINT:
1356     return legalizeFrint(MI, MRI, B);
1357   case TargetOpcode::G_FCEIL:
1358     return legalizeFceil(MI, MRI, B);
1359   case TargetOpcode::G_INTRINSIC_TRUNC:
1360     return legalizeIntrinsicTrunc(MI, MRI, B);
1361   case TargetOpcode::G_SITOFP:
1362     return legalizeITOFP(MI, MRI, B, true);
1363   case TargetOpcode::G_UITOFP:
1364     return legalizeITOFP(MI, MRI, B, false);
1365   case TargetOpcode::G_FPTOSI:
1366     return legalizeFPTOI(MI, MRI, B, true);
1367   case TargetOpcode::G_FPTOUI:
1368     return legalizeFPTOI(MI, MRI, B, false);
1369   case TargetOpcode::G_FMINNUM:
1370   case TargetOpcode::G_FMAXNUM:
1371   case TargetOpcode::G_FMINNUM_IEEE:
1372   case TargetOpcode::G_FMAXNUM_IEEE:
1373     return legalizeMinNumMaxNum(MI, MRI, B);
1374   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1375     return legalizeExtractVectorElt(MI, MRI, B);
1376   case TargetOpcode::G_INSERT_VECTOR_ELT:
1377     return legalizeInsertVectorElt(MI, MRI, B);
1378   case TargetOpcode::G_SHUFFLE_VECTOR:
1379     return legalizeShuffleVector(MI, MRI, B);
1380   case TargetOpcode::G_FSIN:
1381   case TargetOpcode::G_FCOS:
1382     return legalizeSinCos(MI, MRI, B);
1383   case TargetOpcode::G_GLOBAL_VALUE:
1384     return legalizeGlobalValue(MI, MRI, B);
1385   case TargetOpcode::G_LOAD:
1386     return legalizeLoad(MI, MRI, B, Observer);
1387   case TargetOpcode::G_FMAD:
1388     return legalizeFMad(MI, MRI, B);
1389   case TargetOpcode::G_FDIV:
1390     return legalizeFDIV(MI, MRI, B);
1391   case TargetOpcode::G_UDIV:
1392   case TargetOpcode::G_UREM:
1393     return legalizeUDIV_UREM(MI, MRI, B);
1394   case TargetOpcode::G_SDIV:
1395   case TargetOpcode::G_SREM:
1396     return legalizeSDIV_SREM(MI, MRI, B);
1397   case TargetOpcode::G_ATOMIC_CMPXCHG:
1398     return legalizeAtomicCmpXChg(MI, MRI, B);
1399   case TargetOpcode::G_FLOG:
1400     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1401   case TargetOpcode::G_FLOG10:
1402     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1403   case TargetOpcode::G_FEXP:
1404     return legalizeFExp(MI, B);
1405   case TargetOpcode::G_FPOW:
1406     return legalizeFPow(MI, B);
1407   case TargetOpcode::G_FFLOOR:
1408     return legalizeFFloor(MI, MRI, B);
1409   case TargetOpcode::G_BUILD_VECTOR:
1410     return legalizeBuildVector(MI, MRI, B);
1411   default:
1412     return false;
1413   }
1414 
1415   llvm_unreachable("expected switch to return");
1416 }
1417 
1418 Register AMDGPULegalizerInfo::getSegmentAperture(
1419   unsigned AS,
1420   MachineRegisterInfo &MRI,
1421   MachineIRBuilder &B) const {
1422   MachineFunction &MF = B.getMF();
1423   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1424   const LLT S32 = LLT::scalar(32);
1425 
1426   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1427 
1428   if (ST.hasApertureRegs()) {
1429     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1430     // getreg.
1431     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1432         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1433         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1434     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1435         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1436         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1437     unsigned Encoding =
1438         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1439         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1440         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1441 
1442     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1443 
1444     B.buildInstr(AMDGPU::S_GETREG_B32)
1445       .addDef(GetReg)
1446       .addImm(Encoding);
1447     MRI.setType(GetReg, S32);
1448 
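    // The hwreg field read here holds the aperture base already shifted right
    // by the field width, so shifting the result back left by WidthM1 + 1
    // reconstructs the 32-bit aperture, i.e. the high half of the 64-bit
    // segment address.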
1449     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1450     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1451   }
1452 
1453   Register QueuePtr = MRI.createGenericVirtualRegister(
1454     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1455 
1456   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1457   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1458     return Register();
1459 
1460   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1461   // private_segment_aperture_base_hi.
1462   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1463 
1464   // TODO: can we be smarter about machine pointer info?
1465   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1466   MachineMemOperand *MMO = MF.getMachineMemOperand(
1467       PtrInfo,
1468       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1469           MachineMemOperand::MOInvariant,
1470       4, commonAlignment(Align(64), StructOffset));
1471 
1472   Register LoadAddr;
1473 
1474   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1475   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1476 }
1477 
1478 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1479   MachineInstr &MI, MachineRegisterInfo &MRI,
1480   MachineIRBuilder &B) const {
1481   MachineFunction &MF = B.getMF();
1482 
1483   B.setInstr(MI);
1484 
1485   const LLT S32 = LLT::scalar(32);
1486   Register Dst = MI.getOperand(0).getReg();
1487   Register Src = MI.getOperand(1).getReg();
1488 
1489   LLT DstTy = MRI.getType(Dst);
1490   LLT SrcTy = MRI.getType(Src);
1491   unsigned DestAS = DstTy.getAddressSpace();
1492   unsigned SrcAS = SrcTy.getAddressSpace();
1493 
1494   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1495   // vector element.
1496   assert(!DstTy.isVector());
1497 
1498   const AMDGPUTargetMachine &TM
1499     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1500 
1501   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1502   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1503     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1504     return true;
1505   }
1506 
1507   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1508     // Truncate.
1509     B.buildExtract(Dst, Src, 0);
1510     MI.eraseFromParent();
1511     return true;
1512   }
1513 
1514   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1515     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1516     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1517 
    // FIXME: This is a bit ugly: we merge two 32-bit pointers to build the
    // 64-bit result. Merge operands are required to be the same type, but
    // inserting an extra ptrtoint here would be fairly pointless.
1521     auto HighAddr = B.buildConstant(
1522       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1523     B.buildMerge(Dst, {Src, HighAddr});
1524     MI.eraseFromParent();
1525     return true;
1526   }
1527 
1528   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1529     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1530            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1531     unsigned NullVal = TM.getNullPointerValue(DestAS);
1532 
1533     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1534     auto FlatNull = B.buildConstant(SrcTy, 0);
1535 
1536     // Extract low 32-bits of the pointer.
1537     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1538 
1539     auto CmpRes =
1540         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1541     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1542 
1543     MI.eraseFromParent();
1544     return true;
1545   }
1546 
1547   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1548     return false;
1549 
1550   if (!ST.hasFlatAddressSpace())
1551     return false;
1552 
1553   auto SegmentNull =
1554       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1555   auto FlatNull =
1556       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1557 
1558   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1559   if (!ApertureReg.isValid())
1560     return false;
1561 
1562   auto CmpRes =
1563       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1564 
1565   // Coerce the type of the low half of the result so we can use merge_values.
1566   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1567 
1568   // TODO: Should we allow mismatched types but matching sizes in merges to
1569   // avoid the ptrtoint?
1570   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1571   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1572 
1573   MI.eraseFromParent();
1574   return true;
1575 }
1576 
1577 bool AMDGPULegalizerInfo::legalizeFrint(
1578   MachineInstr &MI, MachineRegisterInfo &MRI,
1579   MachineIRBuilder &B) const {
1580   B.setInstr(MI);
1581 
1582   Register Src = MI.getOperand(1).getReg();
1583   LLT Ty = MRI.getType(Src);
1584   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1585 
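  // C1 = 2^52: adding and then subtracting a copysign'd 2^52 rounds the value
  // to an integer in the current rounding mode, since doubles of that
  // magnitude have no fractional bits. C2 is the largest double below 2^52;
  // anything with |src| > C2 is already integral and the final select returns
  // it unchanged.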
1586   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1587   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1588 
1589   auto C1 = B.buildFConstant(Ty, C1Val);
1590   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1591 
1592   // TODO: Should this propagate fast-math-flags?
1593   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1594   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1595 
1596   auto C2 = B.buildFConstant(Ty, C2Val);
1597   auto Fabs = B.buildFAbs(Ty, Src);
1598 
1599   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1600   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1601   return true;
1602 }
1603 
1604 bool AMDGPULegalizerInfo::legalizeFceil(
1605   MachineInstr &MI, MachineRegisterInfo &MRI,
1606   MachineIRBuilder &B) const {
1607   B.setInstr(MI);
1608 
1609   const LLT S1 = LLT::scalar(1);
1610   const LLT S64 = LLT::scalar(64);
1611 
1612   Register Src = MI.getOperand(1).getReg();
1613   assert(MRI.getType(Src) == S64);
1614 
1615   // result = trunc(src)
1616   // if (src > 0.0 && src != result)
1617   //   result += 1.0
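  //
  // e.g. ceil(1.25): trunc = 1.0, 1.25 > 0.0 and 1.25 != 1.0, so add 1.0 -> 2.0
  //      ceil(-1.25): trunc = -1.0, the > 0.0 test fails, result stays -1.0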
1618 
1619   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1620 
1621   const auto Zero = B.buildFConstant(S64, 0.0);
1622   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1626   auto Add = B.buildSelect(S64, And, One, Zero);
1627 
1628   // TODO: Should this propagate fast-math-flags?
1629   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1630   return true;
1631 }
1632 
1633 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1634                                               MachineIRBuilder &B) {
1635   const unsigned FractBits = 52;
1636   const unsigned ExpBits = 11;
1637   LLT S32 = LLT::scalar(32);
1638 
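  // The exponent occupies bits [62:52] of the f64, i.e. bits [30:20] of its
  // high 32-bit word: extract ExpBits (11) bits starting at FractBits - 32
  // (20) with ubfe, then subtract the exponent bias (1023).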
1639   auto Const0 = B.buildConstant(S32, FractBits - 32);
1640   auto Const1 = B.buildConstant(S32, ExpBits);
1641 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1645 
1646   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1647 }
1648 
1649 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1650   MachineInstr &MI, MachineRegisterInfo &MRI,
1651   MachineIRBuilder &B) const {
1652   B.setInstr(MI);
1653 
1654   const LLT S1 = LLT::scalar(1);
1655   const LLT S32 = LLT::scalar(32);
1656   const LLT S64 = LLT::scalar(64);
1657 
1658   Register Src = MI.getOperand(1).getReg();
1659   assert(MRI.getType(Src) == S64);
1660 
1661   // TODO: Should this use extract since the low half is unused?
1662   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1663   Register Hi = Unmerge.getReg(1);
1664 
1665   // Extract the upper half, since this is where we will find the sign and
1666   // exponent.
1667   auto Exp = extractF64Exponent(Hi, B);
1668 
1669   const unsigned FractBits = 52;
1670 
1671   // Extract the sign bit.
1672   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1673   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1674 
1675   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1676 
1677   const auto Zero32 = B.buildConstant(S32, 0);
1678 
1679   // Extend back to 64-bits.
1680   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1681 
1682   auto Shr = B.buildAShr(S64, FractMask, Exp);
1683   auto Not = B.buildNot(S64, Shr);
1684   auto Tmp0 = B.buildAnd(S64, Src, Not);
1685   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1686 
1687   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1688   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1689 
1690   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1691   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1692   return true;
1693 }
1694 
1695 bool AMDGPULegalizerInfo::legalizeITOFP(
1696   MachineInstr &MI, MachineRegisterInfo &MRI,
1697   MachineIRBuilder &B, bool Signed) const {
1698   B.setInstr(MI);
1699 
1700   Register Dst = MI.getOperand(0).getReg();
1701   Register Src = MI.getOperand(1).getReg();
1702 
1703   const LLT S64 = LLT::scalar(64);
1704   const LLT S32 = LLT::scalar(32);
1705 
1706   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1707 
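  // Convert as fp(hi) * 2^32 + fp(lo): the high half is converted with the
  // signedness of the original operation and scaled by 2^32 with ldexp; the
  // low half is always converted unsigned and added in.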
1708   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1709 
1710   auto CvtHi = Signed ?
1711     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1712     B.buildUITOFP(S64, Unmerge.getReg(1));
1713 
1714   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1715 
1716   auto ThirtyTwo = B.buildConstant(S32, 32);
1717   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1718     .addUse(CvtHi.getReg(0))
1719     .addUse(ThirtyTwo.getReg(0));
1720 
1721   // TODO: Should this propagate fast-math-flags?
1722   B.buildFAdd(Dst, LdExp, CvtLo);
1723   MI.eraseFromParent();
1724   return true;
1725 }
1726 
1727 // TODO: Copied from DAG implementation. Verify logic and document how this
1728 // actually works.
1729 bool AMDGPULegalizerInfo::legalizeFPTOI(
1730   MachineInstr &MI, MachineRegisterInfo &MRI,
1731   MachineIRBuilder &B, bool Signed) const {
1732   B.setInstr(MI);
1733 
1734   Register Dst = MI.getOperand(0).getReg();
1735   Register Src = MI.getOperand(1).getReg();
1736 
1737   const LLT S64 = LLT::scalar(64);
1738   const LLT S32 = LLT::scalar(32);
1739 
1740   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1741 
1742   unsigned Flags = MI.getFlags();
1743 
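  // K0 = 2^-32 and K1 = -2^32. FloorMul = floor(trunc(x) * 2^-32) becomes the
  // high 32-bit word of the result, and Fma = trunc(x) - FloorMul * 2^32
  // recovers the (always non-negative) low 32-bit word, which is converted
  // unsigned.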
1744   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1745   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1746   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1747 
1748   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1749   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1750   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1751 
1752   auto Hi = Signed ?
1753     B.buildFPTOSI(S32, FloorMul) :
1754     B.buildFPTOUI(S32, FloorMul);
1755   auto Lo = B.buildFPTOUI(S32, Fma);
1756 
1757   B.buildMerge(Dst, { Lo, Hi });
1758   MI.eraseFromParent();
1759 
1760   return true;
1761 }
1762 
1763 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1764   MachineInstr &MI, MachineRegisterInfo &MRI,
1765   MachineIRBuilder &B) const {
1766   MachineFunction &MF = B.getMF();
1767   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1768 
1769   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1770                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1771 
1772   // With ieee_mode disabled, the instructions have the correct behavior
1773   // already for G_FMINNUM/G_FMAXNUM
1774   if (!MFI->getMode().IEEE)
1775     return !IsIEEEOp;
1776 
1777   if (IsIEEEOp)
1778     return true;
1779 
1780   MachineIRBuilder HelperBuilder(MI);
1781   GISelObserverWrapper DummyObserver;
1782   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1783   HelperBuilder.setInstr(MI);
1784   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1785 }
1786 
1787 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1788   MachineInstr &MI, MachineRegisterInfo &MRI,
1789   MachineIRBuilder &B) const {
1790   // TODO: Should move some of this into LegalizerHelper.
1791 
1792   // TODO: Promote dynamic indexing of s16 to s32
1793 
1794   // FIXME: Artifact combiner probably should have replaced the truncated
1795   // constant before this, so we shouldn't need
1796   // getConstantVRegValWithLookThrough.
1797   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1798     MI.getOperand(2).getReg(), MRI);
1799   if (!IdxVal) // Dynamic case will be selected to register indexing.
1800     return true;
1801 
1802   Register Dst = MI.getOperand(0).getReg();
1803   Register Vec = MI.getOperand(1).getReg();
1804 
1805   LLT VecTy = MRI.getType(Vec);
1806   LLT EltTy = VecTy.getElementType();
1807   assert(EltTy == MRI.getType(Dst));
1808 
1809   B.setInstr(MI);
1810 
1811   if (IdxVal->Value < VecTy.getNumElements())
1812     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1813   else
1814     B.buildUndef(Dst);
1815 
1816   MI.eraseFromParent();
1817   return true;
1818 }
1819 
1820 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   // TODO: Should move some of this into LegalizerHelper.
1824 
1825   // TODO: Promote dynamic indexing of s16 to s32
1826 
1827   // FIXME: Artifact combiner probably should have replaced the truncated
1828   // constant before this, so we shouldn't need
1829   // getConstantVRegValWithLookThrough.
1830   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1831     MI.getOperand(3).getReg(), MRI);
1832   if (!IdxVal) // Dynamic case will be selected to register indexing.
1833     return true;
1834 
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Vec = MI.getOperand(1).getReg();
1837   Register Ins = MI.getOperand(2).getReg();
1838 
1839   LLT VecTy = MRI.getType(Vec);
1840   LLT EltTy = VecTy.getElementType();
1841   assert(EltTy == MRI.getType(Ins));
1842 
1843   B.setInstr(MI);
1844 
1845   if (IdxVal->Value < VecTy.getNumElements())
1846     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1847   else
1848     B.buildUndef(Dst);
1849 
1850   MI.eraseFromParent();
1851   return true;
1852 }
1853 
1854 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1855   MachineInstr &MI, MachineRegisterInfo &MRI,
1856   MachineIRBuilder &B) const {
1857   const LLT V2S16 = LLT::vector(2, 16);
1858 
1859   Register Dst = MI.getOperand(0).getReg();
1860   Register Src0 = MI.getOperand(1).getReg();
1861   LLT DstTy = MRI.getType(Dst);
1862   LLT SrcTy = MRI.getType(Src0);
1863 
1864   if (SrcTy == V2S16 && DstTy == V2S16 &&
1865       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1866     return true;
1867 
1868   MachineIRBuilder HelperBuilder(MI);
1869   GISelObserverWrapper DummyObserver;
1870   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1871   HelperBuilder.setInstr(MI);
1872   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1873 }
1874 
1875 bool AMDGPULegalizerInfo::legalizeSinCos(
1876   MachineInstr &MI, MachineRegisterInfo &MRI,
1877   MachineIRBuilder &B) const {
1878   B.setInstr(MI);
1879 
1880   Register DstReg = MI.getOperand(0).getReg();
1881   Register SrcReg = MI.getOperand(1).getReg();
1882   LLT Ty = MRI.getType(DstReg);
1883   unsigned Flags = MI.getFlags();
1884 
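  // The sin/cos intrinsics take their input already divided by 2*pi, hence
  // the scale by 0.5/PI. Subtargets with a reduced valid input range also
  // need the scaled value wrapped into [0, 1) with amdgcn.fract first.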
1885   Register TrigVal;
1886   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1887   if (ST.hasTrigReducedRange()) {
1888     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1889     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1890       .addUse(MulVal.getReg(0))
1891       .setMIFlags(Flags).getReg(0);
1892   } else
1893     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1894 
1895   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1896     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1897   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1898     .addUse(TrigVal)
1899     .setMIFlags(Flags);
1900   MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1905   Register DstReg, LLT PtrTy,
1906   MachineIRBuilder &B, const GlobalValue *GV,
1907   unsigned Offset, unsigned GAFlags) const {
1908   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1909   // to the following code sequence:
1910   //
1911   // For constant address space:
1912   //   s_getpc_b64 s[0:1]
1913   //   s_add_u32 s0, s0, $symbol
1914   //   s_addc_u32 s1, s1, 0
1915   //
1916   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1917   //   a fixup or relocation is emitted to replace $symbol with a literal
1918   //   constant, which is a pc-relative offset from the encoding of the $symbol
1919   //   operand to the global variable.
1920   //
1921   // For global address space:
1922   //   s_getpc_b64 s[0:1]
1923   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1924   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1925   //
1926   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1927   //   fixups or relocations are emitted to replace $symbol@*@lo and
1928   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1929   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1930   //   operand to the global variable.
1931   //
1932   // What we want here is an offset from the value returned by s_getpc
1933   // (which is the address of the s_add_u32 instruction) to the global
1934   // variable, but since the encoding of $symbol starts 4 bytes after the start
1935   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1936   // small. This requires us to add 4 to the global variable offset in order to
1937   // compute the correct address.
1938 
1939   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1940 
1941   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1942     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1943 
1944   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1945     .addDef(PCReg);
1946 
1947   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1948   if (GAFlags == SIInstrInfo::MO_NONE)
1949     MIB.addImm(0);
1950   else
1951     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1952 
1953   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1954 
1955   if (PtrTy.getSizeInBits() == 32)
1956     B.buildExtract(DstReg, PCReg, 0);
1957   return true;
}
1959 
1960 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1961   MachineInstr &MI, MachineRegisterInfo &MRI,
1962   MachineIRBuilder &B) const {
1963   Register DstReg = MI.getOperand(0).getReg();
1964   LLT Ty = MRI.getType(DstReg);
1965   unsigned AS = Ty.getAddressSpace();
1966 
1967   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1968   MachineFunction &MF = B.getMF();
1969   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1970   B.setInstr(MI);
1971 
1972   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1973     if (!MFI->isEntryFunction()) {
1974       const Function &Fn = MF.getFunction();
1975       DiagnosticInfoUnsupported BadLDSDecl(
1976         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1977         DS_Warning);
1978       Fn.getContext().diagnose(BadLDSDecl);
1979 
1980       // We currently don't have a way to correctly allocate LDS objects that
1981       // aren't directly associated with a kernel. We do force inlining of
1982       // functions that use local objects. However, if these dead functions are
1983       // not eliminated, we don't want a compile time error. Just emit a warning
1984       // and a trap, since there should be no callable path here.
1985       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1986       B.buildUndef(DstReg);
1987       MI.eraseFromParent();
1988       return true;
1989     }
1990 
1991     // TODO: We could emit code to handle the initialization somewhere.
1992     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1993       const SITargetLowering *TLI = ST.getTargetLowering();
1994       if (!TLI->shouldUseLDSConstAddress(GV)) {
1995         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1997       }
1998 
1999       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2000       MI.eraseFromParent();
2001       return true;
2002     }
2003 
2004     const Function &Fn = MF.getFunction();
2005     DiagnosticInfoUnsupported BadInit(
2006       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2007     Fn.getContext().diagnose(BadInit);
2008     return true;
2009   }
2010 
2011   const SITargetLowering *TLI = ST.getTargetLowering();
2012 
2013   if (TLI->shouldEmitFixup(GV)) {
2014     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2015     MI.eraseFromParent();
2016     return true;
2017   }
2018 
2019   if (TLI->shouldEmitPCReloc(GV)) {
2020     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2021     MI.eraseFromParent();
2022     return true;
2023   }
2024 
2025   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2026   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2027 
2028   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2029       MachinePointerInfo::getGOT(MF),
2030       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2031           MachineMemOperand::MOInvariant,
2032       8 /*Size*/, Align(8));
2033 
2034   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2035 
2036   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2038     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2039     B.buildExtract(DstReg, Load, 0);
2040   } else
2041     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2042 
2043   MI.eraseFromParent();
2044   return true;
2045 }
2046 
2047 bool AMDGPULegalizerInfo::legalizeLoad(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2050   B.setInstr(MI);
2051   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2052   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2053   Observer.changingInstr(MI);
2054   MI.getOperand(1).setReg(Cast.getReg(0));
2055   Observer.changedInstr(MI);
2056   return true;
2057 }
2058 
2059 bool AMDGPULegalizerInfo::legalizeFMad(
2060   MachineInstr &MI, MachineRegisterInfo &MRI,
2061   MachineIRBuilder &B) const {
2062   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2063   assert(Ty.isScalar());
2064 
2065   MachineFunction &MF = B.getMF();
2066   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2067 
2068   // TODO: Always legal with future ftz flag.
2069   // FIXME: Do we need just output?
2070   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2071     return true;
2072   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2073     return true;
2074 
2075   MachineIRBuilder HelperBuilder(MI);
2076   GISelObserverWrapper DummyObserver;
2077   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2078   HelperBuilder.setInstr(MI);
2079   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2083   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2084   Register DstReg = MI.getOperand(0).getReg();
2085   Register PtrReg = MI.getOperand(1).getReg();
2086   Register CmpVal = MI.getOperand(2).getReg();
2087   Register NewVal = MI.getOperand(3).getReg();
2088 
2089   assert(SITargetLowering::isFlatGlobalAddrSpace(
2090            MRI.getType(PtrReg).getAddressSpace()) &&
2091          "this should not have been custom lowered");
2092 
2093   LLT ValTy = MRI.getType(CmpVal);
2094   LLT VecTy = LLT::vector(2, ValTy);
2095 
2096   B.setInstr(MI);
2097   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2098 
2099   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2100     .addDef(DstReg)
2101     .addUse(PtrReg)
2102     .addUse(PackedVal)
2103     .setMemRefs(MI.memoperands());
2104 
2105   MI.eraseFromParent();
2106   return true;
2107 }
2108 
2109 bool AMDGPULegalizerInfo::legalizeFlog(
2110   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2111   Register Dst = MI.getOperand(0).getReg();
2112   Register Src = MI.getOperand(1).getReg();
2113   LLT Ty = B.getMRI()->getType(Dst);
2114   unsigned Flags = MI.getFlags();
2115   B.setInstr(MI);
2116 
2117   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2118   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2119 
2120   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2121   MI.eraseFromParent();
2122   return true;
2123 }
2124 
2125 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2126                                        MachineIRBuilder &B) const {
2127   Register Dst = MI.getOperand(0).getReg();
2128   Register Src = MI.getOperand(1).getReg();
2129   unsigned Flags = MI.getFlags();
2130   LLT Ty = B.getMRI()->getType(Dst);
2131   B.setInstr(MI);
2132 
2133   auto K = B.buildFConstant(Ty, numbers::log2e);
2134   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2135   B.buildFExp2(Dst, Mul, Flags);
2136   MI.eraseFromParent();
2137   return true;
2138 }
2139 
2140 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2141                                        MachineIRBuilder &B) const {
2142   Register Dst = MI.getOperand(0).getReg();
2143   Register Src0 = MI.getOperand(1).getReg();
2144   Register Src1 = MI.getOperand(2).getReg();
2145   unsigned Flags = MI.getFlags();
2146   LLT Ty = B.getMRI()->getType(Dst);
2147   B.setInstr(MI);
2148   const LLT S16 = LLT::scalar(16);
2149   const LLT S32 = LLT::scalar(32);
2150 
2151   if (Ty == S32) {
2152     auto Log = B.buildFLog2(S32, Src0, Flags);
2153     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2154       .addUse(Log.getReg(0))
2155       .addUse(Src1)
2156       .setMIFlags(Flags);
2157     B.buildFExp2(Dst, Mul, Flags);
2158   } else if (Ty == S16) {
2159     // There's no f16 fmul_legacy, so we need to convert for it.
2160     auto Log = B.buildFLog2(S16, Src0, Flags);
2161     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2162     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2163     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2164       .addUse(Ext0.getReg(0))
2165       .addUse(Ext1.getReg(0))
2166       .setMIFlags(Flags);
2167 
2168     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2169   } else
2170     return false;
2171 
2172   MI.eraseFromParent();
2173   return true;
2174 }
2175 
2176 // Find a source register, ignoring any possible source modifiers.
2177 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2178   Register ModSrc = OrigSrc;
2179   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2180     ModSrc = SrcFNeg->getOperand(1).getReg();
2181     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2182       ModSrc = SrcFAbs->getOperand(1).getReg();
2183   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2184     ModSrc = SrcFAbs->getOperand(1).getReg();
2185   return ModSrc;
2186 }
2187 
2188 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2189                                          MachineRegisterInfo &MRI,
2190                                          MachineIRBuilder &B) const {
2191   B.setInstr(MI);
2192 
2193   const LLT S1 = LLT::scalar(1);
2194   const LLT S64 = LLT::scalar(64);
2195   Register Dst = MI.getOperand(0).getReg();
2196   Register OrigSrc = MI.getOperand(1).getReg();
2197   unsigned Flags = MI.getFlags();
2198   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2199          "this should not have been custom lowered");
2200 
2201   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2202   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2203   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2204   // V_FRACT bug is:
2205   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2206   //
2207   // Convert floor(x) to (x - fract(x))
2208 
2209   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2210     .addUse(OrigSrc)
2211     .setMIFlags(Flags);
2212 
2213   // Give source modifier matching some assistance before obscuring a foldable
2214   // pattern.
2215 
2216   // TODO: We can avoid the neg on the fract? The input sign to fract
2217   // shouldn't matter?
2218   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2219 
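  // 0x3fefffffffffffff is the largest double strictly less than 1.0, the
  // clamp constant from the V_FRACT workaround described above.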
2220   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2221 
2222   Register Min = MRI.createGenericVirtualRegister(S64);
2223 
2224   // We don't need to concern ourselves with the snan handling difference, so
2225   // use the one which will directly select.
2226   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2227   if (MFI->getMode().IEEE)
2228     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2229   else
2230     B.buildFMinNum(Min, Fract, Const, Flags);
2231 
2232   Register CorrectedFract = Min;
2233   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2234     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2235     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2236   }
2237 
2238   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2239   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2240 
2241   MI.eraseFromParent();
2242   return true;
2243 }
2244 
2245 // Turn an illegal packed v2s16 build vector into bit operations.
2246 // TODO: This should probably be a bitcast action in LegalizerHelper.
2247 bool AMDGPULegalizerInfo::legalizeBuildVector(
2248   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2249   Register Dst = MI.getOperand(0).getReg();
2250   const LLT S32 = LLT::scalar(32);
2251   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2252 
2253   Register Src0 = MI.getOperand(1).getReg();
2254   Register Src1 = MI.getOperand(2).getReg();
2255   assert(MRI.getType(Src0) == LLT::scalar(16));
2256 
2257   B.setInstr(MI);
2258   auto Merge = B.buildMerge(S32, {Src0, Src1});
2259   B.buildBitcast(Dst, Merge);
2260 
2261   MI.eraseFromParent();
2262   return true;
2263 }
2264 
2265 // Return the use branch instruction, otherwise null if the usage is invalid.
2266 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2267                                        MachineRegisterInfo &MRI,
2268                                        MachineInstr *&Br) {
2269   Register CondDef = MI.getOperand(0).getReg();
2270   if (!MRI.hasOneNonDBGUse(CondDef))
2271     return nullptr;
2272 
2273   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2274   if (UseMI.getParent() != MI.getParent() ||
2275       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2276     return nullptr;
2277 
2278   // Make sure the cond br is followed by a G_BR
2279   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2280   if (Next != MI.getParent()->end()) {
2281     if (Next->getOpcode() != AMDGPU::G_BR)
2282       return nullptr;
2283     Br = &*Next;
2284   }
2285 
2286   return &UseMI;
2287 }
2288 
2289 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2290                                                MachineRegisterInfo &MRI,
2291                                                Register LiveIn,
2292                                                Register PhyReg) const {
2293   assert(PhyReg.isPhysical() && "Physical register expected");
2294 
2295   // Insert the live-in copy, if required, by defining destination virtual
2296   // register.
2297   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2298   if (!MRI.getVRegDef(LiveIn)) {
2299     // FIXME: Should have scoped insert pt
2300     MachineBasicBlock &OrigInsBB = B.getMBB();
2301     auto OrigInsPt = B.getInsertPt();
2302 
2303     MachineBasicBlock &EntryMBB = B.getMF().front();
2304     EntryMBB.addLiveIn(PhyReg);
2305     B.setInsertPt(EntryMBB, EntryMBB.begin());
2306     B.buildCopy(LiveIn, PhyReg);
2307 
2308     B.setInsertPt(OrigInsBB, OrigInsPt);
2309   }
2310 
2311   return LiveIn;
2312 }
2313 
2314 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2315                                                 MachineRegisterInfo &MRI,
2316                                                 Register PhyReg, LLT Ty,
2317                                                 bool InsertLiveInCopy) const {
2318   assert(PhyReg.isPhysical() && "Physical register expected");
2319 
  // Get or create the virtual live-in register.
2321   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2322   if (!LiveIn) {
2323     LiveIn = MRI.createGenericVirtualRegister(Ty);
2324     MRI.addLiveIn(PhyReg, LiveIn);
2325   }
2326 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
2330   if (!InsertLiveInCopy)
2331     return LiveIn;
2332 
2333   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2334 }
2335 
2336 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2337     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2338   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2339   const ArgDescriptor *Arg;
2340   const TargetRegisterClass *RC;
2341   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2342   if (!Arg) {
2343     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2344     return nullptr;
2345   }
2346   return Arg;
2347 }
2348 
2349 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2350                                          const ArgDescriptor *Arg) const {
2351   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2352     return false; // TODO: Handle these
2353 
2354   Register SrcReg = Arg->getRegister();
2355   assert(SrcReg.isPhysical() && "Physical register expected");
2356   assert(DstReg.isVirtual() && "Virtual register expected");
2357 
2358   MachineRegisterInfo &MRI = *B.getMRI();
2359 
2360   LLT Ty = MRI.getType(DstReg);
2361   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2362 
2363   if (Arg->isMasked()) {
2364     // TODO: Should we try to emit this once in the entry block?
2365     const LLT S32 = LLT::scalar(32);
2366     const unsigned Mask = Arg->getMask();
2367     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2368 
2369     Register AndMaskSrc = LiveIn;
2370 
2371     if (Shift != 0) {
2372       auto ShiftAmt = B.buildConstant(S32, Shift);
2373       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2374     }
2375 
2376     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2377   } else {
2378     B.buildCopy(DstReg, LiveIn);
2379   }
2380 
2381   return true;
2382 }
2383 
2384 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2385     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2386     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2387   B.setInstr(MI);
2388 
2389   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2390   if (!Arg)
2391     return false;
2392 
2393   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2394     return false;
2395 
2396   MI.eraseFromParent();
2397   return true;
2398 }
2399 
2400 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2401                                        MachineRegisterInfo &MRI,
2402                                        MachineIRBuilder &B) const {
2403   B.setInstr(MI);
2404   Register Dst = MI.getOperand(0).getReg();
2405   LLT DstTy = MRI.getType(Dst);
2406   LLT S16 = LLT::scalar(16);
2407   LLT S32 = LLT::scalar(32);
2408   LLT S64 = LLT::scalar(64);
2409 
2410   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2411     return true;
2412 
2413   if (DstTy == S16)
2414     return legalizeFDIV16(MI, MRI, B);
2415   if (DstTy == S32)
2416     return legalizeFDIV32(MI, MRI, B);
2417   if (DstTy == S64)
2418     return legalizeFDIV64(MI, MRI, B);
2419 
2420   return false;
2421 }
2422 
2423 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2424   const LLT S32 = LLT::scalar(32);
2425 
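  // Approximate 2^32 / Src: convert to f32, take the reciprocal with
  // RCP_IFLAG, scale by 2^32 (0x4f800000 is 2.0^32 as an f32), and convert
  // back to an unsigned integer.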
2426   auto Cvt0 = B.buildUITOFP(S32, Src);
2427   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2428   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2429   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2430   return B.buildFPTOUI(S32, Mul).getReg(0);
2431 }
2432 
2433 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2434                                                   Register DstReg,
2435                                                   Register Num,
2436                                                   Register Den,
2437                                                   bool IsRem) const {
2438   const LLT S1 = LLT::scalar(1);
2439   const LLT S32 = LLT::scalar(32);
2440 
2441   // RCP =  URECIP(Den) = 2^32 / Den + e
2442   // e is rounding error.
2443   auto RCP = buildDivRCP(B, Den);
2444 
2445   // RCP_LO = mul(RCP, Den)
2446   auto RCP_LO = B.buildMul(S32, RCP, Den);
2447 
  // RCP_HI = mulhu(RCP, Den)
2449   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2450 
2451   // NEG_RCP_LO = -RCP_LO
2452   auto Zero = B.buildConstant(S32, 0);
2453   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2454 
2455   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2456   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2457   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2458 
2459   // Calculate the rounding error from the URECIP instruction
2460   // E = mulhu(ABS_RCP_LO, RCP)
2461   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2462 
2463   // RCP_A_E = RCP + E
2464   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2465 
2466   // RCP_S_E = RCP - E
2467   auto RCP_S_E = B.buildSub(S32, RCP, E);
2468 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2470   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2471 
  // Quotient = mulhu(Tmp0, Num)
2473   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2474 
2475   // Num_S_Remainder = Quotient * Den
2476   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2477 
2478   // Remainder = Num - Num_S_Remainder
2479   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2480 
2481   // Remainder_GE_Den = Remainder >= Den
2482   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2483 
2484   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2485   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2486                                        Num, Num_S_Remainder);
2487 
2488   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2489   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2490 
2491   // Calculate Division result:
2492 
2493   // Quotient_A_One = Quotient + 1
2494   auto One = B.buildConstant(S32, 1);
2495   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2496 
2497   // Quotient_S_One = Quotient - 1
2498   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2499 
2500   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2501   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2502 
2503   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2504   if (IsRem) {
2505     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2506 
2507     // Calculate Rem result:
2508     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2509 
2510     // Remainder_A_Den = Remainder + Den
2511     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2512 
2513     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2514     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2515 
2516     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2517     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2518   } else {
2519     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2520   }
2521 }
2522 
2523 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2524                                               MachineRegisterInfo &MRI,
2525                                               MachineIRBuilder &B) const {
2526   B.setInstr(MI);
2527   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2528   Register DstReg = MI.getOperand(0).getReg();
2529   Register Num = MI.getOperand(1).getReg();
2530   Register Den = MI.getOperand(2).getReg();
2531   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2532   MI.eraseFromParent();
2533   return true;
2534 }
2535 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2537 //
2538 // Return lo, hi of result
2539 //
2540 // %cvt.lo = G_UITOFP Val.lo
2541 // %cvt.hi = G_UITOFP Val.hi
2542 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2543 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2544 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2545 // %mul2 = G_FMUL %mul1, 2**(-32)
2546 // %trunc = G_INTRINSIC_TRUNC %mul2
2547 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2548 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2549 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2550                                                        Register Val) {
2551   const LLT S32 = LLT::scalar(32);
2552   auto Unmerge = B.buildUnmerge(S32, Val);
2553 
2554   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2555   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2556 
2557   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2558                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2559 
2560   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2561   auto Mul1 =
2562       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2563 
2564   // 2**(-32)
2565   auto Mul2 =
2566       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2567   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2568 
2569   // -(2**32)
2570   auto Mad2 = B.buildFMAD(S32, Trunc,
2571                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2572 
2573   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2574   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2575 
2576   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2577 }
2578 
2579 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2580                                               MachineRegisterInfo &MRI,
2581                                               MachineIRBuilder &B) const {
2582   B.setInstr(MI);
2583 
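  // Start from a fixed-point estimate of 2^64 / Denom (emitReciprocalU64),
  // refine it with two Newton-Raphson style correction steps, form the
  // candidate quotient with a 64-bit mulhu, and then apply up to two
  // conditional corrections (the C3 and C6 selects below).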
2584   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2585   const LLT S32 = LLT::scalar(32);
2586   const LLT S64 = LLT::scalar(64);
2587   const LLT S1 = LLT::scalar(1);
2588   Register Numer = MI.getOperand(1).getReg();
2589   Register Denom = MI.getOperand(2).getReg();
2590   Register RcpLo, RcpHi;
2591 
2592   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2593 
2594   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2595 
2596   auto Zero64 = B.buildConstant(S64, 0);
2597   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2598 
2599   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2600   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2601 
2602   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2603   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2604   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2605 
2606   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2607   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2608   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2609   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2610 
2611   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2612   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2613   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2614   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2615   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2616 
2617   auto Zero32 = B.buildConstant(S32, 0);
2618   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2619   auto Add2_HiC =
2620       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2621   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2622   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2623 
2624   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2625   Register NumerLo = UnmergeNumer.getReg(0);
2626   Register NumerHi = UnmergeNumer.getReg(1);
2627 
2628   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2629   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2630   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2631   Register Mul3_Lo = UnmergeMul3.getReg(0);
2632   Register Mul3_Hi = UnmergeMul3.getReg(1);
2633   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2634   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2635   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2636   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2637 
2638   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2639   Register DenomLo = UnmergeDenom.getReg(0);
2640   Register DenomHi = UnmergeDenom.getReg(1);
2641 
2642   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2643   auto C1 = B.buildSExt(S32, CmpHi);
2644 
2645   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2646   auto C2 = B.buildSExt(S32, CmpLo);
2647 
2648   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2649   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2650 
2651   // TODO: Here and below portions of the code can be enclosed into if/endif.
2652   // Currently control flow is unconditional and we have 4 selects after
2653   // potential endif to substitute PHIs.
2654 
2655   // if C3 != 0 ...
2656   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2657   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2658   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2659   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2660 
2661   auto One64 = B.buildConstant(S64, 1);
2662   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2663 
2664   auto C4 =
2665       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2666   auto C5 =
2667       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2668   auto C6 = B.buildSelect(
2669       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2670 
2671   // if (C6 != 0)
2672   auto Add4 = B.buildAdd(S64, Add3, One64);
2673   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2674 
2675   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2676   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2677   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2678 
2679   // endif C6
2680   // endif C3
2681 
2682   if (IsDiv) {
2683     auto Sel1 = B.buildSelect(
2684         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2685     B.buildSelect(MI.getOperand(0),
2686                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2687   } else {
2688     auto Sel2 = B.buildSelect(
2689         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2690     B.buildSelect(MI.getOperand(0),
2691                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2692   }
2693 
2694   MI.eraseFromParent();
2695   return true;
2696 }
2697 
2698 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2699                                             MachineRegisterInfo &MRI,
2700                                             MachineIRBuilder &B) const {
2701   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2702   if (Ty == LLT::scalar(32))
2703     return legalizeUDIV_UREM32(MI, MRI, B);
2704   if (Ty == LLT::scalar(64))
2705     return legalizeUDIV_UREM64(MI, MRI, B);
2706   return false;
2707 }
2708 
2709 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2710                                               MachineRegisterInfo &MRI,
2711                                               MachineIRBuilder &B) const {
2712   B.setInstr(MI);
2713   const LLT S32 = LLT::scalar(32);
2714 
2715   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2716   Register DstReg = MI.getOperand(0).getReg();
2717   Register LHS = MI.getOperand(1).getReg();
2718   Register RHS = MI.getOperand(2).getReg();
2719 
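  // Compute absolute values as (x + (x >> 31)) ^ (x >> 31), do the unsigned
  // division/remainder on those, then restore the sign: the remainder takes
  // the sign of the LHS, the quotient the xor of the two operand signs.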
2720   auto ThirtyOne = B.buildConstant(S32, 31);
2721   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2723 
2724   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2725   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2726 
2727   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2728   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2729 
2730   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2731   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2732 
2733   if (IsRem) {
2734     auto RSign = LHSign; // Remainder sign is the same as LHS
2735     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2736     B.buildSub(DstReg, UDivRem, RSign);
2737   } else {
2738     auto DSign = B.buildXor(S32, LHSign, RHSign);
2739     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2740     B.buildSub(DstReg, UDivRem, DSign);
2741   }
2742 
2743   MI.eraseFromParent();
2744   return true;
2745 }
2746 
2747 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2748                                             MachineRegisterInfo &MRI,
2749                                             MachineIRBuilder &B) const {
2750   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2751     return legalizeSDIV_SREM32(MI, MRI, B);
2752   return false;
2753 }
2754 
2755 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2756                                                  MachineRegisterInfo &MRI,
2757                                                  MachineIRBuilder &B) const {
2758   Register Res = MI.getOperand(0).getReg();
2759   Register LHS = MI.getOperand(1).getReg();
2760   Register RHS = MI.getOperand(2).getReg();
2761 
2762   uint16_t Flags = MI.getFlags();
2763 
2764   LLT ResTy = MRI.getType(Res);
2765   LLT S32 = LLT::scalar(32);
2766   LLT S64 = LLT::scalar(64);
2767 
2768   const MachineFunction &MF = B.getMF();
2769   bool Unsafe =
2770     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2771 
2772   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2773     return false;
2774 
2775   if (!Unsafe && ResTy == S32 &&
2776       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2777     return false;
2778 
2779   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2780     // 1 / x -> RCP(x)
2781     if (CLHS->isExactlyValue(1.0)) {
2782       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2783         .addUse(RHS)
2784         .setMIFlags(Flags);
2785 
2786       MI.eraseFromParent();
2787       return true;
2788     }
2789 
2790     // -1 / x -> RCP( FNEG(x) )
2791     if (CLHS->isExactlyValue(-1.0)) {
2792       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2793       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2794         .addUse(FNeg.getReg(0))
2795         .setMIFlags(Flags);
2796 
2797       MI.eraseFromParent();
2798       return true;
2799     }
2800   }
2801 
2802   // x / y -> x * (1.0 / y)
2803   if (Unsafe) {
2804     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2805       .addUse(RHS)
2806       .setMIFlags(Flags);
2807     B.buildFMul(Res, LHS, RCP, Flags);
2808 
2809     MI.eraseFromParent();
2810     return true;
2811   }
2812 
2813   return false;
2814 }
2815 
2816 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2817                                          MachineRegisterInfo &MRI,
2818                                          MachineIRBuilder &B) const {
2819   B.setInstr(MI);
2820   Register Res = MI.getOperand(0).getReg();
2821   Register LHS = MI.getOperand(1).getReg();
2822   Register RHS = MI.getOperand(2).getReg();
2823 
2824   uint16_t Flags = MI.getFlags();
2825 
2826   LLT S16 = LLT::scalar(16);
2827   LLT S32 = LLT::scalar(32);
2828 
2829   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2830   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2831 
2832   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2833     .addUse(RHSExt.getReg(0))
2834     .setMIFlags(Flags);
2835 
2836   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2837   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2838 
2839   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2840     .addUse(RDst.getReg(0))
2841     .addUse(RHS)
2842     .addUse(LHS)
2843     .setMIFlags(Flags);
2844 
2845   MI.eraseFromParent();
2846   return true;
2847 }
2848 
2849 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2850 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2851 static void toggleSPDenormMode(bool Enable,
2852                                MachineIRBuilder &B,
2853                                const GCNSubtarget &ST,
2854                                AMDGPU::SIModeRegisterDefaults Mode) {
2855   // Set SP denorm mode to this value.
2856   unsigned SPDenormMode =
2857     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2858 
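  // The FP denormal controls occupy a 4-bit field of the MODE register, with
  // FP32 in the low two bits and FP64/FP16 in the high two, which is why the
  // DP default is shifted up by 2 below and the setreg fallback writes a
  // 2-bit field at offset 4.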
2859   if (ST.hasDenormModeInst()) {
2860     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2861     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2862 
2863     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2864     B.buildInstr(AMDGPU::S_DENORM_MODE)
2865       .addImm(NewDenormModeValue);
2866 
2867   } else {
2868     // Select FP32 bit field in mode register.
2869     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2870                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2871                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2872 
2873     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2874       .addImm(SPDenormMode)
2875       .addImm(SPDenormModeBitField);
2876   }
2877 }
2878 
2879 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2880                                          MachineRegisterInfo &MRI,
2881                                          MachineIRBuilder &B) const {
2882   B.setInstr(MI);
2883   Register Res = MI.getOperand(0).getReg();
2884   Register LHS = MI.getOperand(1).getReg();
2885   Register RHS = MI.getOperand(2).getReg();
2886   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2887   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2888 
2889   uint16_t Flags = MI.getFlags();
2890 
2891   LLT S32 = LLT::scalar(32);
2892   LLT S1 = LLT::scalar(1);
2893 
2894   auto One = B.buildFConstant(S32, 1.0f);
2895 
2896   auto DenominatorScaled =
2897     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2898       .addUse(RHS)
2899       .addUse(LHS)
2900       .addImm(1)
2901       .setMIFlags(Flags);
2902   auto NumeratorScaled =
2903     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2904       .addUse(LHS)
2905       .addUse(RHS)
2906       .addImm(0)
2907       .setMIFlags(Flags);
2908 
2909   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2910     .addUse(DenominatorScaled.getReg(0))
2911     .setMIFlags(Flags);
2912   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2913 
2914   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2915   // aren't modeled as reading it.
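  // The intermediate results of the sequence below may be denormal even when
  // the inputs and the final quotient are not, so FP32 denormals are
  // temporarily enabled around the FMA chain if the function's default mode
  // would flush them.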
2916   if (!Mode.allFP32Denormals())
2917     toggleSPDenormMode(true, B, ST, Mode);
2918 
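  // One Newton-Raphson refinement of the reciprocal, then the quotient and a
  // correction term, with n and d the scaled numerator and denominator:
  //   Fma0 = 1 - d * rcp(d)          ; error of the initial approximation
  //   Fma1 = rcp(d) + Fma0 * rcp(d)  ; refined reciprocal
  //   Mul  = n * Fma1                ; initial quotient
  //   Fma2 = n - d * Mul             ; residual
  //   Fma3 = Mul + Fma2 * Fma1       ; refined quotient
  //   Fma4 = n - d * Fma3            ; final residual, consumed by div_fmas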
2919   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2920   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2921   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2922   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2923   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2924   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2925 
2926   if (!Mode.allFP32Denormals())
2927     toggleSPDenormMode(false, B, ST, Mode);
2928 
2929   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2930     .addUse(Fma4.getReg(0))
2931     .addUse(Fma1.getReg(0))
2932     .addUse(Fma3.getReg(0))
2933     .addUse(NumeratorScaled.getReg(1))
2934     .setMIFlags(Flags);
2935 
2936   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2937     .addUse(Fmas.getReg(0))
2938     .addUse(RHS)
2939     .addUse(LHS)
2940     .setMIFlags(Flags);
2941 
2942   MI.eraseFromParent();
2943   return true;
2944 }
2945 
2946 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2947                                          MachineRegisterInfo &MRI,
2948                                          MachineIRBuilder &B) const {
2949   B.setInstr(MI);
2950   Register Res = MI.getOperand(0).getReg();
2951   Register LHS = MI.getOperand(1).getReg();
2952   Register RHS = MI.getOperand(2).getReg();
2953 
2954   uint16_t Flags = MI.getFlags();
2955 
2956   LLT S64 = LLT::scalar(64);
2957   LLT S1 = LLT::scalar(1);
2958 
2959   auto One = B.buildFConstant(S64, 1.0);
2960 
2961   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2962     .addUse(LHS)
2963     .addUse(RHS)
2964     .addImm(1)
2965     .setMIFlags(Flags);
2966 
2967   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2968 
2969   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2970     .addUse(DivScale0.getReg(0))
2971     .setMIFlags(Flags);
2972 
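  // Two Newton-Raphson refinements of the reciprocal, then the quotient and
  // residual, writing d for DivScale0 (the value whose reciprocal is taken)
  // and n for DivScale1:
  //   Fma0 = 1 - d * rcp(d)
  //   Fma1 = rcp(d) + rcp(d) * Fma0   ; first refinement
  //   Fma2 = 1 - d * Fma1
  //   Fma3 = Fma1 + Fma1 * Fma2       ; second refinement
  //   Mul  = n * Fma3                 ; quotient estimate
  //   Fma4 = n - d * Mul              ; residual, consumed by div_fmas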
2973   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2974   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2975   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2976 
2977   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2978     .addUse(LHS)
2979     .addUse(RHS)
2980     .addImm(0)
2981     .setMIFlags(Flags);
2982 
2983   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2985   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2986 
2987   Register Scale;
2988   if (!ST.hasUsableDivScaleConditionOutput()) {
2989     // Workaround a hardware bug on SI where the condition output from div_scale
2990     // is not usable.
2991 
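    // Recompute an equivalent condition from the operands instead: compare the
    // high dword (sign and exponent) of LHS with that of DivScale1, and of RHS
    // with that of DivScale0, then xor the two equality results.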
2992     LLT S32 = LLT::scalar(32);
2993 
2994     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2995     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2996     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2997     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2998 
2999     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3000                               Scale1Unmerge.getReg(1));
3001     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3002                               Scale0Unmerge.getReg(1));
3003     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3004   } else {
3005     Scale = DivScale1.getReg(1);
3006   }
3007 
3008   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3009     .addUse(Fma4.getReg(0))
3010     .addUse(Fma3.getReg(0))
3011     .addUse(Mul.getReg(0))
3012     .addUse(Scale)
3013     .setMIFlags(Flags);
3014 
3015   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3016     .addUse(Fmas.getReg(0))
3017     .addUse(RHS)
3018     .addUse(LHS)
3019     .setMIFlags(Flags);
3020 
3021   MI.eraseFromParent();
3022   return true;
3023 }
3024 
3025 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3026                                                  MachineRegisterInfo &MRI,
3027                                                  MachineIRBuilder &B) const {
3028   B.setInstr(MI);
3029   Register Res = MI.getOperand(0).getReg();
3030   Register LHS = MI.getOperand(2).getReg();
3031   Register RHS = MI.getOperand(3).getReg();
3032   uint16_t Flags = MI.getFlags();
3033 
3034   LLT S32 = LLT::scalar(32);
3035   LLT S1 = LLT::scalar(1);
3036 
3037   auto Abs = B.buildFAbs(S32, RHS, Flags);
3038   const APFloat C0Val(1.0f);
3039 
3040   auto C0 = B.buildConstant(S32, 0x6f800000);
3041   auto C1 = B.buildConstant(S32, 0x2f800000);
3042   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
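  // 0x6f800000 and 0x2f800000 are the f32 bit patterns of 2^96 and 2^-32. If
  // |RHS| exceeds 2^96, pre-scale the denominator by 2^-32 so its reciprocal
  // stays in the normal range, then multiply the final product by the same
  // factor so the scaling cancels out.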
3043 
3044   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3045   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3046 
3047   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3048 
3049   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3050     .addUse(Mul0.getReg(0))
3051     .setMIFlags(Flags);
3052 
3053   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3054 
3055   B.buildFMul(Res, Sel, Mul1, Flags);
3056 
3057   MI.eraseFromParent();
3058   return true;
3059 }
3060 
3061 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3062                                                  MachineRegisterInfo &MRI,
3063                                                  MachineIRBuilder &B) const {
3064   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3065   if (!MFI->isEntryFunction()) {
3066     return legalizePreloadedArgIntrin(MI, MRI, B,
3067                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3068   }
3069 
3070   B.setInstr(MI);
3071 
3072   uint64_t Offset =
3073     ST.getTargetLowering()->getImplicitParameterOffset(
3074       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3075   Register DstReg = MI.getOperand(0).getReg();
3076   LLT DstTy = MRI.getType(DstReg);
3077   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3078 
3079   const ArgDescriptor *Arg;
3080   const TargetRegisterClass *RC;
3081   std::tie(Arg, RC)
3082     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3083   if (!Arg)
3084     return false;
3085 
3086   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3087   if (!loadInputValue(KernargPtrReg, B, Arg))
3088     return false;
3089 
3090   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3091   MI.eraseFromParent();
3092   return true;
3093 }
3094 
3095 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3096                                               MachineRegisterInfo &MRI,
3097                                               MachineIRBuilder &B,
3098                                               unsigned AddrSpace) const {
3099   B.setInstr(MI);
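  // A flat pointer is in the queried segment iff the high 32 bits of the
  // address equal that segment's aperture base.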
3100   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3101   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3102   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3103   MI.eraseFromParent();
3104   return true;
3105 }
3106 
3107 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3108 // offset (the offset that is included in bounds checking and swizzling, to be
3109 // split between the instruction's voffset and immoffset fields) and soffset
3110 // (the offset that is excluded from bounds checking and swizzling, to go in
3111 // the instruction's soffset field).  This function takes the first kind of
3112 // offset and figures out how to split it between voffset and immoffset.
3113 std::tuple<Register, unsigned, unsigned>
3114 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3115                                         Register OrigOffset) const {
3116   const unsigned MaxImm = 4095;
3117   Register BaseReg;
3118   unsigned TotalConstOffset;
3119   MachineInstr *OffsetDef;
3120   const LLT S32 = LLT::scalar(32);
3121 
3122   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3123     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3124 
3125   unsigned ImmOffset = TotalConstOffset;
3126 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits there and move the remainder (a multiple of 4096) into the
  // value that is copied/added for the voffset field, so that voffset stands a
  // better chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do this rounding down to a multiple of 4096 if that
  // multiple is negative, as it appears to be illegal to have a negative
  // offset in the vgpr, even if adding the immediate offset makes it positive.
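  // For example, a constant offset of 8200 becomes ImmOffset = 8 with 8192
  // added to the voffset value, while an offset of 100 stays entirely in
  // ImmOffset.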
3134   unsigned Overflow = ImmOffset & ~MaxImm;
3135   ImmOffset -= Overflow;
3136   if ((int32_t)Overflow < 0) {
3137     Overflow += ImmOffset;
3138     ImmOffset = 0;
3139   }
3140 
3141   if (Overflow != 0) {
3142     if (!BaseReg) {
3143       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3144     } else {
3145       auto OverflowVal = B.buildConstant(S32, Overflow);
3146       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3147     }
3148   }
3149 
3150   if (!BaseReg)
3151     BaseReg = B.buildConstant(S32, 0).getReg(0);
3152 
3153   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3154 }
3155 
3156 /// Handle register layout difference for f16 images for some subtargets.
3157 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3158                                              MachineRegisterInfo &MRI,
3159                                              Register Reg) const {
3160   if (!ST.hasUnpackedD16VMem())
3161     return Reg;
3162 
3163   const LLT S16 = LLT::scalar(16);
3164   const LLT S32 = LLT::scalar(32);
3165   LLT StoreVT = MRI.getType(Reg);
3166   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3167 
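  // Unpacked subtargets keep each 16-bit element in the low half of its own
  // 32-bit register, so e.g. a <2 x s16> source is rebuilt as <2 x s32> from
  // any-extended halves.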
3168   auto Unmerge = B.buildUnmerge(S16, Reg);
3169 
3170   SmallVector<Register, 4> WideRegs;
3171   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3172     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3173 
3174   int NumElts = StoreVT.getNumElements();
3175 
3176   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3177 }
3178 
3179 Register AMDGPULegalizerInfo::fixStoreSourceType(
3180   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3181   MachineRegisterInfo *MRI = B.getMRI();
3182   LLT Ty = MRI->getType(VData);
3183 
3184   const LLT S16 = LLT::scalar(16);
3185 
3186   // Fixup illegal register types for i8 stores.
3187   if (Ty == LLT::scalar(8) || Ty == S16) {
3188     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3189     return AnyExt;
3190   }
3191 
3192   if (Ty.isVector()) {
3193     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3194       if (IsFormat)
3195         return handleD16VData(B, *MRI, VData);
3196     }
3197   }
3198 
3199   return VData;
3200 }
3201 
3202 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3203                                               MachineRegisterInfo &MRI,
3204                                               MachineIRBuilder &B,
3205                                               bool IsTyped,
3206                                               bool IsFormat) const {
3207   B.setInstr(MI);
3208 
3209   Register VData = MI.getOperand(1).getReg();
3210   LLT Ty = MRI.getType(VData);
3211   LLT EltTy = Ty.getScalarType();
3212   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3213   const LLT S32 = LLT::scalar(32);
3214 
3215   VData = fixStoreSourceType(B, VData, IsFormat);
3216   Register RSrc = MI.getOperand(2).getReg();
3217 
3218   MachineMemOperand *MMO = *MI.memoperands_begin();
3219   const int MemSize = MMO->getSize();
3220 
3221   unsigned ImmOffset;
3222   unsigned TotalOffset;
3223 
3224   // The typed intrinsics add an immediate after the registers.
3225   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3226 
3227   // The struct intrinsic variants add one additional operand over raw.
3228   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3229   Register VIndex;
3230   int OpOffset = 0;
3231   if (HasVIndex) {
3232     VIndex = MI.getOperand(3).getReg();
3233     OpOffset = 1;
3234   }
3235 
3236   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3237   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3238 
3239   unsigned Format = 0;
3240   if (IsTyped) {
3241     Format = MI.getOperand(5 + OpOffset).getImm();
3242     ++OpOffset;
3243   }
3244 
3245   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3246 
3247   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3248   if (TotalOffset != 0)
3249     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3250 
3251   unsigned Opc;
3252   if (IsTyped) {
3253     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3254                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3255   } else if (IsFormat) {
3256     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3257                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3258   } else {
3259     switch (MemSize) {
3260     case 1:
3261       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3262       break;
3263     case 2:
3264       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3265       break;
3266     default:
3267       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3268       break;
3269     }
3270   }
3271 
3272   if (!VIndex)
3273     VIndex = B.buildConstant(S32, 0).getReg(0);
3274 
3275   auto MIB = B.buildInstr(Opc)
3276     .addUse(VData)              // vdata
3277     .addUse(RSrc)               // rsrc
3278     .addUse(VIndex)             // vindex
3279     .addUse(VOffset)            // voffset
3280     .addUse(SOffset)            // soffset
3281     .addImm(ImmOffset);         // offset(imm)
3282 
3283   if (IsTyped)
3284     MIB.addImm(Format);
3285 
3286   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3287      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3288      .addMemOperand(MMO);
3289 
3290   MI.eraseFromParent();
3291   return true;
3292 }
3293 
3294 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3295                                              MachineRegisterInfo &MRI,
3296                                              MachineIRBuilder &B,
3297                                              bool IsFormat,
3298                                              bool IsTyped) const {
3299   B.setInstr(MI);
3300 
3301   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3302   MachineMemOperand *MMO = *MI.memoperands_begin();
3303   const int MemSize = MMO->getSize();
3304   const LLT S32 = LLT::scalar(32);
3305 
3306   Register Dst = MI.getOperand(0).getReg();
3307   Register RSrc = MI.getOperand(2).getReg();
3308 
3309   // The typed intrinsics add an immediate after the registers.
3310   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3311 
3312   // The struct intrinsic variants add one additional operand over raw.
3313   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3314   Register VIndex;
3315   int OpOffset = 0;
3316   if (HasVIndex) {
3317     VIndex = MI.getOperand(3).getReg();
3318     OpOffset = 1;
3319   }
3320 
3321   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3322   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3323 
3324   unsigned Format = 0;
3325   if (IsTyped) {
3326     Format = MI.getOperand(5 + OpOffset).getImm();
3327     ++OpOffset;
3328   }
3329 
3330   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3331   unsigned ImmOffset;
3332   unsigned TotalOffset;
3333 
3334   LLT Ty = MRI.getType(Dst);
3335   LLT EltTy = Ty.getScalarType();
3336   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3337   const bool Unpacked = ST.hasUnpackedD16VMem();
3338 
3339   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3340   if (TotalOffset != 0)
3341     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3342 
3343   unsigned Opc;
3344 
3345   if (IsTyped) {
3346     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3347                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3348   } else if (IsFormat) {
3349     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3350                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3351   } else {
3352     switch (MemSize) {
3353     case 1:
3354       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3355       break;
3356     case 2:
3357       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3358       break;
3359     default:
3360       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3361       break;
3362     }
3363   }
3364 
3365   Register LoadDstReg;
3366 
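  // Sub-dword loads and scalar D16 loads are performed into a 32-bit temporary
  // and truncated afterwards; D16 vector results on unpacked subtargets are
  // loaded as s32 elements and repacked below.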
3367   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3368   LLT UnpackedTy = Ty.changeElementSize(32);
3369 
3370   if (IsExtLoad)
3371     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3372   else if (Unpacked && IsD16 && Ty.isVector())
3373     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3374   else
3375     LoadDstReg = Dst;
3376 
3377   if (!VIndex)
3378     VIndex = B.buildConstant(S32, 0).getReg(0);
3379 
3380   auto MIB = B.buildInstr(Opc)
3381     .addDef(LoadDstReg)         // vdata
3382     .addUse(RSrc)               // rsrc
3383     .addUse(VIndex)             // vindex
3384     .addUse(VOffset)            // voffset
3385     .addUse(SOffset)            // soffset
3386     .addImm(ImmOffset);         // offset(imm)
3387 
3388   if (IsTyped)
3389     MIB.addImm(Format);
3390 
3391   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3392      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3393      .addMemOperand(MMO);
3394 
3395   if (LoadDstReg != Dst) {
3396     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3397 
    // The result register was widened for an extending load; truncate it back
    // to the original type.
3399     if (IsExtLoad)
3400       B.buildTrunc(Dst, LoadDstReg);
3401     else {
3402       // Repack to original 16-bit vector result
3403       // FIXME: G_TRUNC should work, but legalization currently fails
3404       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3405       SmallVector<Register, 4> Repack;
3406       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3407         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3408       B.buildMerge(Dst, Repack);
3409     }
3410   }
3411 
3412   MI.eraseFromParent();
3413   return true;
3414 }
3415 
3416 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3417                                                MachineIRBuilder &B,
3418                                                bool IsInc) const {
3419   B.setInstr(MI);
3420   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3421                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3422   B.buildInstr(Opc)
3423     .addDef(MI.getOperand(0).getReg())
3424     .addUse(MI.getOperand(2).getReg())
3425     .addUse(MI.getOperand(3).getReg())
3426     .cloneMemRefs(MI);
3427   MI.eraseFromParent();
3428   return true;
3429 }
3430 
3431 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3432   switch (IntrID) {
3433   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3434   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3435     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3436   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3437   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3438     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3439   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3440   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3441     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3442   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3443   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3444     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3445   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3446   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3447     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3448   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3449   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3450     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3451   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3452   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3453     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3454   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3455   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3456     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3457   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3458   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3459     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3460   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3461   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3462     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3463   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3464   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3465     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3466   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3467   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3468     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3469   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3470   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3471     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3472   default:
3473     llvm_unreachable("unhandled atomic opcode");
3474   }
3475 }
3476 
3477 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3478                                                MachineIRBuilder &B,
3479                                                Intrinsic::ID IID) const {
3480   B.setInstr(MI);
3481 
3482   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3483                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3484 
3485   Register Dst = MI.getOperand(0).getReg();
3486   Register VData = MI.getOperand(2).getReg();
3487 
3488   Register CmpVal;
3489   int OpOffset = 0;
3490 
3491   if (IsCmpSwap) {
3492     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3493     ++OpOffset;
3494   }
3495 
3496   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3497   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3498 
3499   // The struct intrinsic variants add one additional operand over raw.
3500   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3501   Register VIndex;
3502   if (HasVIndex) {
3503     VIndex = MI.getOperand(4 + OpOffset).getReg();
3504     ++OpOffset;
3505   }
3506 
3507   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3508   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3509   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3510 
3511   MachineMemOperand *MMO = *MI.memoperands_begin();
3512 
3513   unsigned ImmOffset;
3514   unsigned TotalOffset;
3515   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3516   if (TotalOffset != 0)
3517     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3518 
3519   if (!VIndex)
3520     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3521 
3522   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3523     .addDef(Dst)
3524     .addUse(VData); // vdata
3525 
3526   if (IsCmpSwap)
3527     MIB.addReg(CmpVal);
3528 
3529   MIB.addUse(RSrc)               // rsrc
3530      .addUse(VIndex)             // vindex
3531      .addUse(VOffset)            // voffset
3532      .addUse(SOffset)            // soffset
3533      .addImm(ImmOffset)          // offset(imm)
3534      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3535      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3536      .addMemOperand(MMO);
3537 
3538   MI.eraseFromParent();
3539   return true;
3540 }
3541 
/// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
3544 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3545                                         SmallVectorImpl<Register> &PackedAddrs,
3546                                         int AddrIdx, int DimIdx, int NumVAddrs,
3547                                         int NumGradients) {
3548   const LLT S16 = LLT::scalar(16);
3549   const LLT V2S16 = LLT::vector(2, 16);
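  // Address components are packed pairwise into <2 x s16> registers. A
  // component without a partner (the last odd coordinate, or the end of an
  // odd-sized gradient group) is paired with undef so later components keep
  // their dword alignment.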
3550 
3551   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3552     MachineOperand &SrcOp = MI.getOperand(I);
3553     if (!SrcOp.isReg())
3554       continue; // _L to _LZ may have eliminated this.
3555 
3556     Register AddrReg = SrcOp.getReg();
3557 
3558     if (I < DimIdx) {
3559       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3560       PackedAddrs.push_back(AddrReg);
3561     } else {
3562       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3563       // derivatives dx/dh and dx/dv are packed with undef.
3564       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3565           ((NumGradients / 2) % 2 == 1 &&
3566            (I == DimIdx + (NumGradients / 2) - 1 ||
3567             I == DimIdx + NumGradients - 1)) ||
3568           // Check for _L to _LZ optimization
3569           !MI.getOperand(I + 1).isReg()) {
3570         PackedAddrs.push_back(
3571             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3572                 .getReg(0));
3573       } else {
3574         PackedAddrs.push_back(
3575             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3576                 .getReg(0));
3577         ++I;
3578       }
3579     }
3580   }
3581 }
3582 
3583 /// Convert from separate vaddr components to a single vector address register,
3584 /// and replace the remaining operands with $noreg.
3585 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3586                                      int DimIdx, int NumVAddrs) {
3587   const LLT S32 = LLT::scalar(32);
3588 
3589   SmallVector<Register, 8> AddrRegs;
3590   for (int I = 0; I != NumVAddrs; ++I) {
3591     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3592     if (SrcOp.isReg()) {
3593       AddrRegs.push_back(SrcOp.getReg());
3594       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3595     }
3596   }
3597 
3598   int NumAddrRegs = AddrRegs.size();
3599   if (NumAddrRegs != 1) {
3600     // Round up to 8 elements for v5-v7
3601     // FIXME: Missing intermediate sized register classes and instructions.
3602     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3603       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3604       auto Undef = B.buildUndef(S32);
3605       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3606       NumAddrRegs = RoundedNumRegs;
3607     }
3608 
3609     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3610     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3611   }
3612 
3613   for (int I = 1; I != NumVAddrs; ++I) {
3614     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3615     if (SrcOp.isReg())
3616       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3617   }
3618 }
3619 
3620 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3621 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be
3623 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3624 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3625 /// registers.
3626 ///
3627 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3632 /// now unnecessary arguments with $noreg.
3633 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3634     MachineInstr &MI, MachineIRBuilder &B,
3635     GISelChangeObserver &Observer,
3636     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3637   B.setInstr(MI);
3638 
3639   const int NumDefs = MI.getNumExplicitDefs();
3640   bool IsTFE = NumDefs == 2;
3641   // We are only processing the operands of d16 image operations on subtargets
3642   // that use the unpacked register layout, or need to repack the TFE result.
3643 
3644   // TODO: Do we need to guard against already legalized intrinsics?
3645   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3646     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3647 
3648   MachineRegisterInfo *MRI = B.getMRI();
3649   const LLT S32 = LLT::scalar(32);
3650   const LLT S16 = LLT::scalar(16);
3651   const LLT V2S16 = LLT::vector(2, 16);
3652 
3653   // Index of first address argument
3654   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3655 
3656   // Check for 16 bit addresses and pack if true.
3657   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3658   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3659   const bool IsA16 = AddrTy == S16;
3660 
3661   int NumVAddrs, NumGradients;
3662   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3663   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3664     getDMaskIdx(BaseOpcode, NumDefs);
3665   unsigned DMask = 0;
3666 
3667   int DMaskLanes = 0;
3668   if (!BaseOpcode->Atomic) {
3669     DMask = MI.getOperand(DMaskIdx).getImm();
3670     if (BaseOpcode->Gather4) {
3671       DMaskLanes = 4;
3672     } else if (DMask != 0) {
3673       DMaskLanes = countPopulation(DMask);
3674     } else if (!IsTFE && !BaseOpcode->Store) {
3675       // If dmask is 0, this is a no-op load. This can be eliminated.
3676       B.buildUndef(MI.getOperand(0));
3677       MI.eraseFromParent();
3678       return true;
3679     }
3680   }
3681 
3682   Observer.changingInstr(MI);
3683   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3684 
3685   unsigned NewOpcode = NumDefs == 0 ?
3686     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3687 
3688   // Track that we legalized this
3689   MI.setDesc(B.getTII().get(NewOpcode));
3690 
  // With TFE on and dmask == 0, we still expect an error flag result. Force
  // dmask to be at least 1, otherwise the instruction will fail.
3693   if (IsTFE && DMask == 0) {
3694     DMask = 0x1;
3695     DMaskLanes = 1;
3696     MI.getOperand(DMaskIdx).setImm(DMask);
3697   }
3698 
3699   if (BaseOpcode->Atomic) {
3700     Register VData0 = MI.getOperand(2).getReg();
3701     LLT Ty = MRI->getType(VData0);
3702 
3703     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3704     if (Ty.isVector())
3705       return false;
3706 
3707     if (BaseOpcode->AtomicX2) {
3708       Register VData1 = MI.getOperand(3).getReg();
3709       // The two values are packed in one register.
3710       LLT PackedTy = LLT::vector(2, Ty);
3711       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3712       MI.getOperand(2).setReg(Concat.getReg(0));
3713       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3714     }
3715   }
3716 
3717   int CorrectedNumVAddrs = NumVAddrs;
3718 
3719   // Optimize _L to _LZ when _L is zero
3720   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3721         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3722     const ConstantFP *ConstantLod;
3723     const int LodIdx = AddrIdx + NumVAddrs - 1;
3724 
3725     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3726       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3727         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3728         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3729           LZMappingInfo->LZ, ImageDimIntr->Dim);
3730 
3731         // The starting indexes should remain in the same place.
3732         --NumVAddrs;
3733         --CorrectedNumVAddrs;
3734 
3735         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3736           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3737         MI.RemoveOperand(LodIdx);
3738       }
3739     }
3740   }
3741 
3742   // Optimize _mip away, when 'lod' is zero
3743   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3744     int64_t ConstantLod;
3745     const int LodIdx = AddrIdx + NumVAddrs - 1;
3746 
3747     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3748       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3751         MI.getOperand(LodIdx).ChangeToImmediate(0);
3752         --CorrectedNumVAddrs;
3753       }
3754     }
3755   }
3756 
3757   // If the register allocator cannot place the address registers contiguously
3758   // without introducing moves, then using the non-sequential address encoding
3759   // is always preferable, since it saves VALU instructions and is usually a
3760   // wash in terms of code size or even better.
3761   //
3762   // However, we currently have no way of hinting to the register allocator
3763   // that MIMG addresses should be placed contiguously when it is possible to
3764   // do so, so force non-NSA for the common 2-address case as a heuristic.
3765   //
3766   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3767   // allocation when possible.
3768   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3769 
3770   // Rewrite the addressing register layout before doing anything else.
3771   if (IsA16) {
3772     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3773     // should be introduced.
3774     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3775       return false;
3776 
3777     if (NumVAddrs > 1) {
3778       SmallVector<Register, 4> PackedRegs;
3779       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3780                                   NumGradients);
3781 
3782       if (!UseNSA && PackedRegs.size() > 1) {
3783         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3784         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3785         PackedRegs[0] = Concat.getReg(0);
3786         PackedRegs.resize(1);
3787       }
3788 
3789       const int NumPacked = PackedRegs.size();
3790       for (int I = 0; I != NumVAddrs; ++I) {
3791         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3792         if (!SrcOp.isReg()) {
3793           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3794           continue;
3795         }
3796 
3797         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3798 
3799         if (I < NumPacked)
3800           SrcOp.setReg(PackedRegs[I]);
3801         else
3802           SrcOp.setReg(AMDGPU::NoRegister);
3803       }
3804     }
3805   } else if (!UseNSA && NumVAddrs > 1) {
3806     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3807   }
3808 
3809 
3810   if (BaseOpcode->Store) { // No TFE for stores?
3811     // TODO: Handle dmask trim
3812     Register VData = MI.getOperand(1).getReg();
3813     LLT Ty = MRI->getType(VData);
3814     if (!Ty.isVector() || Ty.getElementType() != S16)
3815       return true;
3816 
3817     B.setInstr(MI);
3818 
3819     Register RepackedReg = handleD16VData(B, *MRI, VData);
3820     if (RepackedReg != VData) {
3821       MI.getOperand(1).setReg(RepackedReg);
3822     }
3823 
3824     return true;
3825   }
3826 
3827   Register DstReg = MI.getOperand(0).getReg();
3828   LLT Ty = MRI->getType(DstReg);
3829   const LLT EltTy = Ty.getScalarType();
3830   const bool IsD16 = Ty.getScalarType() == S16;
3831   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3832 
3833   // Confirm that the return type is large enough for the dmask specified
3834   if (NumElts < DMaskLanes)
3835     return false;
3836 
3837   if (NumElts > 4 || DMaskLanes > 4)
3838     return false;
3839 
3840   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3841   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3842 
3843   // The raw dword aligned data component of the load. The only legal cases
3844   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3846   LLT RoundedTy;
3847 
  // S32 vector to cover all the data, plus the TFE result element.
3849   LLT TFETy;
3850 
3851   // Register type to use for each loaded component. Will be S32 or V2S16.
3852   LLT RegTy;
3853 
3854   if (IsD16 && ST.hasUnpackedD16VMem()) {
3855     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3856     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3857     RegTy = S32;
3858   } else {
3859     unsigned EltSize = EltTy.getSizeInBits();
3860     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3861     unsigned RoundedSize = 32 * RoundedElts;
3862     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3863     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3864     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3865   }
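  // For example, with three enabled dmask lanes on a D16 load, the packed path
  // gives RoundedTy = <4 x s16>, TFETy = <3 x s32> and RegTy = <2 x s16>
  // (s32 if TFE is on), while the unpacked path gives RoundedTy = <3 x s32>,
  // TFETy = <4 x s32> and RegTy = s32.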
3866 
3867   // The return type does not need adjustment.
3868   // TODO: Should we change s16 case to s32 or <2 x s16>?
3869   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3870     return true;
3871 
3872   Register Dst1Reg;
3873 
3874   // Insert after the instruction.
3875   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3876 
3877   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3878   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3879   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3880   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3881 
3882   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3883 
3884   MI.getOperand(0).setReg(NewResultReg);
3885 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
3890 
3891   if (IsTFE) {
3892     Dst1Reg = MI.getOperand(1).getReg();
3893     if (MRI->getType(Dst1Reg) != S32)
3894       return false;
3895 
3896     // TODO: Make sure the TFE operand bit is set.
3897     MI.RemoveOperand(1);
3898 
3899     // Handle the easy case that requires no repack instructions.
3900     if (Ty == S32) {
3901       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3902       return true;
3903     }
3904   }
3905 
3906   // Now figure out how to copy the new result register back into the old
3907   // result.
3908   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3909 
3910   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3911 
3912   if (ResultNumRegs == 1) {
3913     assert(!IsTFE);
3914     ResultRegs[0] = NewResultReg;
3915   } else {
3916     // We have to repack into a new vector of some kind.
3917     for (int I = 0; I != NumDataRegs; ++I)
3918       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3919     B.buildUnmerge(ResultRegs, NewResultReg);
3920 
3921     // Drop the final TFE element to get the data part. The TFE result is
3922     // directly written to the right place already.
3923     if (IsTFE)
3924       ResultRegs.resize(NumDataRegs);
3925   }
3926 
3927   // For an s16 scalar result, we form an s32 result with a truncate regardless
3928   // of packed vs. unpacked.
3929   if (IsD16 && !Ty.isVector()) {
3930     B.buildTrunc(DstReg, ResultRegs[0]);
3931     return true;
3932   }
3933 
3934   // Avoid a build/concat_vector of 1 entry.
3935   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3936     B.buildBitcast(DstReg, ResultRegs[0]);
3937     return true;
3938   }
3939 
3940   assert(Ty.isVector());
3941 
3942   if (IsD16) {
3943     // For packed D16 results with TFE enabled, all the data components are
3944     // S32. Cast back to the expected type.
3945     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
3948     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3949       for (Register &Reg : ResultRegs)
3950         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3951     } else if (ST.hasUnpackedD16VMem()) {
3952       for (Register &Reg : ResultRegs)
3953         Reg = B.buildTrunc(S16, Reg).getReg(0);
3954     }
3955   }
3956 
3957   auto padWithUndef = [&](LLT Ty, int NumElts) {
3958     if (NumElts == 0)
3959       return;
3960     Register Undef = B.buildUndef(Ty).getReg(0);
3961     for (int I = 0; I != NumElts; ++I)
3962       ResultRegs.push_back(Undef);
3963   };
3964 
3965   // Pad out any elements eliminated due to the dmask.
3966   LLT ResTy = MRI->getType(ResultRegs[0]);
3967   if (!ResTy.isVector()) {
3968     padWithUndef(ResTy, NumElts - ResultRegs.size());
3969     B.buildBuildVector(DstReg, ResultRegs);
3970     return true;
3971   }
3972 
3973   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3974   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3975 
3976   // Deal with the one annoying legal case.
3977   const LLT V3S16 = LLT::vector(3, 16);
3978   if (Ty == V3S16) {
3979     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3980     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3981     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3982     return true;
3983   }
3984 
3985   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3986   B.buildConcatVectors(DstReg, ResultRegs);
3987   return true;
3988 }
3989 
3990 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3991   MachineInstr &MI, MachineIRBuilder &B,
3992   GISelChangeObserver &Observer) const {
3993   Register Dst = MI.getOperand(0).getReg();
3994   LLT Ty = B.getMRI()->getType(Dst);
3995   unsigned Size = Ty.getSizeInBits();
3996   MachineFunction &MF = B.getMF();
3997 
3998   Observer.changingInstr(MI);
3999 
4000   // FIXME: We don't really need this intermediate instruction. The intrinsic
4001   // should be fixed to have a memory operand. Since it's readnone, we're not
4002   // allowed to add one.
4003   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4004   MI.RemoveOperand(1); // Remove intrinsic ID
4005 
4006   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4007   // TODO: Should this use datalayout alignment?
4008   const unsigned MemSize = (Size + 7) / 8;
4009   const Align MemAlign(4);
4010   MachineMemOperand *MMO = MF.getMachineMemOperand(
4011       MachinePointerInfo(),
4012       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4013           MachineMemOperand::MOInvariant,
4014       MemSize, MemAlign);
4015   MI.addMemOperand(MF, MMO);
4016 
4017   // There are no 96-bit result scalar loads, but widening to 128-bit should
4018   // always be legal. We may need to restore this to a 96-bit result if it turns
4019   // out this needs to be converted to a vector load during RegBankSelect.
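  // e.g. an s96 result is widened to s128 and a <3 x s32> result gains a
  // fourth element.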
4020   if (!isPowerOf2_32(Size)) {
4021     LegalizerHelper Helper(MF, *this, Observer, B);
4022     B.setInstr(MI);
4023 
4024     if (Ty.isVector())
4025       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4026     else
4027       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4028   }
4029 
4030   Observer.changedInstr(MI);
4031   return true;
4032 }
4033 
4034 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4035                                                 MachineRegisterInfo &MRI,
4036                                                 MachineIRBuilder &B) const {
4037   B.setInstr(MI);
4038 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4040   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4041       !ST.isTrapHandlerEnabled()) {
4042     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4043   } else {
4044     // Pass queue pointer to trap handler as input, and insert trap instruction
4045     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4046     const ArgDescriptor *Arg =
4047         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4048     if (!Arg)
4049       return false;
4050     MachineRegisterInfo &MRI = *B.getMRI();
4051     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4052     Register LiveIn = getLiveInRegister(
4053         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4054         /*InsertLiveInCopy=*/false);
4055     if (!loadInputValue(LiveIn, B, Arg))
4056       return false;
4057     B.buildCopy(SGPR01, LiveIn);
4058     B.buildInstr(AMDGPU::S_TRAP)
4059         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4060         .addReg(SGPR01, RegState::Implicit);
4061   }
4062 
4063   MI.eraseFromParent();
4064   return true;
4065 }
4066 
4067 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4068     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4069   B.setInstr(MI);
4070 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4073   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4074       !ST.isTrapHandlerEnabled()) {
4075     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4076                                      "debugtrap handler not supported",
4077                                      MI.getDebugLoc(), DS_Warning);
4078     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4079     Ctx.diagnose(NoTrap);
4080   } else {
4081     // Insert debug-trap instruction
4082     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4083   }
4084 
4085   MI.eraseFromParent();
4086   return true;
4087 }
4088 
4089 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4090                                             MachineIRBuilder &B,
4091                                             GISelChangeObserver &Observer) const {
4092   MachineRegisterInfo &MRI = *B.getMRI();
4093 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
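  // For the control flow intrinsics, the expected pattern is a G_INTRINSIC
  // feeding a G_BRCOND (optionally followed by an unconditional G_BR); the
  // pair is rewritten into SI_IF / SI_ELSE / SI_LOOP pseudos that take over
  // both the exec-mask update and the branch.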
4095   auto IntrID = MI.getIntrinsicID();
4096   switch (IntrID) {
4097   case Intrinsic::amdgcn_if:
4098   case Intrinsic::amdgcn_else: {
4099     MachineInstr *Br = nullptr;
4100     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4101       const SIRegisterInfo *TRI
4102         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4103 
4104       B.setInstr(*BrCond);
4105       Register Def = MI.getOperand(1).getReg();
4106       Register Use = MI.getOperand(3).getReg();
4107 
4108       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4109       if (Br)
4110         BrTarget = Br->getOperand(0).getMBB();
4111 
4112       if (IntrID == Intrinsic::amdgcn_if) {
4113         B.buildInstr(AMDGPU::SI_IF)
4114           .addDef(Def)
4115           .addUse(Use)
4116           .addMBB(BrTarget);
4117       } else {
4118         B.buildInstr(AMDGPU::SI_ELSE)
4119           .addDef(Def)
4120           .addUse(Use)
4121           .addMBB(BrTarget)
4122           .addImm(0);
4123       }
4124 
4125       if (Br)
4126         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4127 
4128       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4129       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4130       MI.eraseFromParent();
4131       BrCond->eraseFromParent();
4132       return true;
4133     }
4134 
4135     return false;
4136   }
4137   case Intrinsic::amdgcn_loop: {
4138     MachineInstr *Br = nullptr;
4139     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4140       const SIRegisterInfo *TRI
4141         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4142 
4143       B.setInstr(*BrCond);
4144 
4145       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4146       if (Br)
4147         BrTarget = Br->getOperand(0).getMBB();
4148 
4149       Register Reg = MI.getOperand(2).getReg();
4150       B.buildInstr(AMDGPU::SI_LOOP)
4151         .addUse(Reg)
4152         .addMBB(BrTarget);
4153 
4154       if (Br)
4155         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4156 
4157       MI.eraseFromParent();
4158       BrCond->eraseFromParent();
4159       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4160       return true;
4161     }
4162 
4163     return false;
4164   }
4165   case Intrinsic::amdgcn_kernarg_segment_ptr:
4166     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4167       B.setInstr(MI);
4168       // This only makes sense to call in a kernel, so just lower to null.
4169       B.buildConstant(MI.getOperand(0).getReg(), 0);
4170       MI.eraseFromParent();
4171       return true;
4172     }
4173 
4174     return legalizePreloadedArgIntrin(
4175       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4176   case Intrinsic::amdgcn_implicitarg_ptr:
4177     return legalizeImplicitArgPtr(MI, MRI, B);
4178   case Intrinsic::amdgcn_workitem_id_x:
4179     return legalizePreloadedArgIntrin(MI, MRI, B,
4180                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4181   case Intrinsic::amdgcn_workitem_id_y:
4182     return legalizePreloadedArgIntrin(MI, MRI, B,
4183                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4184   case Intrinsic::amdgcn_workitem_id_z:
4185     return legalizePreloadedArgIntrin(MI, MRI, B,
4186                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4187   case Intrinsic::amdgcn_workgroup_id_x:
4188     return legalizePreloadedArgIntrin(MI, MRI, B,
4189                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4190   case Intrinsic::amdgcn_workgroup_id_y:
4191     return legalizePreloadedArgIntrin(MI, MRI, B,
4192                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4193   case Intrinsic::amdgcn_workgroup_id_z:
4194     return legalizePreloadedArgIntrin(MI, MRI, B,
4195                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4196   case Intrinsic::amdgcn_dispatch_ptr:
4197     return legalizePreloadedArgIntrin(MI, MRI, B,
4198                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4199   case Intrinsic::amdgcn_queue_ptr:
4200     return legalizePreloadedArgIntrin(MI, MRI, B,
4201                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4202   case Intrinsic::amdgcn_implicit_buffer_ptr:
4203     return legalizePreloadedArgIntrin(
4204       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4205   case Intrinsic::amdgcn_dispatch_id:
4206     return legalizePreloadedArgIntrin(MI, MRI, B,
4207                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4208   case Intrinsic::amdgcn_fdiv_fast:
4209     return legalizeFDIVFastIntrin(MI, MRI, B);
4210   case Intrinsic::amdgcn_is_shared:
4211     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4212   case Intrinsic::amdgcn_is_private:
4213     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4214   case Intrinsic::amdgcn_wavefrontsize: {
4215     B.setInstr(MI);
4216     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4217     MI.eraseFromParent();
4218     return true;
4219   }
4220   case Intrinsic::amdgcn_s_buffer_load:
4221     return legalizeSBufferLoad(MI, B, Observer);
4222   case Intrinsic::amdgcn_raw_buffer_store:
4223   case Intrinsic::amdgcn_struct_buffer_store:
4224     return legalizeBufferStore(MI, MRI, B, false, false);
4225   case Intrinsic::amdgcn_raw_buffer_store_format:
4226   case Intrinsic::amdgcn_struct_buffer_store_format:
4227     return legalizeBufferStore(MI, MRI, B, false, true);
4228   case Intrinsic::amdgcn_raw_tbuffer_store:
4229   case Intrinsic::amdgcn_struct_tbuffer_store:
4230     return legalizeBufferStore(MI, MRI, B, true, true);
4231   case Intrinsic::amdgcn_raw_buffer_load:
4232   case Intrinsic::amdgcn_struct_buffer_load:
4233     return legalizeBufferLoad(MI, MRI, B, false, false);
4234   case Intrinsic::amdgcn_raw_buffer_load_format:
4235   case Intrinsic::amdgcn_struct_buffer_load_format:
4236     return legalizeBufferLoad(MI, MRI, B, true, false);
4237   case Intrinsic::amdgcn_raw_tbuffer_load:
4238   case Intrinsic::amdgcn_struct_tbuffer_load:
4239     return legalizeBufferLoad(MI, MRI, B, true, true);
4240   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4241   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4242   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4243   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4244   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4245   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4246   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4247   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4248   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4249   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4250   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4251   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4252   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4253   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4254   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4255   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4256   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4257   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4258   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4259   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4260   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4261   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4262   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4263   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4264   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4265   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4266     return legalizeBufferAtomic(MI, B, IntrID);
4267   case Intrinsic::amdgcn_atomic_inc:
4268     return legalizeAtomicIncDec(MI, B, true);
4269   case Intrinsic::amdgcn_atomic_dec:
4270     return legalizeAtomicIncDec(MI, B, false);
4271   case Intrinsic::trap:
4272     return legalizeTrapIntrinsic(MI, MRI, B);
4273   case Intrinsic::debugtrap:
4274     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4275   default: {
4276     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4277             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4278       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4279     return true;
4280   }
4281   }
4282 
4283   return true;
4284 }
4285