1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of vector elements up to the next power of two.
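// For example, <3 x s16> becomes <4 x s16>.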
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the scalar size in bits up to the next power of two.
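// For example, s48 becomes s64.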
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
53 static LegalityPredicate isMultiple32(unsigned TypeIdx,
54                                       unsigned MaxSize = 1024) {
55   return [=](const LegalityQuery &Query) {
56     const LLT Ty = Query.Types[TypeIdx];
57     const LLT EltTy = Ty.getScalarType();
58     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
59   };
60 }
61 
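// True for vectors with an odd number of sub-32-bit elements whose total
// size is not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.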
62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
63   return [=](const LegalityQuery &Query) {
64     const LLT Ty = Query.Types[TypeIdx];
65     return Ty.isVector() &&
66            Ty.getNumElements() % 2 != 0 &&
67            Ty.getElementType().getSizeInBits() < 32 &&
68            Ty.getSizeInBits() % 32 != 0;
69   };
70 }
71 
72 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     const LLT EltTy = Ty.getScalarType();
76     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
77   };
78 }
79 
80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getElementType();
84     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
85   };
86 }
87 
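// Mutation used with fewerElementsIf to break a wide vector into roughly
// 64-bit pieces; e.g. v4s32 becomes v2s32.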
88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     unsigned Size = Ty.getSizeInBits();
93     unsigned Pieces = (Size + 63) / 64;
94     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
95     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
96   };
97 }
98 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
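// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).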
101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104 
105     const LLT EltTy = Ty.getElementType();
106     const int Size = Ty.getSizeInBits();
107     const int EltSize = EltTy.getSizeInBits();
108     const int NextMul32 = (Size + 31) / 32;
109 
110     assert(EltSize < 32);
111 
112     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
113     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
114   };
115 }
116 
117 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
118   return [=](const LegalityQuery &Query) {
119     const LLT QueryTy = Query.Types[TypeIdx];
120     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
121   };
122 }
123 
124 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
125   return [=](const LegalityQuery &Query) {
126     const LLT QueryTy = Query.Types[TypeIdx];
127     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
128   };
129 }
130 
131 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
132   return [=](const LegalityQuery &Query) {
133     const LLT QueryTy = Query.Types[TypeIdx];
134     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
135   };
136 }
137 
// Any vector of 32, 64, 128 or 256-bit elements, any vector with an even
// number of 16-bit elements, and any scalar that is a multiple of 32 bits up
// to 1024 bits.
140 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
141   return [=](const LegalityQuery &Query) {
142     const LLT Ty = Query.Types[TypeIdx];
143     if (Ty.isVector()) {
144       const int EltSize = Ty.getElementType().getSizeInBits();
145       return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
147              EltSize == 128 || EltSize == 256;
148     }
149 
150     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
151   };
152 }
153 
154 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT QueryTy = Query.Types[TypeIdx];
157     if (!QueryTy.isVector())
158       return false;
159     const LLT EltTy = QueryTy.getElementType();
160     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
161   };
162 }
163 
164 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
165   return [=](const LegalityQuery &Query) {
166     const LLT Ty = Query.Types[TypeIdx];
167     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
168            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
169   };
170 }
171 
172 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
173                                          const GCNTargetMachine &TM)
  : ST(ST_) {
175   using namespace TargetOpcode;
176 
177   auto GetAddrSpacePtr = [&TM](unsigned AS) {
178     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
179   };
180 
181   const LLT S1 = LLT::scalar(1);
182   const LLT S16 = LLT::scalar(16);
183   const LLT S32 = LLT::scalar(32);
184   const LLT S64 = LLT::scalar(64);
185   const LLT S128 = LLT::scalar(128);
186   const LLT S256 = LLT::scalar(256);
187   const LLT S512 = LLT::scalar(512);
188   const LLT S1024 = LLT::scalar(1024);
189 
190   const LLT V2S16 = LLT::vector(2, 16);
191   const LLT V4S16 = LLT::vector(4, 16);
192 
193   const LLT V2S32 = LLT::vector(2, 32);
194   const LLT V3S32 = LLT::vector(3, 32);
195   const LLT V4S32 = LLT::vector(4, 32);
196   const LLT V5S32 = LLT::vector(5, 32);
197   const LLT V6S32 = LLT::vector(6, 32);
198   const LLT V7S32 = LLT::vector(7, 32);
199   const LLT V8S32 = LLT::vector(8, 32);
200   const LLT V9S32 = LLT::vector(9, 32);
201   const LLT V10S32 = LLT::vector(10, 32);
202   const LLT V11S32 = LLT::vector(11, 32);
203   const LLT V12S32 = LLT::vector(12, 32);
204   const LLT V13S32 = LLT::vector(13, 32);
205   const LLT V14S32 = LLT::vector(14, 32);
206   const LLT V15S32 = LLT::vector(15, 32);
207   const LLT V16S32 = LLT::vector(16, 32);
208   const LLT V32S32 = LLT::vector(32, 32);
209 
210   const LLT V2S64 = LLT::vector(2, 64);
211   const LLT V3S64 = LLT::vector(3, 64);
212   const LLT V4S64 = LLT::vector(4, 64);
213   const LLT V5S64 = LLT::vector(5, 64);
214   const LLT V6S64 = LLT::vector(6, 64);
215   const LLT V7S64 = LLT::vector(7, 64);
216   const LLT V8S64 = LLT::vector(8, 64);
217   const LLT V16S64 = LLT::vector(16, 64);
218 
219   std::initializer_list<LLT> AllS32Vectors =
220     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
221      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
222   std::initializer_list<LLT> AllS64Vectors =
223     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
224 
225   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
226   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
227   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
228   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
229   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
230   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
231   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
232 
233   const LLT CodePtr = FlatPtr;
234 
235   const std::initializer_list<LLT> AddrSpaces64 = {
236     GlobalPtr, ConstantPtr, FlatPtr
237   };
238 
239   const std::initializer_list<LLT> AddrSpaces32 = {
240     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
241   };
242 
243   const std::initializer_list<LLT> FPTypesBase = {
244     S32, S64
245   };
246 
247   const std::initializer_list<LLT> FPTypes16 = {
248     S32, S64, S16
249   };
250 
251   const std::initializer_list<LLT> FPTypesPK16 = {
252     S32, S64, S16, V2S16
253   };
254 
255   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
256 
257   setAction({G_BRCOND, S1}, Legal); // VCC branches
258   setAction({G_BRCOND, S32}, Legal); // SCC branches
259 
260   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
261   // elements for v3s16
262   getActionDefinitionsBuilder(G_PHI)
263     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
264     .legalFor(AllS32Vectors)
265     .legalFor(AllS64Vectors)
266     .legalFor(AddrSpaces64)
267     .legalFor(AddrSpaces32)
268     .clampScalar(0, S32, S256)
269     .widenScalarToNextPow2(0, 32)
270     .clampMaxNumElements(0, S32, 16)
271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
272     .legalIf(isPointer(0));
273 
274   if (ST.hasVOP3PInsts()) {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32, S16, V2S16})
277       .clampScalar(0, S16, S32)
278       .clampMaxNumElements(0, S16, 2)
279       .scalarize(0)
280       .widenScalarToNextPow2(0, 32);
281   } else if (ST.has16BitInsts()) {
282     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
283       .legalFor({S32, S16})
284       .clampScalar(0, S16, S32)
285       .scalarize(0)
286       .widenScalarToNextPow2(0, 32);
287   } else {
288     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
289       .legalFor({S32})
290       .clampScalar(0, S32, S32)
291       .scalarize(0);
292   }
293 
294   // FIXME: Not really legal. Placeholder for custom lowering.
295   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
296     .customFor({S32, S64})
297     .clampScalar(0, S32, S64)
298     .widenScalarToNextPow2(0, 32)
299     .scalarize(0);
300 
301   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
302     .legalFor({S32})
303     .clampScalar(0, S32, S32)
304     .scalarize(0);
305 
306   // Report legal for any types we can handle anywhere. For the cases only legal
307   // on the SALU, RegBankSelect will be able to re-legalize.
308   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
309     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
310     .clampScalar(0, S32, S64)
311     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
312     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
313     .widenScalarToNextPow2(0)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
317                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
318     .legalFor({{S32, S1}, {S32, S32}})
319     .minScalar(0, S32)
320     // TODO: .scalarize(0)
321     .lower();
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
341       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344       .clampScalarOrElt(0, S32, S1024)
345       .legalIf(isMultiple32(0))
346       .widenScalarToNextPow2(0, 32)
347       .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350 
351   // If the amount is divergent, we have to do a wave reduction to get the
352   // maximum value, so this is expanded during RegBankSelect.
353   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
354     .legalFor({{PrivatePtr, S32}});
355 
356   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
357     .unsupportedFor({PrivatePtr})
358     .custom();
359   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
360 
361   auto &FPOpActions = getActionDefinitionsBuilder(
362     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
363     .legalFor({S32, S64});
364   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
365     .customFor({S32, S64});
366   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
367     .customFor({S32, S64});
368 
369   if (ST.has16BitInsts()) {
370     if (ST.hasVOP3PInsts())
371       FPOpActions.legalFor({S16, V2S16});
372     else
373       FPOpActions.legalFor({S16});
374 
375     TrigActions.customFor({S16});
376     FDIVActions.customFor({S16});
377   }
378 
379   auto &MinNumMaxNum = getActionDefinitionsBuilder({
380       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
381 
382   if (ST.hasVOP3PInsts()) {
383     MinNumMaxNum.customFor(FPTypesPK16)
384       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
385       .clampMaxNumElements(0, S16, 2)
386       .clampScalar(0, S16, S64)
387       .scalarize(0);
388   } else if (ST.has16BitInsts()) {
389     MinNumMaxNum.customFor(FPTypes16)
390       .clampScalar(0, S16, S64)
391       .scalarize(0);
392   } else {
393     MinNumMaxNum.customFor(FPTypesBase)
394       .clampScalar(0, S32, S64)
395       .scalarize(0);
396   }
397 
398   if (ST.hasVOP3PInsts())
399     FPOpActions.clampMaxNumElements(0, S16, 2);
400 
401   FPOpActions
402     .scalarize(0)
403     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
404 
405   TrigActions
406     .scalarize(0)
407     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
408 
409   FDIVActions
410     .scalarize(0)
411     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
412 
413   getActionDefinitionsBuilder({G_FNEG, G_FABS})
414     .legalFor(FPTypesPK16)
415     .clampMaxNumElements(0, S16, 2)
416     .scalarize(0)
417     .clampScalar(0, S16, S64);
418 
419   if (ST.has16BitInsts()) {
420     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
421       .legalFor({S32, S64, S16})
422       .scalarize(0)
423       .clampScalar(0, S16, S64);
424   } else {
425     getActionDefinitionsBuilder(G_FSQRT)
426       .legalFor({S32, S64})
427       .scalarize(0)
428       .clampScalar(0, S32, S64);
429 
430     if (ST.hasFractBug()) {
431       getActionDefinitionsBuilder(G_FFLOOR)
432         .customFor({S64})
433         .legalFor({S32, S64})
434         .scalarize(0)
435         .clampScalar(0, S32, S64);
436     } else {
437       getActionDefinitionsBuilder(G_FFLOOR)
438         .legalFor({S32, S64})
439         .scalarize(0)
440         .clampScalar(0, S32, S64);
441     }
442   }
443 
444   getActionDefinitionsBuilder(G_FPTRUNC)
445     .legalFor({{S32, S64}, {S16, S32}})
446     .scalarize(0)
447     .lower();
448 
449   getActionDefinitionsBuilder(G_FPEXT)
450     .legalFor({{S64, S32}, {S32, S16}})
451     .lowerFor({{S64, S16}}) // FIXME: Implement
452     .scalarize(0);
453 
454   getActionDefinitionsBuilder(G_FSUB)
455       // Use actual fsub instruction
456       .legalFor({S32})
457       // Must use fadd + fneg
458       .lowerFor({S64, S16, V2S16})
459       .scalarize(0)
460       .clampScalar(0, S32, S64);
461 
462   // Whether this is legal depends on the floating point mode for the function.
463   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
464   if (ST.hasMadF16())
465     FMad.customFor({S32, S16});
466   else
467     FMad.customFor({S32});
468   FMad.scalarize(0)
469       .lower();
470 
471   // TODO: Do we need to clamp maximum bitwidth?
472   getActionDefinitionsBuilder(G_TRUNC)
473     .legalIf(isScalar(0))
474     .legalFor({{V2S16, V2S32}})
475     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want the legalizer to
    // loop forever.
479     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
480     .alwaysLegal();
481 
482   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
483     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
484                {S32, S1}, {S64, S1}, {S16, S1}})
485     .scalarize(0)
486     .clampScalar(0, S32, S64)
487     .widenScalarToNextPow2(1, 32);
488 
489   // TODO: Split s1->s64 during regbankselect for VALU.
490   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
491     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
492     .lowerFor({{S32, S64}})
493     .lowerIf(typeIs(1, S1))
494     .customFor({{S64, S64}});
495   if (ST.has16BitInsts())
496     IToFP.legalFor({{S16, S16}});
497   IToFP.clampScalar(1, S32, S64)
498        .scalarize(0)
499        .widenScalarToNextPow2(1);
500 
501   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
502     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
503     .customFor({{S64, S64}});
504   if (ST.has16BitInsts())
505     FPToI.legalFor({{S16, S16}});
506   else
507     FPToI.minScalar(1, S32);
508 
509   FPToI.minScalar(0, S32)
510        .scalarize(0)
511        .lower();
512 
513   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
514     .scalarize(0)
515     .lower();
516 
517   if (ST.has16BitInsts()) {
518     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
519       .legalFor({S16, S32, S64})
520       .clampScalar(0, S16, S64)
521       .scalarize(0);
522   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
523     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
524       .legalFor({S32, S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   } else {
528     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
529       .legalFor({S32})
530       .customFor({S64})
531       .clampScalar(0, S32, S64)
532       .scalarize(0);
533   }
534 
535   // FIXME: Clamp offset operand.
536   getActionDefinitionsBuilder(G_PTR_ADD)
537     .legalIf(isPointer(0))
538     .scalarize(0);
539 
540   getActionDefinitionsBuilder(G_PTRMASK)
541     .legalIf(typeInSet(1, {S64, S32}))
542     .minScalar(1, S32)
543     .maxScalarIf(sizeIs(0, 32), 1, S32)
544     .maxScalarIf(sizeIs(0, 64), 1, S64)
545     .scalarize(0);
546 
547   auto &CmpBuilder =
548     getActionDefinitionsBuilder(G_ICMP)
549     // The compare output type differs based on the register bank of the output,
550     // so make both s1 and s32 legal.
551     //
552     // Scalar compares producing output in scc will be promoted to s32, as that
553     // is the allocatable register type that will be needed for the copy from
554     // scc. This will be promoted during RegBankSelect, and we assume something
555     // before that won't try to use s32 result types.
556     //
557     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
558     // bank.
559     .legalForCartesianProduct(
560       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
561     .legalForCartesianProduct(
562       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
563   if (ST.has16BitInsts()) {
564     CmpBuilder.legalFor({{S1, S16}});
565   }
566 
567   CmpBuilder
568     .widenScalarToNextPow2(1)
569     .clampScalar(1, S32, S64)
570     .scalarize(0)
571     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
572 
573   getActionDefinitionsBuilder(G_FCMP)
574     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
575     .widenScalarToNextPow2(1)
576     .clampScalar(1, S32, S64)
577     .scalarize(0);
578 
579   // FIXME: fpow has a selection pattern that should move to custom lowering.
580   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
581   if (ST.has16BitInsts())
582     Exp2Ops.legalFor({S32, S16});
583   else
584     Exp2Ops.legalFor({S32});
585   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
586   Exp2Ops.scalarize(0);
587 
588   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
589   if (ST.has16BitInsts())
590     ExpOps.customFor({{S32}, {S16}});
591   else
592     ExpOps.customFor({S32});
593   ExpOps.clampScalar(0, MinScalarFPTy, S32)
594         .scalarize(0);
595 
596   // The 64-bit versions produce 32-bit results, but only on the SALU.
597   getActionDefinitionsBuilder(G_CTPOP)
598     .legalFor({{S32, S32}, {S32, S64}})
599     .clampScalar(0, S32, S32)
600     .clampScalar(1, S32, S64)
601     .scalarize(0)
602     .widenScalarToNextPow2(0, 32)
603     .widenScalarToNextPow2(1, 32);
604 
605   // The hardware instructions return a different result on 0 than the generic
606   // instructions expect. The hardware produces -1, but these produce the
607   // bitwidth.
608   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
609     .scalarize(0)
610     .clampScalar(0, S32, S32)
611     .clampScalar(1, S32, S64)
612     .widenScalarToNextPow2(0, 32)
613     .widenScalarToNextPow2(1, 32)
614     .lower();
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   getActionDefinitionsBuilder(G_BITREVERSE)
626     .legalFor({S32})
627     .clampScalar(0, S32, S32)
628     .scalarize(0);
629 
630   if (ST.has16BitInsts()) {
631     getActionDefinitionsBuilder(G_BSWAP)
632       .legalFor({S16, S32, V2S16})
633       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
636       .widenScalarToNextPow2(0)
637       .clampScalar(0, S16, S32)
638       .scalarize(0);
639 
640     if (ST.hasVOP3PInsts()) {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16, V2S16})
643         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644         .clampMaxNumElements(0, S16, 2)
645         .minScalar(0, S16)
646         .widenScalarToNextPow2(0)
647         .scalarize(0)
648         .lower();
649     } else {
650       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
651         .legalFor({S32, S16})
652         .widenScalarToNextPow2(0)
653         .minScalar(0, S16)
654         .scalarize(0)
655         .lower();
656     }
657   } else {
658     // TODO: Should have same legality without v_perm_b32
659     getActionDefinitionsBuilder(G_BSWAP)
660       .legalFor({S32})
661       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
664       .widenScalarToNextPow2(0)
665       .maxScalar(0, S32)
666       .scalarize(0)
667       .lower();
668 
669     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670       .legalFor({S32})
671       .minScalar(0, S32)
672       .widenScalarToNextPow2(0)
673       .scalarize(0)
674       .lower();
675   }
676 
677   getActionDefinitionsBuilder(G_INTTOPTR)
678     // List the common cases
679     .legalForCartesianProduct(AddrSpaces64, {S64})
680     .legalForCartesianProduct(AddrSpaces32, {S32})
681     .scalarize(0)
682     // Accept any address space as long as the size matches
683     .legalIf(sameSize(0, 1))
684     .widenScalarIf(smallerThan(1, 0),
685       [](const LegalityQuery &Query) {
686         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
687       })
688     .narrowScalarIf(largerThan(1, 0),
689       [](const LegalityQuery &Query) {
690         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
691       });
692 
693   getActionDefinitionsBuilder(G_PTRTOINT)
694     // List the common cases
695     .legalForCartesianProduct(AddrSpaces64, {S64})
696     .legalForCartesianProduct(AddrSpaces32, {S32})
697     .scalarize(0)
698     // Accept any address space as long as the size matches
699     .legalIf(sameSize(0, 1))
700     .widenScalarIf(smallerThan(0, 1),
701       [](const LegalityQuery &Query) {
702         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
703       })
704     .narrowScalarIf(
705       largerThan(0, 1),
706       [](const LegalityQuery &Query) {
707         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
708       });
709 
710   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
711     .scalarize(0)
712     .custom();
713 
  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
715   // handle some operations by just promoting the register during
716   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
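  // Maximum size in bits of a single load/store the given address space can
  // handle; wider accesses are split by the rules below.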
717   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
718     switch (AS) {
719     // FIXME: Private element size.
720     case AMDGPUAS::PRIVATE_ADDRESS:
721       return 32;
722     // FIXME: Check subtarget
723     case AMDGPUAS::LOCAL_ADDRESS:
724       return ST.useDS128() ? 128 : 64;
725 
726     // Treat constant and global as identical. SMRD loads are sometimes usable
727     // for global loads (ideally constant address space should be eliminated)
728     // depending on the context. Legality cannot be context dependent, but
729     // RegBankSelect can split the load as necessary depending on the pointer
730     // register bank/uniformity and if the memory is invariant or not written in
731     // a kernel.
732     case AMDGPUAS::CONSTANT_ADDRESS:
733     case AMDGPUAS::GLOBAL_ADDRESS:
734       return IsLoad ? 512 : 128;
735     default:
736       return 128;
737     }
738   };
739 
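  // Return true if a load or store of this type must be split into multiple
  // accesses (e.g. a vector extload, an access wider than the address space
  // limit, or a misaligned access the target cannot handle).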
740   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
741                                     bool IsLoad) -> bool {
742     const LLT DstTy = Query.Types[0];
743 
744     // Split vector extloads.
745     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
746     unsigned Align = Query.MMODescrs[0].AlignInBits;
747 
748     if (MemSize < DstTy.getSizeInBits())
749       MemSize = std::max(MemSize, Align);
750 
751     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
752       return true;
753 
754     const LLT PtrTy = Query.Types[1];
755     unsigned AS = PtrTy.getAddressSpace();
756     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
757       return true;
758 
759     // Catch weird sized loads that don't evenly divide into the access sizes
760     // TODO: May be able to widen depending on alignment etc.
761     unsigned NumRegs = (MemSize + 31) / 32;
762     if (NumRegs == 3) {
763       if (!ST.hasDwordx3LoadStores())
764         return true;
765     } else {
766       // If the alignment allows, these should have been widened.
767       if (!isPowerOf2_32(NumRegs))
768         return true;
769     }
770 
771     if (Align < MemSize) {
772       const SITargetLowering *TLI = ST.getTargetLowering();
773       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
774     }
775 
776     return false;
777   };
778 
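  // Return true if a non-power-of-2 sized load should be widened to the next
  // power of 2; the alignment must already cover the rounded-up size.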
779   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
780     unsigned Size = Query.Types[0].getSizeInBits();
781     if (isPowerOf2_32(Size))
782       return false;
783 
784     if (Size == 96 && ST.hasDwordx3LoadStores())
785       return false;
786 
787     unsigned AddrSpace = Query.Types[1].getAddressSpace();
788     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
789       return false;
790 
791     unsigned Align = Query.MMODescrs[0].AlignInBits;
792     unsigned RoundedSize = NextPowerOf2(Size);
793     return (Align >= RoundedSize);
794   };
795 
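  // A required alignment of 0 accepts any alignment, so targets with
  // unaligned buffer access place no alignment restriction on these cases.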
796   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
797   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
798   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
799 
800   // TODO: Refine based on subtargets which support unaligned access or 128-bit
801   // LDS
802   // TODO: Unsupported flat for SI.
803 
804   for (unsigned Op : {G_LOAD, G_STORE}) {
805     const bool IsStore = Op == G_STORE;
806 
807     auto &Actions = getActionDefinitionsBuilder(Op);
808     // Whitelist the common cases.
809     // TODO: Loads to s16 on gfx9
810     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
811                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
812                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
813                                       {S128, GlobalPtr, 128, GlobalAlign32},
814                                       {S64, GlobalPtr, 64, GlobalAlign32},
815                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
816                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
817                                       {S32, GlobalPtr, 8, GlobalAlign8},
818                                       {S32, GlobalPtr, 16, GlobalAlign16},
819 
820                                       {S32, LocalPtr, 32, 32},
821                                       {S64, LocalPtr, 64, 32},
822                                       {V2S32, LocalPtr, 64, 32},
823                                       {S32, LocalPtr, 8, 8},
824                                       {S32, LocalPtr, 16, 16},
825                                       {V2S16, LocalPtr, 32, 32},
826 
827                                       {S32, PrivatePtr, 32, 32},
828                                       {S32, PrivatePtr, 8, 8},
829                                       {S32, PrivatePtr, 16, 16},
830                                       {V2S16, PrivatePtr, 32, 32},
831 
832                                       {S32, FlatPtr, 32, GlobalAlign32},
833                                       {S32, FlatPtr, 16, GlobalAlign16},
834                                       {S32, FlatPtr, 8, GlobalAlign8},
835                                       {V2S16, FlatPtr, 32, GlobalAlign32},
836 
837                                       {S32, ConstantPtr, 32, GlobalAlign32},
838                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
839                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
840                                       {S64, ConstantPtr, 64, GlobalAlign32},
841                                       {S128, ConstantPtr, 128, GlobalAlign32},
842                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
843     Actions
844         .customIf(typeIs(1, Constant32Ptr))
845         // Widen suitably aligned loads by loading extra elements.
846         .moreElementsIf([=](const LegalityQuery &Query) {
847             const LLT Ty = Query.Types[0];
848             return Op == G_LOAD && Ty.isVector() &&
849                    shouldWidenLoadResult(Query);
850           }, moreElementsToNextPow2(0))
851         .widenScalarIf([=](const LegalityQuery &Query) {
852             const LLT Ty = Query.Types[0];
853             return Op == G_LOAD && !Ty.isVector() &&
854                    shouldWidenLoadResult(Query);
855           }, widenScalarOrEltToNextPow2(0))
856         .narrowScalarIf(
857             [=](const LegalityQuery &Query) -> bool {
858               return !Query.Types[0].isVector() &&
859                      needToSplitMemOp(Query, Op == G_LOAD);
860             },
861             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
862               const LLT DstTy = Query.Types[0];
863               const LLT PtrTy = Query.Types[1];
864 
865               const unsigned DstSize = DstTy.getSizeInBits();
866               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
867 
868               // Split extloads.
869               if (DstSize > MemSize)
870                 return std::make_pair(0, LLT::scalar(MemSize));
871 
872               if (!isPowerOf2_32(DstSize)) {
873                 // We're probably decomposing an odd sized store. Try to split
874                 // to the widest type. TODO: Account for alignment. As-is it
875                 // should be OK, since the new parts will be further legalized.
876                 unsigned FloorSize = PowerOf2Floor(DstSize);
877                 return std::make_pair(0, LLT::scalar(FloorSize));
878               }
879 
880               if (DstSize > 32 && (DstSize % 32 != 0)) {
881                 // FIXME: Need a way to specify non-extload of larger size if
882                 // suitably aligned.
883                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
884               }
885 
886               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
887                                                      Op == G_LOAD);
888               if (MemSize > MaxSize)
889                 return std::make_pair(0, LLT::scalar(MaxSize));
890 
891               unsigned Align = Query.MMODescrs[0].AlignInBits;
892               return std::make_pair(0, LLT::scalar(Align));
893             })
894         .fewerElementsIf(
895             [=](const LegalityQuery &Query) -> bool {
896               return Query.Types[0].isVector() &&
897                      needToSplitMemOp(Query, Op == G_LOAD);
898             },
899             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
900               const LLT DstTy = Query.Types[0];
901               const LLT PtrTy = Query.Types[1];
902 
903               LLT EltTy = DstTy.getElementType();
904               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905                                                      Op == G_LOAD);
906 
907               // FIXME: Handle widened to power of 2 results better. This ends
908               // up scalarizing.
909               // FIXME: 3 element stores scalarized on SI
910 
911               // Split if it's too large for the address space.
912               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
913                 unsigned NumElts = DstTy.getNumElements();
914                 unsigned EltSize = EltTy.getSizeInBits();
915 
916                 if (MaxSize % EltSize == 0) {
917                   return std::make_pair(
918                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
919                 }
920 
921                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
922 
923                 // FIXME: Refine when odd breakdowns handled
924                 // The scalars will need to be re-legalized.
925                 if (NumPieces == 1 || NumPieces >= NumElts ||
926                     NumElts % NumPieces != 0)
927                   return std::make_pair(0, EltTy);
928 
929                 return std::make_pair(0,
930                                       LLT::vector(NumElts / NumPieces, EltTy));
931               }
932 
933               // FIXME: We could probably handle weird extending loads better.
934               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
935               if (DstTy.getSizeInBits() > MemSize)
936                 return std::make_pair(0, EltTy);
937 
938               unsigned EltSize = EltTy.getSizeInBits();
939               unsigned DstSize = DstTy.getSizeInBits();
940               if (!isPowerOf2_32(DstSize)) {
941                 // We're probably decomposing an odd sized store. Try to split
942                 // to the widest type. TODO: Account for alignment. As-is it
943                 // should be OK, since the new parts will be further legalized.
944                 unsigned FloorSize = PowerOf2Floor(DstSize);
945                 return std::make_pair(
946                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
947               }
948 
949               // Need to split because of alignment.
950               unsigned Align = Query.MMODescrs[0].AlignInBits;
951               if (EltSize > Align &&
952                   (EltSize / Align < DstTy.getNumElements())) {
953                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
954               }
955 
956               // May need relegalization for the scalars.
957               return std::make_pair(0, EltTy);
958             })
959         .minScalar(0, S32);
960 
961     if (IsStore)
962       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
963 
964     // TODO: Need a bitcast lower option?
965     Actions
966         .legalIf([=](const LegalityQuery &Query) {
967           const LLT Ty0 = Query.Types[0];
968           unsigned Size = Ty0.getSizeInBits();
969           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
970           unsigned Align = Query.MMODescrs[0].AlignInBits;
971 
972           // FIXME: Widening store from alignment not valid.
973           if (MemSize < Size)
974             MemSize = std::max(MemSize, Align);
975 
976           // No extending vector loads.
977           if (Size > MemSize && Ty0.isVector())
978             return false;
979 
980           switch (MemSize) {
981           case 8:
982           case 16:
983             return Size == 32;
984           case 32:
985           case 64:
986           case 128:
987             return true;
988           case 96:
989             return ST.hasDwordx3LoadStores();
990           case 256:
991           case 512:
992             return true;
993           default:
994             return false;
995           }
996         })
997         .widenScalarToNextPow2(0)
998         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
999   }
1000 
1001   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1002                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1003                                                   {S32, GlobalPtr, 16, 2 * 8},
1004                                                   {S32, LocalPtr, 8, 8},
1005                                                   {S32, LocalPtr, 16, 16},
1006                                                   {S32, PrivatePtr, 8, 8},
1007                                                   {S32, PrivatePtr, 16, 16},
1008                                                   {S32, ConstantPtr, 8, 8},
1009                                                   {S32, ConstantPtr, 16, 2 * 8}});
1010   if (ST.hasFlatAddressSpace()) {
1011     ExtLoads.legalForTypesWithMemDesc(
1012         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1013   }
1014 
1015   ExtLoads.clampScalar(0, S32, S32)
1016           .widenScalarToNextPow2(0)
1017           .unsupportedIfMemSizeNotPow2()
1018           .lower();
1019 
1020   auto &Atomics = getActionDefinitionsBuilder(
1021     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1022      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1023      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1024      G_ATOMICRMW_UMIN})
1025     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1026                {S64, GlobalPtr}, {S64, LocalPtr}});
1027   if (ST.hasFlatAddressSpace()) {
1028     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1029   }
1030 
1031   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1032     .legalFor({{S32, LocalPtr}});
1033 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1036   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1037     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1038                 {S32, FlatPtr}, {S64, FlatPtr}})
1039     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1040                {S32, RegionPtr}, {S64, RegionPtr}});
1041   // TODO: Pointer types, any 32-bit or 64-bit vector
1042 
1043   // Condition should be s32 for scalar, s1 for vector.
1044   getActionDefinitionsBuilder(G_SELECT)
1045     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1046           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1047           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1048     .clampScalar(0, S16, S64)
1049     .scalarize(1)
1050     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1051     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1052     .clampMaxNumElements(0, S32, 2)
1053     .clampMaxNumElements(0, LocalPtr, 2)
1054     .clampMaxNumElements(0, PrivatePtr, 2)
1055     .scalarize(0)
1056     .widenScalarToNextPow2(0)
1057     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1058 
1059   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1060   // be more flexible with the shift amount type.
1061   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1062     .legalFor({{S32, S32}, {S64, S32}});
1063   if (ST.has16BitInsts()) {
1064     if (ST.hasVOP3PInsts()) {
1065       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1066             .clampMaxNumElements(0, S16, 2);
1067     } else
1068       Shifts.legalFor({{S16, S16}});
1069 
1070     // TODO: Support 16-bit shift amounts for all types
1071     Shifts.widenScalarIf(
1072       [=](const LegalityQuery &Query) {
1073         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1074         // 32-bit amount.
1075         const LLT ValTy = Query.Types[0];
1076         const LLT AmountTy = Query.Types[1];
1077         return ValTy.getSizeInBits() <= 16 &&
1078                AmountTy.getSizeInBits() < 16;
1079       }, changeTo(1, S16));
1080     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1081     Shifts.clampScalar(1, S32, S32);
1082     Shifts.clampScalar(0, S16, S64);
1083     Shifts.widenScalarToNextPow2(0, 16);
1084   } else {
1085     // Make sure we legalize the shift amount type first, as the general
1086     // expansion for the shifted type will produce much worse code if it hasn't
1087     // been truncated already.
1088     Shifts.clampScalar(1, S32, S32);
1089     Shifts.clampScalar(0, S32, S64);
1090     Shifts.widenScalarToNextPow2(0, 32);
1091   }
1092   Shifts.scalarize(0);
1093 
1094   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1095     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1096     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1097     unsigned IdxTypeIdx = 2;
1098 
1099     getActionDefinitionsBuilder(Op)
1100       .customIf([=](const LegalityQuery &Query) {
1101           const LLT EltTy = Query.Types[EltTypeIdx];
1102           const LLT VecTy = Query.Types[VecTypeIdx];
1103           const LLT IdxTy = Query.Types[IdxTypeIdx];
1104           return (EltTy.getSizeInBits() == 16 ||
1105                   EltTy.getSizeInBits() % 32 == 0) &&
1106                  VecTy.getSizeInBits() % 32 == 0 &&
1107                  VecTy.getSizeInBits() <= 1024 &&
1108                  IdxTy.getSizeInBits() == 32;
1109         })
1110       .clampScalar(EltTypeIdx, S32, S64)
1111       .clampScalar(VecTypeIdx, S32, S64)
1112       .clampScalar(IdxTypeIdx, S32, S32);
1113   }
1114 
1115   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1116     .unsupportedIf([=](const LegalityQuery &Query) {
1117         const LLT &EltTy = Query.Types[1].getElementType();
1118         return Query.Types[0] != EltTy;
1119       });
1120 
1121   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1122     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1123     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1124 
1125     // FIXME: Doesn't handle extract of illegal sizes.
1126     getActionDefinitionsBuilder(Op)
1127       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1128       // FIXME: Multiples of 16 should not be legal.
1129       .legalIf([=](const LegalityQuery &Query) {
1130           const LLT BigTy = Query.Types[BigTyIdx];
1131           const LLT LitTy = Query.Types[LitTyIdx];
1132           return (BigTy.getSizeInBits() % 32 == 0) &&
1133                  (LitTy.getSizeInBits() % 16 == 0);
1134         })
1135       .widenScalarIf(
1136         [=](const LegalityQuery &Query) {
1137           const LLT BigTy = Query.Types[BigTyIdx];
1138           return (BigTy.getScalarSizeInBits() < 16);
1139         },
1140         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1141       .widenScalarIf(
1142         [=](const LegalityQuery &Query) {
1143           const LLT LitTy = Query.Types[LitTyIdx];
1144           return (LitTy.getScalarSizeInBits() < 16);
1145         },
1146         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1147       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1148       .widenScalarToNextPow2(BigTyIdx, 32);
1149 
1150   }
1151 
1152   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1153     .legalForCartesianProduct(AllS32Vectors, {S32})
1154     .legalForCartesianProduct(AllS64Vectors, {S64})
1155     .clampNumElements(0, V16S32, V32S32)
1156     .clampNumElements(0, V2S64, V16S64)
1157     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1158 
1159   if (ST.hasScalarPackInsts()) {
1160     BuildVector
1161       // FIXME: Should probably widen s1 vectors straight to s32
1162       .minScalarOrElt(0, S16)
1163       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1164       .minScalar(1, S32);
1165 
1166     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1167       .legalFor({V2S16, S32})
1168       .lower();
1169     BuildVector.minScalarOrElt(0, S32);
1170   } else {
1171     BuildVector.customFor({V2S16, S16});
1172     BuildVector.minScalarOrElt(0, S32);
1173 
1174     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1175       .customFor({V2S16, S32})
1176       .lower();
1177   }
1178 
1179   BuildVector.legalIf(isRegisterType(0));
1180 
1181   // FIXME: Clamp maximum size
1182   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1183     .legalIf(isRegisterType(0));
1184 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1187   if (ST.hasVOP3PInsts()) {
1188     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1189       .customFor({V2S16, V2S16})
1190       .lower();
1191   } else
1192     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1193 
1194   // Merge/Unmerge
1195   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1196     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1197     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1198 
1199     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1200       const LLT Ty = Query.Types[TypeIdx];
1201       if (Ty.isVector()) {
1202         const LLT &EltTy = Ty.getElementType();
1203         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1204           return true;
1205         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1206           return true;
1207       }
1208       return false;
1209     };
1210 
1211     auto &Builder = getActionDefinitionsBuilder(Op)
1212       .lowerFor({{S16, V2S16}})
1213       .lowerIf([=](const LegalityQuery &Query) {
1214           const LLT BigTy = Query.Types[BigTyIdx];
1215           return BigTy.getSizeInBits() == 32;
1216         })
1217       // Try to widen to s16 first for small types.
1218       // TODO: Only do this on targets with legal s16 shifts
1219       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1220       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1221       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1222       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1223                            elementTypeIs(1, S16)),
1224                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1226       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1227       // valid.
1228       .clampScalar(LitTyIdx, S32, S512)
1229       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1230       // Break up vectors with weird elements into scalars
1231       .fewerElementsIf(
1232         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1233         scalarize(0))
1234       .fewerElementsIf(
1235         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1236         scalarize(1))
1237       .clampScalar(BigTyIdx, S32, S1024);
1238 
1239     if (Op == G_MERGE_VALUES) {
1240       Builder.widenScalarIf(
1241         // TODO: Use 16-bit shifts if legal for 8-bit values?
1242         [=](const LegalityQuery &Query) {
1243           const LLT Ty = Query.Types[LitTyIdx];
1244           return Ty.getSizeInBits() < 32;
1245         },
1246         changeTo(LitTyIdx, S32));
1247     }
1248 
1249     Builder.widenScalarIf(
1250       [=](const LegalityQuery &Query) {
1251         const LLT Ty = Query.Types[BigTyIdx];
1252         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1253           Ty.getSizeInBits() % 16 != 0;
1254       },
1255       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
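        // e.g. a 300-bit type is widened to 320 bits rather than 512.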
1258         const LLT &Ty = Query.Types[BigTyIdx];
1259         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1260         if (NewSizeInBits >= 256) {
1261           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1262           if (RoundedTo < NewSizeInBits)
1263             NewSizeInBits = RoundedTo;
1264         }
1265         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1266       })
1267       .legalIf([=](const LegalityQuery &Query) {
1268           const LLT &BigTy = Query.Types[BigTyIdx];
1269           const LLT &LitTy = Query.Types[LitTyIdx];
1270 
1271           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1272             return false;
1273           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1274             return false;
1275 
1276           return BigTy.getSizeInBits() % 16 == 0 &&
1277                  LitTy.getSizeInBits() % 16 == 0 &&
1278                  BigTy.getSizeInBits() <= 1024;
1279         })
1280       // Any vectors left are the wrong size. Scalarize them.
1281       .scalarize(0)
1282       .scalarize(1);
1283   }
1284 
1285   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1286   // RegBankSelect.
1287   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1288     .legalFor({{S32}, {S64}});
1289 
1290   if (ST.hasVOP3PInsts()) {
1291     SextInReg.lowerFor({{V2S16}})
1292       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1293       // get more vector shift opportunities, since we'll get those when
1294       // expanded.
1295       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1296   } else if (ST.has16BitInsts()) {
1297     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1298   } else {
1299     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1301     SextInReg.lowerFor({{S32}, {S64}});
1302   }
1303 
1304   SextInReg
1305     .scalarize(0)
1306     .clampScalar(0, S32, S64)
1307     .lower();
1308 
1309   getActionDefinitionsBuilder(G_FSHR)
1310     .legalFor({{S32, S32}})
1311     .scalarize(0)
1312     .lower();
1313 
1314   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1315     .legalFor({S64});
1316 
1317   getActionDefinitionsBuilder({
1318       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1319       G_FCOPYSIGN,
1320 
1321       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1322       G_READ_REGISTER,
1323       G_WRITE_REGISTER,
1324 
1325       G_SADDO, G_SSUBO,
1326 
1327        // TODO: Implement
1328       G_FMINIMUM, G_FMAXIMUM,
1329       G_FSHL
1330     }).lower();
1331 
1332   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1333         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1334         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1335     .unsupported();
1336 
1337   computeTables();
1338   verify(*ST.getInstrInfo());
1339 }
1340 
1341 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1342                                          MachineRegisterInfo &MRI,
1343                                          MachineIRBuilder &B,
1344                                          GISelChangeObserver &Observer) const {
1345   switch (MI.getOpcode()) {
1346   case TargetOpcode::G_ADDRSPACE_CAST:
1347     return legalizeAddrSpaceCast(MI, MRI, B);
1348   case TargetOpcode::G_FRINT:
1349     return legalizeFrint(MI, MRI, B);
1350   case TargetOpcode::G_FCEIL:
1351     return legalizeFceil(MI, MRI, B);
1352   case TargetOpcode::G_INTRINSIC_TRUNC:
1353     return legalizeIntrinsicTrunc(MI, MRI, B);
1354   case TargetOpcode::G_SITOFP:
1355     return legalizeITOFP(MI, MRI, B, true);
1356   case TargetOpcode::G_UITOFP:
1357     return legalizeITOFP(MI, MRI, B, false);
1358   case TargetOpcode::G_FPTOSI:
1359     return legalizeFPTOI(MI, MRI, B, true);
1360   case TargetOpcode::G_FPTOUI:
1361     return legalizeFPTOI(MI, MRI, B, false);
1362   case TargetOpcode::G_FMINNUM:
1363   case TargetOpcode::G_FMAXNUM:
1364   case TargetOpcode::G_FMINNUM_IEEE:
1365   case TargetOpcode::G_FMAXNUM_IEEE:
1366     return legalizeMinNumMaxNum(MI, MRI, B);
1367   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1368     return legalizeExtractVectorElt(MI, MRI, B);
1369   case TargetOpcode::G_INSERT_VECTOR_ELT:
1370     return legalizeInsertVectorElt(MI, MRI, B);
1371   case TargetOpcode::G_SHUFFLE_VECTOR:
1372     return legalizeShuffleVector(MI, MRI, B);
1373   case TargetOpcode::G_FSIN:
1374   case TargetOpcode::G_FCOS:
1375     return legalizeSinCos(MI, MRI, B);
1376   case TargetOpcode::G_GLOBAL_VALUE:
1377     return legalizeGlobalValue(MI, MRI, B);
1378   case TargetOpcode::G_LOAD:
1379     return legalizeLoad(MI, MRI, B, Observer);
1380   case TargetOpcode::G_FMAD:
1381     return legalizeFMad(MI, MRI, B);
1382   case TargetOpcode::G_FDIV:
1383     return legalizeFDIV(MI, MRI, B);
1384   case TargetOpcode::G_UDIV:
1385   case TargetOpcode::G_UREM:
1386     return legalizeUDIV_UREM(MI, MRI, B);
1387   case TargetOpcode::G_SDIV:
1388   case TargetOpcode::G_SREM:
1389     return legalizeSDIV_SREM(MI, MRI, B);
1390   case TargetOpcode::G_ATOMIC_CMPXCHG:
1391     return legalizeAtomicCmpXChg(MI, MRI, B);
1392   case TargetOpcode::G_FLOG:
1393     return legalizeFlog(MI, B, numbers::ln2f);
1394   case TargetOpcode::G_FLOG10:
1395     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1396   case TargetOpcode::G_FEXP:
1397     return legalizeFExp(MI, B);
1398   case TargetOpcode::G_FPOW:
1399     return legalizeFPow(MI, B);
1400   case TargetOpcode::G_FFLOOR:
1401     return legalizeFFloor(MI, MRI, B);
1402   case TargetOpcode::G_BUILD_VECTOR:
1403     return legalizeBuildVector(MI, MRI, B);
1404   default:
1405     return false;
1406   }
1407 
1408   llvm_unreachable("expected switch to return");
1409 }
1410 
1411 Register AMDGPULegalizerInfo::getSegmentAperture(
1412   unsigned AS,
1413   MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B) const {
1415   MachineFunction &MF = B.getMF();
1416   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1417   const LLT S32 = LLT::scalar(32);
1418 
1419   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1420 
1421   if (ST.hasApertureRegs()) {
1422     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1423     // getreg.
1424     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1425         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1426         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1427     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1428         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1429         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
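    // Sketch of what happens here: the aperture base lives in the
    // SH_MEM_BASES hardware register as a 16-bit field (one for shared, one
    // for private). The encoding below is effectively
    // hwreg(HW_REG_SH_MEM_BASES, Offset, WidthM1 + 1), and the extracted
    // field is later shifted left by the field width to reconstruct the high
    // 32 bits of the 64-bit aperture address.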
1430     unsigned Encoding =
1431         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1432         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1433         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1434 
1435     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1436 
1437     B.buildInstr(AMDGPU::S_GETREG_B32)
1438       .addDef(GetReg)
1439       .addImm(Encoding);
1440     MRI.setType(GetReg, S32);
1441 
1442     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1443     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1444   }
1445 
1446   Register QueuePtr = MRI.createGenericVirtualRegister(
1447     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1448 
1449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1450   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1451     return Register();
1452 
1453   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1454   // private_segment_aperture_base_hi.
1455   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1456 
1457   // TODO: can we be smarter about machine pointer info?
1458   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1459   MachineMemOperand *MMO = MF.getMachineMemOperand(
1460       PtrInfo,
1461       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1462           MachineMemOperand::MOInvariant,
1463       4, commonAlignment(Align(64), StructOffset));
1464 
1465   Register LoadAddr;
1466 
1467   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1468   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1469 }
1470 
1471 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1472   MachineInstr &MI, MachineRegisterInfo &MRI,
1473   MachineIRBuilder &B) const {
1474   MachineFunction &MF = B.getMF();
1475 
1476   B.setInstr(MI);
1477 
1478   const LLT S32 = LLT::scalar(32);
1479   Register Dst = MI.getOperand(0).getReg();
1480   Register Src = MI.getOperand(1).getReg();
1481 
1482   LLT DstTy = MRI.getType(Dst);
1483   LLT SrcTy = MRI.getType(Src);
1484   unsigned DestAS = DstTy.getAddressSpace();
1485   unsigned SrcAS = SrcTy.getAddressSpace();
1486 
1487   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1488   // vector element.
1489   assert(!DstTy.isVector());
1490 
1491   const AMDGPUTargetMachine &TM
1492     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1493 
1494   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1495   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1496     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1497     return true;
1498   }
1499 
1500   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1501     // Truncate.
1502     B.buildExtract(Dst, Src, 0);
1503     MI.eraseFromParent();
1504     return true;
1505   }
1506 
1507   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1508     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1509     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1510 
1511     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1512     // another. Merge operands are required to be the same type, but creating an
1513     // extra ptrtoint would be kind of pointless.
1514     auto HighAddr = B.buildConstant(
1515       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1516     B.buildMerge(Dst, {Src, HighAddr});
1517     MI.eraseFromParent();
1518     return true;
1519   }
1520 
1521   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1522     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1523            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1524     unsigned NullVal = TM.getNullPointerValue(DestAS);
1525 
1526     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1527     auto FlatNull = B.buildConstant(SrcTy, 0);
1528 
1529     // Extract low 32-bits of the pointer.
1530     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1531 
1532     auto CmpRes =
1533         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1534     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1535 
1536     MI.eraseFromParent();
1537     return true;
1538   }
1539 
1540   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1541     return false;
1542 
1543   if (!ST.hasFlatAddressSpace())
1544     return false;
1545 
1546   auto SegmentNull =
1547       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1548   auto FlatNull =
1549       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1550 
1551   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1552   if (!ApertureReg.isValid())
1553     return false;
1554 
1555   auto CmpRes =
1556       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1557 
1558   // Coerce the type of the low half of the result so we can use merge_values.
1559   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1560 
1561   // TODO: Should we allow mismatched types but matching sizes in merges to
1562   // avoid the ptrtoint?
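  //
  // The flat pointer is built as {aperture base : segment offset}: the
  // aperture supplies the high 32 bits and the original segment pointer
  // supplies the low 32 bits. The select below then maps the segment null
  // pointer to the flat null pointer.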
1563   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1564   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1565 
1566   MI.eraseFromParent();
1567   return true;
1568 }
1569 
1570 bool AMDGPULegalizerInfo::legalizeFrint(
1571   MachineInstr &MI, MachineRegisterInfo &MRI,
1572   MachineIRBuilder &B) const {
1573   B.setInstr(MI);
1574 
1575   Register Src = MI.getOperand(1).getReg();
1576   LLT Ty = MRI.getType(Src);
1577   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
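
  // This is the classic round-to-nearest trick: adding 2^52 (copysigned to
  // the input) leaves no fraction bits in a double, so (x + C1) - C1 rounds x
  // to an integer. Inputs with |x| > C2 (just below 2^52) are already
  // integers and are passed through unchanged by the final select.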
1578 
1579   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1580   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1581 
1582   auto C1 = B.buildFConstant(Ty, C1Val);
1583   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1584 
1585   // TODO: Should this propagate fast-math-flags?
1586   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1587   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1588 
1589   auto C2 = B.buildFConstant(Ty, C2Val);
1590   auto Fabs = B.buildFAbs(Ty, Src);
1591 
1592   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1595 }
1596 
1597 bool AMDGPULegalizerInfo::legalizeFceil(
1598   MachineInstr &MI, MachineRegisterInfo &MRI,
1599   MachineIRBuilder &B) const {
1600   B.setInstr(MI);
1601 
1602   const LLT S1 = LLT::scalar(1);
1603   const LLT S64 = LLT::scalar(64);
1604 
1605   Register Src = MI.getOperand(1).getReg();
1606   assert(MRI.getType(Src) == S64);
1607 
1608   // result = trunc(src)
1609   // if (src > 0.0 && src != result)
1610   //   result += 1.0
1611 
1612   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1613 
1614   const auto Zero = B.buildFConstant(S64, 0.0);
1615   const auto One = B.buildFConstant(S64, 1.0);
1616   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1617   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1618   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1619   auto Add = B.buildSelect(S64, And, One, Zero);
1620 
1621   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1624 }
1625 
1626 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1627                                               MachineIRBuilder &B) {
1628   const unsigned FractBits = 52;
1629   const unsigned ExpBits = 11;
1630   LLT S32 = LLT::scalar(32);
1631 
1632   auto Const0 = B.buildConstant(S32, FractBits - 32);
1633   auto Const1 = B.buildConstant(S32, ExpBits);
1634 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1638 
1639   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1640 }
1641 
1642 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1643   MachineInstr &MI, MachineRegisterInfo &MRI,
1644   MachineIRBuilder &B) const {
1645   B.setInstr(MI);
1646 
1647   const LLT S1 = LLT::scalar(1);
1648   const LLT S32 = LLT::scalar(32);
1649   const LLT S64 = LLT::scalar(64);
1650 
1651   Register Src = MI.getOperand(1).getReg();
1652   assert(MRI.getType(Src) == S64);
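
  // The plan: compute the unbiased exponent E of the input. If E < 0 the
  // magnitude is below 1.0 and the result is just the sign bit (+/-0.0). If
  // E > 51 the value is already an integer and is returned unchanged.
  // Otherwise clear the low (52 - E) fraction bits with a mask.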
1653 
1654   // TODO: Should this use extract since the low half is unused?
1655   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1656   Register Hi = Unmerge.getReg(1);
1657 
1658   // Extract the upper half, since this is where we will find the sign and
1659   // exponent.
1660   auto Exp = extractF64Exponent(Hi, B);
1661 
1662   const unsigned FractBits = 52;
1663 
1664   // Extract the sign bit.
1665   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1666   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1667 
1668   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1669 
1670   const auto Zero32 = B.buildConstant(S32, 0);
1671 
1672   // Extend back to 64-bits.
1673   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1674 
1675   auto Shr = B.buildAShr(S64, FractMask, Exp);
1676   auto Not = B.buildNot(S64, Shr);
1677   auto Tmp0 = B.buildAnd(S64, Src, Not);
1678   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1679 
1680   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1681   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1682 
1683   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1686 }
1687 
1688 bool AMDGPULegalizerInfo::legalizeITOFP(
1689   MachineInstr &MI, MachineRegisterInfo &MRI,
1690   MachineIRBuilder &B, bool Signed) const {
1691   B.setInstr(MI);
1692 
1693   Register Dst = MI.getOperand(0).getReg();
1694   Register Src = MI.getOperand(1).getReg();
1695 
1696   const LLT S64 = LLT::scalar(64);
1697   const LLT S32 = LLT::scalar(32);
1698 
1699   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
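
  // Convert the two 32-bit halves separately: the high half is converted
  // (signed or unsigned as appropriate) and scaled by 2^32 with ldexp, then
  // the always-unsigned low half is added in.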
1700 
1701   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1702 
1703   auto CvtHi = Signed ?
1704     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1705     B.buildUITOFP(S64, Unmerge.getReg(1));
1706 
1707   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1708 
1709   auto ThirtyTwo = B.buildConstant(S32, 32);
1710   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1711     .addUse(CvtHi.getReg(0))
1712     .addUse(ThirtyTwo.getReg(0));
1713 
1714   // TODO: Should this propagate fast-math-flags?
1715   B.buildFAdd(Dst, LdExp, CvtLo);
1716   MI.eraseFromParent();
1717   return true;
1718 }
1719 
1720 // TODO: Copied from DAG implementation. Verify logic and document how this
1721 // actually works.
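//
// Roughly: with T = trunc(x), the constants below are 2^-32 and -2^32, so
// FloorMul = floor(T * 2^-32) recovers the high 32 bits of the result and
// fma(FloorMul, -2^32, T) leaves the low 32 bits; the two halves are then
// converted to integers and merged.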
1722 bool AMDGPULegalizerInfo::legalizeFPTOI(
1723   MachineInstr &MI, MachineRegisterInfo &MRI,
1724   MachineIRBuilder &B, bool Signed) const {
1725   B.setInstr(MI);
1726 
1727   Register Dst = MI.getOperand(0).getReg();
1728   Register Src = MI.getOperand(1).getReg();
1729 
1730   const LLT S64 = LLT::scalar(64);
1731   const LLT S32 = LLT::scalar(32);
1732 
1733   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1734 
1735   unsigned Flags = MI.getFlags();
1736 
1737   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1738   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1739   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1740 
1741   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1742   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1743   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1744 
1745   auto Hi = Signed ?
1746     B.buildFPTOSI(S32, FloorMul) :
1747     B.buildFPTOUI(S32, FloorMul);
1748   auto Lo = B.buildFPTOUI(S32, Fma);
1749 
1750   B.buildMerge(Dst, { Lo, Hi });
1751   MI.eraseFromParent();
1752 
1753   return true;
1754 }
1755 
1756 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1757   MachineInstr &MI, MachineRegisterInfo &MRI,
1758   MachineIRBuilder &B) const {
1759   MachineFunction &MF = B.getMF();
1760   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1761 
1762   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1763                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1764 
1765   // With ieee_mode disabled, the instructions have the correct behavior
1766   // already for G_FMINNUM/G_FMAXNUM
1767   if (!MFI->getMode().IEEE)
1768     return !IsIEEEOp;
1769 
1770   if (IsIEEEOp)
1771     return true;
1772 
1773   MachineIRBuilder HelperBuilder(MI);
1774   GISelObserverWrapper DummyObserver;
1775   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1776   HelperBuilder.setInstr(MI);
1777   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1778 }
1779 
1780 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1781   MachineInstr &MI, MachineRegisterInfo &MRI,
1782   MachineIRBuilder &B) const {
1783   // TODO: Should move some of this into LegalizerHelper.
1784 
1785   // TODO: Promote dynamic indexing of s16 to s32
1786 
1787   // FIXME: Artifact combiner probably should have replaced the truncated
1788   // constant before this, so we shouldn't need
1789   // getConstantVRegValWithLookThrough.
1790   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1791     MI.getOperand(2).getReg(), MRI);
1792   if (!IdxVal) // Dynamic case will be selected to register indexing.
1793     return true;
1794 
1795   Register Dst = MI.getOperand(0).getReg();
1796   Register Vec = MI.getOperand(1).getReg();
1797 
1798   LLT VecTy = MRI.getType(Vec);
1799   LLT EltTy = VecTy.getElementType();
1800   assert(EltTy == MRI.getType(Dst));
1801 
1802   B.setInstr(MI);
1803 
1804   if (IdxVal->Value < VecTy.getNumElements())
1805     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1806   else
1807     B.buildUndef(Dst);
1808 
1809   MI.eraseFromParent();
1810   return true;
1811 }
1812 
1813 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1814   MachineInstr &MI, MachineRegisterInfo &MRI,
1815   MachineIRBuilder &B) const {
1816   // TODO: Should move some of this into LegalizerHelper.
1817 
1818   // TODO: Promote dynamic indexing of s16 to s32
1819 
1820   // FIXME: Artifact combiner probably should have replaced the truncated
1821   // constant before this, so we shouldn't need
1822   // getConstantVRegValWithLookThrough.
1823   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1824     MI.getOperand(3).getReg(), MRI);
1825   if (!IdxVal) // Dynamic case will be selected to register indexing.
1826     return true;
1827 
1828   Register Dst = MI.getOperand(0).getReg();
1829   Register Vec = MI.getOperand(1).getReg();
1830   Register Ins = MI.getOperand(2).getReg();
1831 
1832   LLT VecTy = MRI.getType(Vec);
1833   LLT EltTy = VecTy.getElementType();
1834   assert(EltTy == MRI.getType(Ins));
1835 
1836   B.setInstr(MI);
1837 
1838   if (IdxVal->Value < VecTy.getNumElements())
1839     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1840   else
1841     B.buildUndef(Dst);
1842 
1843   MI.eraseFromParent();
1844   return true;
1845 }
1846 
1847 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1848   MachineInstr &MI, MachineRegisterInfo &MRI,
1849   MachineIRBuilder &B) const {
1850   const LLT V2S16 = LLT::vector(2, 16);
1851 
1852   Register Dst = MI.getOperand(0).getReg();
1853   Register Src0 = MI.getOperand(1).getReg();
1854   LLT DstTy = MRI.getType(Dst);
1855   LLT SrcTy = MRI.getType(Src0);
1856 
1857   if (SrcTy == V2S16 && DstTy == V2S16 &&
1858       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1859     return true;
1860 
1861   MachineIRBuilder HelperBuilder(MI);
1862   GISelObserverWrapper DummyObserver;
1863   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1864   HelperBuilder.setInstr(MI);
1865   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1866 }
1867 
1868 bool AMDGPULegalizerInfo::legalizeSinCos(
1869   MachineInstr &MI, MachineRegisterInfo &MRI,
1870   MachineIRBuilder &B) const {
1871   B.setInstr(MI);
1872 
1873   Register DstReg = MI.getOperand(0).getReg();
1874   Register SrcReg = MI.getOperand(1).getReg();
1875   LLT Ty = MRI.getType(DstReg);
1876   unsigned Flags = MI.getFlags();
1877 
1878   Register TrigVal;
1879   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1880   if (ST.hasTrigReducedRange()) {
1881     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1882     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1883       .addUse(MulVal.getReg(0))
1884       .setMIFlags(Flags).getReg(0);
1885   } else
1886     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1887 
1888   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1889     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1890   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1891     .addUse(TrigVal)
1892     .setMIFlags(Flags);
1893   MI.eraseFromParent();
1894   return true;
1895 }
1896 
1897 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1898   Register DstReg, LLT PtrTy,
1899   MachineIRBuilder &B, const GlobalValue *GV,
1900   unsigned Offset, unsigned GAFlags) const {
1901   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1902   // to the following code sequence:
1903   //
1904   // For constant address space:
1905   //   s_getpc_b64 s[0:1]
1906   //   s_add_u32 s0, s0, $symbol
1907   //   s_addc_u32 s1, s1, 0
1908   //
1909   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1910   //   a fixup or relocation is emitted to replace $symbol with a literal
1911   //   constant, which is a pc-relative offset from the encoding of the $symbol
1912   //   operand to the global variable.
1913   //
1914   // For global address space:
1915   //   s_getpc_b64 s[0:1]
1916   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1917   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1918   //
1919   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1920   //   fixups or relocations are emitted to replace $symbol@*@lo and
1921   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1922   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1923   //   operand to the global variable.
1924   //
1925   // What we want here is an offset from the value returned by s_getpc
1926   // (which is the address of the s_add_u32 instruction) to the global
1927   // variable, but since the encoding of $symbol starts 4 bytes after the start
1928   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1929   // small. This requires us to add 4 to the global variable offset in order to
1930   // compute the correct address.
1931 
1932   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1933 
1934   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1935     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1936 
1937   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1938     .addDef(PCReg);
1939 
1940   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1941   if (GAFlags == SIInstrInfo::MO_NONE)
1942     MIB.addImm(0);
1943   else
1944     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1945 
1946   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1947 
1948   if (PtrTy.getSizeInBits() == 32)
1949     B.buildExtract(DstReg, PCReg, 0);
1950   return true;
1951  }
1952 
1953 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1954   MachineInstr &MI, MachineRegisterInfo &MRI,
1955   MachineIRBuilder &B) const {
1956   Register DstReg = MI.getOperand(0).getReg();
1957   LLT Ty = MRI.getType(DstReg);
1958   unsigned AS = Ty.getAddressSpace();
1959 
1960   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1961   MachineFunction &MF = B.getMF();
1962   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1963   B.setInstr(MI);
1964 
1965   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1966     if (!MFI->isEntryFunction()) {
1967       const Function &Fn = MF.getFunction();
1968       DiagnosticInfoUnsupported BadLDSDecl(
1969         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1970         DS_Warning);
1971       Fn.getContext().diagnose(BadLDSDecl);
1972 
1973       // We currently don't have a way to correctly allocate LDS objects that
1974       // aren't directly associated with a kernel. We do force inlining of
1975       // functions that use local objects. However, if these dead functions are
1976       // not eliminated, we don't want a compile time error. Just emit a warning
1977       // and a trap, since there should be no callable path here.
1978       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1979       B.buildUndef(DstReg);
1980       MI.eraseFromParent();
1981       return true;
1982     }
1983 
1984     // TODO: We could emit code to handle the initialization somewhere.
1985     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1986       const SITargetLowering *TLI = ST.getTargetLowering();
1987       if (!TLI->shouldUseLDSConstAddress(GV)) {
1988         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1990       }
1991 
1992       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1993       MI.eraseFromParent();
1994       return true;
1995     }
1996 
1997     const Function &Fn = MF.getFunction();
1998     DiagnosticInfoUnsupported BadInit(
1999       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2000     Fn.getContext().diagnose(BadInit);
2001     return true;
2002   }
2003 
2004   const SITargetLowering *TLI = ST.getTargetLowering();
2005 
2006   if (TLI->shouldEmitFixup(GV)) {
2007     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2008     MI.eraseFromParent();
2009     return true;
2010   }
2011 
2012   if (TLI->shouldEmitPCReloc(GV)) {
2013     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2014     MI.eraseFromParent();
2015     return true;
2016   }
2017 
2018   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2019   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2020 
2021   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2022       MachinePointerInfo::getGOT(MF),
2023       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2024           MachineMemOperand::MOInvariant,
2025       8 /*Size*/, Align(8));
2026 
2027   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2028 
2029   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2031     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2032     B.buildExtract(DstReg, Load, 0);
2033   } else
2034     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2035 
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeLoad(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2043   B.setInstr(MI);
2044   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2045   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2046   Observer.changingInstr(MI);
2047   MI.getOperand(1).setReg(Cast.getReg(0));
2048   Observer.changedInstr(MI);
2049   return true;
2050 }
2051 
2052 bool AMDGPULegalizerInfo::legalizeFMad(
2053   MachineInstr &MI, MachineRegisterInfo &MRI,
2054   MachineIRBuilder &B) const {
2055   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2056   assert(Ty.isScalar());
2057 
2058   MachineFunction &MF = B.getMF();
2059   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2060 
2061   // TODO: Always legal with future ftz flag.
2062   // FIXME: Do we need just output?
2063   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2064     return true;
2065   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2066     return true;
2067 
2068   MachineIRBuilder HelperBuilder(MI);
2069   GISelObserverWrapper DummyObserver;
2070   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2071   HelperBuilder.setInstr(MI);
2072   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2076   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2077   Register DstReg = MI.getOperand(0).getReg();
2078   Register PtrReg = MI.getOperand(1).getReg();
2079   Register CmpVal = MI.getOperand(2).getReg();
2080   Register NewVal = MI.getOperand(3).getReg();
2081 
2082   assert(SITargetLowering::isFlatGlobalAddrSpace(
2083            MRI.getType(PtrReg).getAddressSpace()) &&
2084          "this should not have been custom lowered");
2085 
2086   LLT ValTy = MRI.getType(CmpVal);
2087   LLT VecTy = LLT::vector(2, ValTy);
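
  // The target cmpswap instructions take the new value and the compare value
  // as one contiguous register pair, so pack them into a two-element vector
  // operand (new value in element 0) for the target-specific opcode below.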
2088 
2089   B.setInstr(MI);
2090   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2091 
2092   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2093     .addDef(DstReg)
2094     .addUse(PtrReg)
2095     .addUse(PackedVal)
2096     .setMemRefs(MI.memoperands());
2097 
2098   MI.eraseFromParent();
2099   return true;
2100 }
2101 
2102 bool AMDGPULegalizerInfo::legalizeFlog(
2103   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2104   Register Dst = MI.getOperand(0).getReg();
2105   Register Src = MI.getOperand(1).getReg();
2106   LLT Ty = B.getMRI()->getType(Dst);
2107   unsigned Flags = MI.getFlags();
2108   B.setInstr(MI);
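
  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted.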
2109 
2110   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2111   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2112 
2113   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2114   MI.eraseFromParent();
2115   return true;
2116 }
2117 
2118 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2119                                        MachineIRBuilder &B) const {
2120   Register Dst = MI.getOperand(0).getReg();
2121   Register Src = MI.getOperand(1).getReg();
2122   unsigned Flags = MI.getFlags();
2123   LLT Ty = B.getMRI()->getType(Dst);
2124   B.setInstr(MI);
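
  // exp(x) = 2^(x * log2(e)), so scale the input and lower onto G_FEXP2.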
2125 
2126   auto K = B.buildFConstant(Ty, numbers::log2e);
2127   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2128   B.buildFExp2(Dst, Mul, Flags);
2129   MI.eraseFromParent();
2130   return true;
2131 }
2132 
2133 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2134                                        MachineIRBuilder &B) const {
2135   Register Dst = MI.getOperand(0).getReg();
2136   Register Src0 = MI.getOperand(1).getReg();
2137   Register Src1 = MI.getOperand(2).getReg();
2138   unsigned Flags = MI.getFlags();
2139   LLT Ty = B.getMRI()->getType(Dst);
2140   B.setInstr(MI);
2141   const LLT S16 = LLT::scalar(16);
2142   const LLT S32 = LLT::scalar(32);
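
  // pow(x, y) = 2^(y * log2(x)). The multiply uses fmul_legacy (0 * anything
  // is 0) so that cases like pow(1.0, inf), where log2(x) is 0 and y is not
  // finite, still produce a finite result.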
2143 
2144   if (Ty == S32) {
2145     auto Log = B.buildFLog2(S32, Src0, Flags);
2146     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2147       .addUse(Log.getReg(0))
2148       .addUse(Src1)
2149       .setMIFlags(Flags);
2150     B.buildFExp2(Dst, Mul, Flags);
2151   } else if (Ty == S16) {
2152     // There's no f16 fmul_legacy, so we need to convert for it.
2153     auto Log = B.buildFLog2(S16, Src0, Flags);
2154     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2155     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2156     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2157       .addUse(Ext0.getReg(0))
2158       .addUse(Ext1.getReg(0))
2159       .setMIFlags(Flags);
2160 
2161     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2162   } else
2163     return false;
2164 
2165   MI.eraseFromParent();
2166   return true;
2167 }
2168 
2169 // Find a source register, ignoring any possible source modifiers.
2170 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2171   Register ModSrc = OrigSrc;
2172   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2173     ModSrc = SrcFNeg->getOperand(1).getReg();
2174     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2175       ModSrc = SrcFAbs->getOperand(1).getReg();
2176   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2177     ModSrc = SrcFAbs->getOperand(1).getReg();
2178   return ModSrc;
2179 }
2180 
2181 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2182                                          MachineRegisterInfo &MRI,
2183                                          MachineIRBuilder &B) const {
2184   B.setInstr(MI);
2185 
2186   const LLT S1 = LLT::scalar(1);
2187   const LLT S64 = LLT::scalar(64);
2188   Register Dst = MI.getOperand(0).getReg();
2189   Register OrigSrc = MI.getOperand(1).getReg();
2190   unsigned Flags = MI.getFlags();
2191   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2192          "this should not have been custom lowered");
2193 
2194   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2195   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2196   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2197   // V_FRACT bug is:
2198   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2199   //
2200   // Convert floor(x) to (x - fract(x))
2201 
2202   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2203     .addUse(OrigSrc)
2204     .setMIFlags(Flags);
2205 
2206   // Give source modifier matching some assistance before obscuring a foldable
2207   // pattern.
2208 
2209   // TODO: We can avoid the neg on the fract? The input sign to fract
2210   // shouldn't matter?
2211   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2212 
2213   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2214 
2215   Register Min = MRI.createGenericVirtualRegister(S64);
2216 
2217   // We don't need to concern ourselves with the snan handling difference, so
2218   // use the one which will directly select.
2219   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2220   if (MFI->getMode().IEEE)
2221     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2222   else
2223     B.buildFMinNum(Min, Fract, Const, Flags);
2224 
2225   Register CorrectedFract = Min;
2226   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2228     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2229   }
2230 
2231   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2232   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2233 
2234   MI.eraseFromParent();
2235   return true;
2236 }
2237 
2238 // Turn an illegal packed v2s16 build vector into bit operations.
2239 // TODO: This should probably be a bitcast action in LegalizerHelper.
2240 bool AMDGPULegalizerInfo::legalizeBuildVector(
2241   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2242   Register Dst = MI.getOperand(0).getReg();
2243   const LLT S32 = LLT::scalar(32);
2244   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2245 
2246   Register Src0 = MI.getOperand(1).getReg();
2247   Register Src1 = MI.getOperand(2).getReg();
2248   assert(MRI.getType(Src0) == LLT::scalar(16));
2249 
2250   B.setInstr(MI);
2251   auto Merge = B.buildMerge(S32, {Src0, Src1});
2252   B.buildBitcast(Dst, Merge);
2253 
2254   MI.eraseFromParent();
2255   return true;
2256 }
2257 
2258 // Return the use branch instruction, otherwise null if the usage is invalid.
2259 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2260                                        MachineRegisterInfo &MRI,
2261                                        MachineInstr *&Br,
2262                                        MachineBasicBlock *&UncondBrTarget) {
2263   Register CondDef = MI.getOperand(0).getReg();
2264   if (!MRI.hasOneNonDBGUse(CondDef))
2265     return nullptr;
2266 
2267   MachineBasicBlock *Parent = MI.getParent();
2268   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2269   if (UseMI.getParent() != Parent ||
2270       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2271     return nullptr;
2272 
2273   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2274   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2275   if (Next == Parent->end()) {
2276     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2277     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2278       return nullptr;
2279     UncondBrTarget = &*NextMBB;
2280   } else {
2281     if (Next->getOpcode() != AMDGPU::G_BR)
2282       return nullptr;
2283     Br = &*Next;
2284     UncondBrTarget = Br->getOperand(0).getMBB();
2285   }
2286 
2287   return &UseMI;
2288 }
2289 
2290 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2291                                                MachineRegisterInfo &MRI,
2292                                                Register LiveIn,
2293                                                Register PhyReg) const {
2294   assert(PhyReg.isPhysical() && "Physical register expected");
2295 
2296   // Insert the live-in copy, if required, by defining destination virtual
2297   // register.
2298   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2299   if (!MRI.getVRegDef(LiveIn)) {
2300     // FIXME: Should have scoped insert pt
2301     MachineBasicBlock &OrigInsBB = B.getMBB();
2302     auto OrigInsPt = B.getInsertPt();
2303 
2304     MachineBasicBlock &EntryMBB = B.getMF().front();
2305     EntryMBB.addLiveIn(PhyReg);
2306     B.setInsertPt(EntryMBB, EntryMBB.begin());
2307     B.buildCopy(LiveIn, PhyReg);
2308 
2309     B.setInsertPt(OrigInsBB, OrigInsPt);
2310   }
2311 
2312   return LiveIn;
2313 }
2314 
2315 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2316                                                 MachineRegisterInfo &MRI,
2317                                                 Register PhyReg, LLT Ty,
2318                                                 bool InsertLiveInCopy) const {
2319   assert(PhyReg.isPhysical() && "Physical register expected");
2320 
  // Get or create the virtual live-in register.
2322   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2323   if (!LiveIn) {
2324     LiveIn = MRI.createGenericVirtualRegister(Ty);
2325     MRI.addLiveIn(PhyReg, LiveIn);
2326   }
2327 
  // When the copy that is actually required goes from a virtual register to a
  // physical register (to be inserted later), there is no need to insert a
  // live-in copy from the physical register to the virtual register here.
2331   if (!InsertLiveInCopy)
2332     return LiveIn;
2333 
2334   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2335 }
2336 
2337 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2338     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2339   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2340   const ArgDescriptor *Arg;
2341   const TargetRegisterClass *RC;
2342   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2343   if (!Arg) {
2344     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2345     return nullptr;
2346   }
2347   return Arg;
2348 }
2349 
2350 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2351                                          const ArgDescriptor *Arg) const {
2352   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2353     return false; // TODO: Handle these
2354 
2355   Register SrcReg = Arg->getRegister();
2356   assert(SrcReg.isPhysical() && "Physical register expected");
2357   assert(DstReg.isVirtual() && "Virtual register expected");
2358 
2359   MachineRegisterInfo &MRI = *B.getMRI();
2360 
2361   LLT Ty = MRI.getType(DstReg);
2362   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2363 
2364   if (Arg->isMasked()) {
2365     // TODO: Should we try to emit this once in the entry block?
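    // Arg->getMask() describes which bits of the physical register carry this
    // value (several values can be packed into one register); shift the field
    // down to bit 0 and then mask off everything else.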
2366     const LLT S32 = LLT::scalar(32);
2367     const unsigned Mask = Arg->getMask();
2368     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2369 
2370     Register AndMaskSrc = LiveIn;
2371 
2372     if (Shift != 0) {
2373       auto ShiftAmt = B.buildConstant(S32, Shift);
2374       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2375     }
2376 
2377     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2378   } else {
2379     B.buildCopy(DstReg, LiveIn);
2380   }
2381 
2382   return true;
2383 }
2384 
2385 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2386     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2387     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2388   B.setInstr(MI);
2389 
2390   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2391   if (!Arg)
2392     return false;
2393 
2394   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2395     return false;
2396 
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
2401 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2402                                        MachineRegisterInfo &MRI,
2403                                        MachineIRBuilder &B) const {
2404   B.setInstr(MI);
2405   Register Dst = MI.getOperand(0).getReg();
2406   LLT DstTy = MRI.getType(Dst);
2407   LLT S16 = LLT::scalar(16);
2408   LLT S32 = LLT::scalar(32);
2409   LLT S64 = LLT::scalar(64);
2410 
2411   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2412     return true;
2413 
2414   if (DstTy == S16)
2415     return legalizeFDIV16(MI, MRI, B);
2416   if (DstTy == S32)
2417     return legalizeFDIV32(MI, MRI, B);
2418   if (DstTy == S64)
2419     return legalizeFDIV64(MI, MRI, B);
2420 
2421   return false;
2422 }
2423 
2424 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2425   const LLT S32 = LLT::scalar(32);
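
  // Approximate 2^32 / Src: take the f32 reciprocal of Src, scale it by
  // 0x4f800000 (2^32 as an f32), and convert back to an integer.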
2426 
2427   auto Cvt0 = B.buildUITOFP(S32, Src);
2428   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2429   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2430   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2431   return B.buildFPTOUI(S32, Mul).getReg(0);
2432 }
2433 
2434 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2435                                                   Register DstReg,
2436                                                   Register Num,
2437                                                   Register Den,
2438                                                   bool IsRem) const {
2439   const LLT S1 = LLT::scalar(1);
2440   const LLT S32 = LLT::scalar(32);
2441 
2442   // RCP =  URECIP(Den) = 2^32 / Den + e
2443   // e is rounding error.
2444   auto RCP = buildDivRCP(B, Den);
2445 
2446   // RCP_LO = mul(RCP, Den)
2447   auto RCP_LO = B.buildMul(S32, RCP, Den);
2448 
  // RCP_HI = mulhu(RCP, Den)
2450   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2451 
2452   // NEG_RCP_LO = -RCP_LO
2453   auto Zero = B.buildConstant(S32, 0);
2454   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2455 
2456   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2457   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2458   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2459 
2460   // Calculate the rounding error from the URECIP instruction
2461   // E = mulhu(ABS_RCP_LO, RCP)
2462   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2463 
2464   // RCP_A_E = RCP + E
2465   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2466 
2467   // RCP_S_E = RCP - E
2468   auto RCP_S_E = B.buildSub(S32, RCP, E);
2469 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2471   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2472 
  // Quotient = mulhu(Tmp0, Num)
2474   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2475 
2476   // Num_S_Remainder = Quotient * Den
2477   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2478 
2479   // Remainder = Num - Num_S_Remainder
2480   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2481 
2482   // Remainder_GE_Den = Remainder >= Den
2483   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2484 
2485   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2486   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2487                                        Num, Num_S_Remainder);
2488 
2489   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2490   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2491 
2492   // Calculate Division result:
2493 
2494   // Quotient_A_One = Quotient + 1
2495   auto One = B.buildConstant(S32, 1);
2496   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2497 
2498   // Quotient_S_One = Quotient - 1
2499   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2500 
  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2503 
2504   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2505   if (IsRem) {
2506     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2507 
2508     // Calculate Rem result:
2509     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2510 
2511     // Remainder_A_Den = Remainder + Den
2512     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2513 
2514     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2515     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2516 
2517     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2518     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2519   } else {
2520     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2521   }
2522 }
2523 
2524 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2525                                               MachineRegisterInfo &MRI,
2526                                               MachineIRBuilder &B) const {
2527   B.setInstr(MI);
2528   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2529   Register DstReg = MI.getOperand(0).getReg();
2530   Register Num = MI.getOperand(1).getReg();
2531   Register Den = MI.getOperand(2).getReg();
2532   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2533   MI.eraseFromParent();
2534   return true;
2535 }
2536 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2538 //
2539 // Return lo, hi of result
2540 //
2541 // %cvt.lo = G_UITOFP Val.lo
2542 // %cvt.hi = G_UITOFP Val.hi
2543 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2544 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2545 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2546 // %mul2 = G_FMUL %mul1, 2**(-32)
2547 // %trunc = G_INTRINSIC_TRUNC %mul2
2548 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2549 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2550 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2551                                                        Register Val) {
2552   const LLT S32 = LLT::scalar(32);
2553   auto Unmerge = B.buildUnmerge(S32, Val);
2554 
2555   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2556   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2557 
2558   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2559                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2560 
2561   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2562   auto Mul1 =
2563       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2564 
2565   // 2**(-32)
2566   auto Mul2 =
2567       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2568   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2569 
2570   // -(2**32)
2571   auto Mad2 = B.buildFMAD(S32, Trunc,
2572                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2573 
2574   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2575   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2576 
2577   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2578 }
2579 
2580 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2581                                               MachineRegisterInfo &MRI,
2582                                               MachineIRBuilder &B) const {
2583   B.setInstr(MI);
2584 
2585   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2586   const LLT S32 = LLT::scalar(32);
2587   const LLT S64 = LLT::scalar(64);
2588   const LLT S1 = LLT::scalar(1);
2589   Register Numer = MI.getOperand(1).getReg();
2590   Register Denom = MI.getOperand(2).getReg();
2591   Register RcpLo, RcpHi;
2592 
2593   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2594 
2595   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2596 
2597   auto Zero64 = B.buildConstant(S64, 0);
2598   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2599 
2600   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2601   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2602 
2603   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2604   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2605   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2606 
2607   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2608   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2609   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2610   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2611 
2612   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2613   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2614   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2615   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2616   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2617 
2618   auto Zero32 = B.buildConstant(S32, 0);
2619   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2620   auto Add2_HiC =
2621       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2622   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2623   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2624 
2625   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2626   Register NumerLo = UnmergeNumer.getReg(0);
2627   Register NumerHi = UnmergeNumer.getReg(1);
2628 
2629   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2630   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2631   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2632   Register Mul3_Lo = UnmergeMul3.getReg(0);
2633   Register Mul3_Hi = UnmergeMul3.getReg(1);
2634   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2635   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2636   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2637   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2638 
2639   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2640   Register DenomLo = UnmergeDenom.getReg(0);
2641   Register DenomHi = UnmergeDenom.getReg(1);
2642 
2643   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2644   auto C1 = B.buildSExt(S32, CmpHi);
2645 
2646   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2647   auto C2 = B.buildSExt(S32, CmpLo);
2648 
2649   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2650   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2651 
2652   // TODO: Here and below portions of the code can be enclosed into if/endif.
2653   // Currently control flow is unconditional and we have 4 selects after
2654   // potential endif to substitute PHIs.
2655 
2656   // if C3 != 0 ...
2657   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2658   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2659   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2660   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2661 
2662   auto One64 = B.buildConstant(S64, 1);
2663   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2664 
2665   auto C4 =
2666       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2667   auto C5 =
2668       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2669   auto C6 = B.buildSelect(
2670       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2671 
2672   // if (C6 != 0)
2673   auto Add4 = B.buildAdd(S64, Add3, One64);
2674   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2675 
2676   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2677   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2678   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2679 
2680   // endif C6
2681   // endif C3
2682 
2683   if (IsDiv) {
2684     auto Sel1 = B.buildSelect(
2685         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2686     B.buildSelect(MI.getOperand(0),
2687                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2688   } else {
2689     auto Sel2 = B.buildSelect(
2690         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2691     B.buildSelect(MI.getOperand(0),
2692                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2693   }
2694 
2695   MI.eraseFromParent();
2696   return true;
2697 }
2698 
2699 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2700                                             MachineRegisterInfo &MRI,
2701                                             MachineIRBuilder &B) const {
2702   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2703   if (Ty == LLT::scalar(32))
2704     return legalizeUDIV_UREM32(MI, MRI, B);
2705   if (Ty == LLT::scalar(64))
2706     return legalizeUDIV_UREM64(MI, MRI, B);
2707   return false;
2708 }
2709 
2710 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2711                                               MachineRegisterInfo &MRI,
2712                                               MachineIRBuilder &B) const {
2713   B.setInstr(MI);
2714   const LLT S32 = LLT::scalar(32);
2715 
2716   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2717   Register DstReg = MI.getOperand(0).getReg();
2718   Register LHS = MI.getOperand(1).getReg();
2719   Register RHS = MI.getOperand(2).getReg();
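
  // Signed division is done on absolute values: with sign = x >> 31 (all ones
  // for negative x), abs(x) = (x + sign) ^ sign. After the unsigned
  // divide/remainder, the quotient takes sign(LHS) ^ sign(RHS) and the
  // remainder takes sign(LHS), applied as (r ^ sign) - sign.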
2720 
2721   auto ThirtyOne = B.buildConstant(S32, 31);
2722   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2724 
2725   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2726   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2727 
2728   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2729   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2730 
2731   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2732   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2733 
2734   if (IsRem) {
2735     auto RSign = LHSign; // Remainder sign is the same as LHS
2736     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2737     B.buildSub(DstReg, UDivRem, RSign);
2738   } else {
2739     auto DSign = B.buildXor(S32, LHSign, RHSign);
2740     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2741     B.buildSub(DstReg, UDivRem, DSign);
2742   }
2743 
2744   MI.eraseFromParent();
2745   return true;
2746 }
2747 
2748 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2749                                             MachineRegisterInfo &MRI,
2750                                             MachineIRBuilder &B) const {
2751   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2752     return legalizeSDIV_SREM32(MI, MRI, B);
2753   return false;
2754 }
2755 
2756 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2757                                                  MachineRegisterInfo &MRI,
2758                                                  MachineIRBuilder &B) const {
2759   Register Res = MI.getOperand(0).getReg();
2760   Register LHS = MI.getOperand(1).getReg();
2761   Register RHS = MI.getOperand(2).getReg();
2762 
2763   uint16_t Flags = MI.getFlags();
2764 
2765   LLT ResTy = MRI.getType(Res);
2766   LLT S32 = LLT::scalar(32);
2767   LLT S64 = LLT::scalar(64);
2768 
2769   const MachineFunction &MF = B.getMF();
2770   bool Unsafe =
2771     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2772 
2773   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2774     return false;
2775 
2776   if (!Unsafe && ResTy == S32 &&
2777       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2778     return false;
2779 
2780   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2781     // 1 / x -> RCP(x)
2782     if (CLHS->isExactlyValue(1.0)) {
2783       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2784         .addUse(RHS)
2785         .setMIFlags(Flags);
2786 
2787       MI.eraseFromParent();
2788       return true;
2789     }
2790 
2791     // -1 / x -> RCP( FNEG(x) )
2792     if (CLHS->isExactlyValue(-1.0)) {
2793       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2794       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2795         .addUse(FNeg.getReg(0))
2796         .setMIFlags(Flags);
2797 
2798       MI.eraseFromParent();
2799       return true;
2800     }
2801   }
2802 
2803   // x / y -> x * (1.0 / y)
2804   if (Unsafe) {
2805     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2806       .addUse(RHS)
2807       .setMIFlags(Flags);
2808     B.buildFMul(Res, LHS, RCP, Flags);
2809 
2810     MI.eraseFromParent();
2811     return true;
2812   }
2813 
2814   return false;
2815 }
2816 
2817 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2818                                          MachineRegisterInfo &MRI,
2819                                          MachineIRBuilder &B) const {
2820   B.setInstr(MI);
2821   Register Res = MI.getOperand(0).getReg();
2822   Register LHS = MI.getOperand(1).getReg();
2823   Register RHS = MI.getOperand(2).getReg();
2824 
2825   uint16_t Flags = MI.getFlags();
2826 
2827   LLT S16 = LLT::scalar(16);
2828   LLT S32 = LLT::scalar(32);
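
  // f16 division is done in f32: extend both operands, multiply the numerator
  // by the f32 reciprocal of the denominator, truncate back to f16, and let
  // div_fixup patch up the special cases (infinities, NaNs and the like).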
2829 
2830   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2831   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2832 
2833   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2834     .addUse(RHSExt.getReg(0))
2835     .setMIFlags(Flags);
2836 
2837   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2838   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2839 
2840   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2841     .addUse(RDst.getReg(0))
2842     .addUse(RHS)
2843     .addUse(LHS)
2844     .setMIFlags(Flags);
2845 
2846   MI.eraseFromParent();
2847   return true;
2848 }
2849 
2850 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2851 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2852 static void toggleSPDenormMode(bool Enable,
2853                                MachineIRBuilder &B,
2854                                const GCNSubtarget &ST,
2855                                AMDGPU::SIModeRegisterDefaults Mode) {
2856   // Set SP denorm mode to this value.
2857   unsigned SPDenormMode =
2858     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
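
  // The FP_DENORM field of the MODE register is 4 bits at offset 4: the low
  // two bits control FP32 denormals and the high two bits control FP64/FP16.
  // S_DENORM_MODE takes the whole 4-bit field as an immediate; older targets
  // use S_SETREG on just the 2-bit FP32 subfield (offset 4, width 2).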
2859 
2860   if (ST.hasDenormModeInst()) {
2861     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2862     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2863 
2864     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2865     B.buildInstr(AMDGPU::S_DENORM_MODE)
2866       .addImm(NewDenormModeValue);
2867 
2868   } else {
2869     // Select FP32 bit field in mode register.
2870     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2871                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2872                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2873 
2874     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2875       .addImm(SPDenormMode)
2876       .addImm(SPDenormModeBitField);
2877   }
2878 }
2879 
2880 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2881                                          MachineRegisterInfo &MRI,
2882                                          MachineIRBuilder &B) const {
2883   B.setInstr(MI);
2884   Register Res = MI.getOperand(0).getReg();
2885   Register LHS = MI.getOperand(1).getReg();
2886   Register RHS = MI.getOperand(2).getReg();
2887   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2888   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2889 
2890   uint16_t Flags = MI.getFlags();
2891 
2892   LLT S32 = LLT::scalar(32);
2893   LLT S1 = LLT::scalar(1);
2894 
2895   auto One = B.buildFConstant(S32, 1.0f);
2896 
2897   auto DenominatorScaled =
2898     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2899       .addUse(LHS)
2900       .addUse(RHS)
2901       .addImm(0)
2902       .setMIFlags(Flags);
2903   auto NumeratorScaled =
2904     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2905       .addUse(LHS)
2906       .addUse(RHS)
2907       .addImm(1)
2908       .setMIFlags(Flags);
2909 
2910   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2911     .addUse(DenominatorScaled.getReg(0))
2912     .setMIFlags(Flags);
2913   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2914 
2915   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2916   // aren't modeled as reading it.
2917   if (!Mode.allFP32Denormals())
2918     toggleSPDenormMode(true, B, ST, Mode);
2919 
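  // Refine the scaled reciprocal and quotient with a chain of FMAs, where n
  // and d are the div_scale'd numerator and denominator:
  //   Fma0 = 1 - d * rcp(d)           (error of the initial estimate)
  //   Fma1 = rcp(d) + rcp(d) * Fma0   (refined reciprocal)
  //   Mul  = n * Fma1                 (initial quotient)
  //   Fma2 = n - d * Mul              (quotient residual)
  //   Fma3 = Mul + Fma1 * Fma2        (refined quotient)
  //   Fma4 = n - d * Fma3             (final residual, consumed by div_fmas)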
2920   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2921   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2922   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2923   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2924   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2925   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2926 
2927   if (!Mode.allFP32Denormals())
2928     toggleSPDenormMode(false, B, ST, Mode);
2929 
2930   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2931     .addUse(Fma4.getReg(0))
2932     .addUse(Fma1.getReg(0))
2933     .addUse(Fma3.getReg(0))
2934     .addUse(NumeratorScaled.getReg(1))
2935     .setMIFlags(Flags);
2936 
2937   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2938     .addUse(Fmas.getReg(0))
2939     .addUse(RHS)
2940     .addUse(LHS)
2941     .setMIFlags(Flags);
2942 
2943   MI.eraseFromParent();
2944   return true;
2945 }
2946 
2947 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2948                                          MachineRegisterInfo &MRI,
2949                                          MachineIRBuilder &B) const {
2950   B.setInstr(MI);
2951   Register Res = MI.getOperand(0).getReg();
2952   Register LHS = MI.getOperand(1).getReg();
2953   Register RHS = MI.getOperand(2).getReg();
2954 
2955   uint16_t Flags = MI.getFlags();
2956 
2957   LLT S64 = LLT::scalar(64);
2958   LLT S1 = LLT::scalar(1);
2959 
2960   auto One = B.buildFConstant(S64, 1.0);
2961 
2962   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2963     .addUse(LHS)
2964     .addUse(RHS)
2965     .addImm(0)
2966     .setMIFlags(Flags);
2967 
2968   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2969 
2970   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2971     .addUse(DivScale0.getReg(0))
2972     .setMIFlags(Flags);
2973 
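  // Refine the reciprocal of the scaled denominator with two FMA iterations,
  // then form the quotient and its residual for div_fmas, mirroring the f32
  // path in f64.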
2974   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2975   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2976   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2977 
2978   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2979     .addUse(LHS)
2980     .addUse(RHS)
2981     .addImm(1)
2982     .setMIFlags(Flags);
2983 
2984   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2985   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2986   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2987 
2988   Register Scale;
2989   if (!ST.hasUsableDivScaleConditionOutput()) {
2990     // Work around a hardware bug on SI where the condition output from div_scale
2991     // is not usable.
2992 
2993     LLT S32 = LLT::scalar(32);
2994 
2995     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2996     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2997     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2998     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2999 
3000     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3001                               Scale1Unmerge.getReg(1));
3002     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3003                               Scale0Unmerge.getReg(1));
3004     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3005   } else {
3006     Scale = DivScale1.getReg(1);
3007   }
3008 
3009   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3010     .addUse(Fma4.getReg(0))
3011     .addUse(Fma3.getReg(0))
3012     .addUse(Mul.getReg(0))
3013     .addUse(Scale)
3014     .setMIFlags(Flags);
3015 
3016   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3017     .addUse(Fmas.getReg(0))
3018     .addUse(RHS)
3019     .addUse(LHS)
3020     .setMIFlags(Flags);
3021 
3022   MI.eraseFromParent();
3023   return true;
3024 }
3025 
3026 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3027                                                  MachineRegisterInfo &MRI,
3028                                                  MachineIRBuilder &B) const {
3029   B.setInstr(MI);
3030   Register Res = MI.getOperand(0).getReg();
3031   Register LHS = MI.getOperand(2).getReg();
3032   Register RHS = MI.getOperand(3).getReg();
3033   uint16_t Flags = MI.getFlags();
3034 
3035   LLT S32 = LLT::scalar(32);
3036   LLT S1 = LLT::scalar(1);
3037 
3038   auto Abs = B.buildFAbs(S32, RHS, Flags);
3039   const APFloat C0Val(1.0f);
3040 
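  // 0x6f800000 is the f32 bit pattern of 2^+96 and 0x2f800000 that of 2^-32.
  // If the magnitude of the denominator exceeds 2^96, pre-scale it by 2^-32
  // before taking the reciprocal and scale the quotient back down by the same
  // factor; otherwise scale by 1.0.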
3041   auto C0 = B.buildConstant(S32, 0x6f800000);
3042   auto C1 = B.buildConstant(S32, 0x2f800000);
3043   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3044 
3045   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3046   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3047 
3048   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3049 
3050   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3051     .addUse(Mul0.getReg(0))
3052     .setMIFlags(Flags);
3053 
3054   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3055 
3056   B.buildFMul(Res, Sel, Mul1, Flags);
3057 
3058   MI.eraseFromParent();
3059   return true;
3060 }
3061 
3062 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3063                                                  MachineRegisterInfo &MRI,
3064                                                  MachineIRBuilder &B) const {
3065   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3066   if (!MFI->isEntryFunction()) {
3067     return legalizePreloadedArgIntrin(MI, MRI, B,
3068                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3069   }
3070 
3071   B.setInstr(MI);
3072 
3073   uint64_t Offset =
3074     ST.getTargetLowering()->getImplicitParameterOffset(
3075       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3076   Register DstReg = MI.getOperand(0).getReg();
3077   LLT DstTy = MRI.getType(DstReg);
3078   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3079 
3080   const ArgDescriptor *Arg;
3081   const TargetRegisterClass *RC;
3082   std::tie(Arg, RC)
3083     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3084   if (!Arg)
3085     return false;
3086 
3087   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3088   if (!loadInputValue(KernargPtrReg, B, Arg))
3089     return false;
3090 
3091   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3092   MI.eraseFromParent();
3093   return true;
3094 }
3095 
3096 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3097                                               MachineRegisterInfo &MRI,
3098                                               MachineIRBuilder &B,
3099                                               unsigned AddrSpace) const {
3100   B.setInstr(MI);
3101   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3102   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3103   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3104   MI.eraseFromParent();
3105   return true;
3106 }
3107 
3108 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3109 // offset (the offset that is included in bounds checking and swizzling, to be
3110 // split between the instruction's voffset and immoffset fields) and soffset
3111 // (the offset that is excluded from bounds checking and swizzling, to go in
3112 // the instruction's soffset field).  This function takes the first kind of
3113 // offset and figures out how to split it between voffset and immoffset.
3114 std::tuple<Register, unsigned, unsigned>
3115 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3116                                         Register OrigOffset) const {
3117   const unsigned MaxImm = 4095;
3118   Register BaseReg;
3119   unsigned TotalConstOffset;
3120   MachineInstr *OffsetDef;
3121   const LLT S32 = LLT::scalar(32);
3122 
3123   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3124     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3125 
3126   unsigned ImmOffset = TotalConstOffset;
3127 
3128   // If the immediate value is too big for the immoffset field, put only its
3129   // low 12 bits (the value modulo 4096) into the immoffset field, so that the
3130   // value copied/added for the voffset field is a multiple of 4096 and stands
3131   // more chance of being CSEd with the copy/add for another similar load/store.
3132   // However, do not do that rounding down to a multiple of 4096 if that
3133   // multiple is negative, as it appears to be illegal to have a negative offset
3134   // in the vgpr, even if adding the immediate offset makes it positive.
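  // For example, a total constant offset of 5000 is split into Overflow = 4096
  // (folded into the voffset below) and ImmOffset = 904.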
3135   unsigned Overflow = ImmOffset & ~MaxImm;
3136   ImmOffset -= Overflow;
3137   if ((int32_t)Overflow < 0) {
3138     Overflow += ImmOffset;
3139     ImmOffset = 0;
3140   }
3141 
3142   if (Overflow != 0) {
3143     if (!BaseReg) {
3144       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3145     } else {
3146       auto OverflowVal = B.buildConstant(S32, Overflow);
3147       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3148     }
3149   }
3150 
3151   if (!BaseReg)
3152     BaseReg = B.buildConstant(S32, 0).getReg(0);
3153 
3154   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3155 }
3156 
3157 /// Handle register layout difference for f16 images for some subtargets.
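/// For example, on a subtarget with unpacked d16 memory instructions, a
/// <4 x s16> value is rewritten as a <4 x s32> build_vector whose elements are
/// the any-extended s16 components.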
3158 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3159                                              MachineRegisterInfo &MRI,
3160                                              Register Reg) const {
3161   if (!ST.hasUnpackedD16VMem())
3162     return Reg;
3163 
3164   const LLT S16 = LLT::scalar(16);
3165   const LLT S32 = LLT::scalar(32);
3166   LLT StoreVT = MRI.getType(Reg);
3167   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3168 
3169   auto Unmerge = B.buildUnmerge(S16, Reg);
3170 
3171   SmallVector<Register, 4> WideRegs;
3172   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3173     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3174 
3175   int NumElts = StoreVT.getNumElements();
3176 
3177   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3178 }
3179 
3180 Register AMDGPULegalizerInfo::fixStoreSourceType(
3181   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3182   MachineRegisterInfo *MRI = B.getMRI();
3183   LLT Ty = MRI->getType(VData);
3184 
3185   const LLT S16 = LLT::scalar(16);
3186 
3187   // Fixup illegal register types for i8 stores.
3188   if (Ty == LLT::scalar(8) || Ty == S16) {
3189     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3190     return AnyExt;
3191   }
3192 
3193   if (Ty.isVector()) {
3194     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3195       if (IsFormat)
3196         return handleD16VData(B, *MRI, VData);
3197     }
3198   }
3199 
3200   return VData;
3201 }
3202 
3203 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3204                                               MachineRegisterInfo &MRI,
3205                                               MachineIRBuilder &B,
3206                                               bool IsTyped,
3207                                               bool IsFormat) const {
3208   B.setInstr(MI);
3209 
3210   Register VData = MI.getOperand(1).getReg();
3211   LLT Ty = MRI.getType(VData);
3212   LLT EltTy = Ty.getScalarType();
3213   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3214   const LLT S32 = LLT::scalar(32);
3215 
3216   VData = fixStoreSourceType(B, VData, IsFormat);
3217   Register RSrc = MI.getOperand(2).getReg();
3218 
3219   MachineMemOperand *MMO = *MI.memoperands_begin();
3220   const int MemSize = MMO->getSize();
3221 
3222   unsigned ImmOffset;
3223   unsigned TotalOffset;
3224 
3225   // The typed intrinsics add an immediate after the registers.
3226   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3227 
3228   // The struct intrinsic variants add one additional operand over raw.
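  // For the raw, non-typed form the operands are {id, vdata, rsrc, voffset,
  // soffset, aux}; the struct forms insert vindex after rsrc, and the typed
  // forms insert a format immediate before aux.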
3229   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3230   Register VIndex;
3231   int OpOffset = 0;
3232   if (HasVIndex) {
3233     VIndex = MI.getOperand(3).getReg();
3234     OpOffset = 1;
3235   }
3236 
3237   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3238   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3239 
3240   unsigned Format = 0;
3241   if (IsTyped) {
3242     Format = MI.getOperand(5 + OpOffset).getImm();
3243     ++OpOffset;
3244   }
3245 
3246   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3247 
3248   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3249   if (TotalOffset != 0)
3250     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3251 
3252   unsigned Opc;
3253   if (IsTyped) {
3254     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3255                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3256   } else if (IsFormat) {
3257     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3258                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3259   } else {
3260     switch (MemSize) {
3261     case 1:
3262       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3263       break;
3264     case 2:
3265       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3266       break;
3267     default:
3268       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3269       break;
3270     }
3271   }
3272 
3273   if (!VIndex)
3274     VIndex = B.buildConstant(S32, 0).getReg(0);
3275 
3276   auto MIB = B.buildInstr(Opc)
3277     .addUse(VData)              // vdata
3278     .addUse(RSrc)               // rsrc
3279     .addUse(VIndex)             // vindex
3280     .addUse(VOffset)            // voffset
3281     .addUse(SOffset)            // soffset
3282     .addImm(ImmOffset);         // offset(imm)
3283 
3284   if (IsTyped)
3285     MIB.addImm(Format);
3286 
3287   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3288      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3289      .addMemOperand(MMO);
3290 
3291   MI.eraseFromParent();
3292   return true;
3293 }
3294 
3295 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3296                                              MachineRegisterInfo &MRI,
3297                                              MachineIRBuilder &B,
3298                                              bool IsFormat,
3299                                              bool IsTyped) const {
3300   B.setInstr(MI);
3301 
3302   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3303   MachineMemOperand *MMO = *MI.memoperands_begin();
3304   const int MemSize = MMO->getSize();
3305   const LLT S32 = LLT::scalar(32);
3306 
3307   Register Dst = MI.getOperand(0).getReg();
3308   Register RSrc = MI.getOperand(2).getReg();
3309 
3310   // The typed intrinsics add an immediate after the registers.
3311   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3312 
3313   // The struct intrinsic variants add one additional operand over raw.
3314   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3315   Register VIndex;
3316   int OpOffset = 0;
3317   if (HasVIndex) {
3318     VIndex = MI.getOperand(3).getReg();
3319     OpOffset = 1;
3320   }
3321 
3322   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3323   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3324 
3325   unsigned Format = 0;
3326   if (IsTyped) {
3327     Format = MI.getOperand(5 + OpOffset).getImm();
3328     ++OpOffset;
3329   }
3330 
3331   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3332   unsigned ImmOffset;
3333   unsigned TotalOffset;
3334 
3335   LLT Ty = MRI.getType(Dst);
3336   LLT EltTy = Ty.getScalarType();
3337   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3338   const bool Unpacked = ST.hasUnpackedD16VMem();
3339 
3340   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3341   if (TotalOffset != 0)
3342     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3343 
3344   unsigned Opc;
3345 
3346   if (IsTyped) {
3347     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3348                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3349   } else if (IsFormat) {
3350     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3351                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3352   } else {
3353     switch (MemSize) {
3354     case 1:
3355       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3356       break;
3357     case 2:
3358       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3359       break;
3360     default:
3361       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3362       break;
3363     }
3364   }
3365 
3366   Register LoadDstReg;
3367 
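  // Sub-dword loads and scalar d16 loads produce a 32-bit result that is
  // truncated back below; packed d16 vectors on unpacked subtargets are loaded
  // with one 32-bit register per element and repacked afterwards.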
3368   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3369   LLT UnpackedTy = Ty.changeElementSize(32);
3370 
3371   if (IsExtLoad)
3372     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3373   else if (Unpacked && IsD16 && Ty.isVector())
3374     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3375   else
3376     LoadDstReg = Dst;
3377 
3378   if (!VIndex)
3379     VIndex = B.buildConstant(S32, 0).getReg(0);
3380 
3381   auto MIB = B.buildInstr(Opc)
3382     .addDef(LoadDstReg)         // vdata
3383     .addUse(RSrc)               // rsrc
3384     .addUse(VIndex)             // vindex
3385     .addUse(VOffset)            // voffset
3386     .addUse(SOffset)            // soffset
3387     .addImm(ImmOffset);         // offset(imm)
3388 
3389   if (IsTyped)
3390     MIB.addImm(Format);
3391 
3392   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3393      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3394      .addMemOperand(MMO);
3395 
3396   if (LoadDstReg != Dst) {
3397     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3398 
3399     // The result of an extending load was widened to 32 bits; truncate it back.
3400     if (IsExtLoad)
3401       B.buildTrunc(Dst, LoadDstReg);
3402     else {
3403       // Repack to original 16-bit vector result
3404       // FIXME: G_TRUNC should work, but legalization currently fails
3405       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3406       SmallVector<Register, 4> Repack;
3407       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3408         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3409       B.buildMerge(Dst, Repack);
3410     }
3411   }
3412 
3413   MI.eraseFromParent();
3414   return true;
3415 }
3416 
3417 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3418                                                MachineIRBuilder &B,
3419                                                bool IsInc) const {
3420   B.setInstr(MI);
3421   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3422                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3423   B.buildInstr(Opc)
3424     .addDef(MI.getOperand(0).getReg())
3425     .addUse(MI.getOperand(2).getReg())
3426     .addUse(MI.getOperand(3).getReg())
3427     .cloneMemRefs(MI);
3428   MI.eraseFromParent();
3429   return true;
3430 }
3431 
3432 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3433   switch (IntrID) {
3434   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3435   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3436     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3437   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3438   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3439     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3440   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3441   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3442     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3443   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3444   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3445     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3446   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3447   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3448     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3449   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3450   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3451     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3452   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3453   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3454     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3455   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3456   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3457     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3458   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3459   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3460     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3461   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3462   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3463     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3464   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3466     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3467   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3468   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3469     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3470   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3471   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3472     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3473   default:
3474     llvm_unreachable("unhandled atomic opcode");
3475   }
3476 }
3477 
3478 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3479                                                MachineIRBuilder &B,
3480                                                Intrinsic::ID IID) const {
3481   B.setInstr(MI);
3482 
3483   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3484                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3485 
3486   Register Dst = MI.getOperand(0).getReg();
3487   Register VData = MI.getOperand(2).getReg();
3488 
3489   Register CmpVal;
3490   int OpOffset = 0;
3491 
3492   if (IsCmpSwap) {
3493     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3494     ++OpOffset;
3495   }
3496 
3497   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3498   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3499 
3500   // The struct intrinsic variants add one additional operand over raw.
3501   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3502   Register VIndex;
3503   if (HasVIndex) {
3504     VIndex = MI.getOperand(4 + OpOffset).getReg();
3505     ++OpOffset;
3506   }
3507 
3508   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3509   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3510   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3511 
3512   MachineMemOperand *MMO = *MI.memoperands_begin();
3513 
3514   unsigned ImmOffset;
3515   unsigned TotalOffset;
3516   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3517   if (TotalOffset != 0)
3518     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3519 
3520   if (!VIndex)
3521     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3522 
3523   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3524     .addDef(Dst)
3525     .addUse(VData); // vdata
3526 
3527   if (IsCmpSwap)
3528     MIB.addReg(CmpVal);
3529 
3530   MIB.addUse(RSrc)               // rsrc
3531      .addUse(VIndex)             // vindex
3532      .addUse(VOffset)            // voffset
3533      .addUse(SOffset)            // soffset
3534      .addImm(ImmOffset)          // offset(imm)
3535      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3536      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3537      .addMemOperand(MMO);
3538 
3539   MI.eraseFromParent();
3540   return true;
3541 }
3542 
3543 /// Pack a set of s16 typed address registers from the image instruction \p MI
3544 /// into dword sized <2 x s16> registers, appending them to \p PackedAddrs.
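/// For example, the s16 (s, t) coordinates of a 2D sample become a single
/// <2 x s16> register, and a trailing unpaired coordinate is packed with
/// undef.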
3545 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3546                                         SmallVectorImpl<Register> &PackedAddrs,
3547                                         int AddrIdx, int DimIdx, int NumVAddrs,
3548                                         int NumGradients) {
3549   const LLT S16 = LLT::scalar(16);
3550   const LLT V2S16 = LLT::vector(2, 16);
3551 
3552   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3553     MachineOperand &SrcOp = MI.getOperand(I);
3554     if (!SrcOp.isReg())
3555       continue; // _L to _LZ may have eliminated this.
3556 
3557     Register AddrReg = SrcOp.getReg();
3558 
3559     if (I < DimIdx) {
3560       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3561       PackedAddrs.push_back(AddrReg);
3562     } else {
3563       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3564       // derivatives dx/dh and dx/dv are packed with undef.
3565       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3566           ((NumGradients / 2) % 2 == 1 &&
3567            (I == DimIdx + (NumGradients / 2) - 1 ||
3568             I == DimIdx + NumGradients - 1)) ||
3569           // Check for _L to _LZ optimization
3570           !MI.getOperand(I + 1).isReg()) {
3571         PackedAddrs.push_back(
3572             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3573                 .getReg(0));
3574       } else {
3575         PackedAddrs.push_back(
3576             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3577                 .getReg(0));
3578         ++I;
3579       }
3580     }
3581   }
3582 }
3583 
3584 /// Convert from separate vaddr components to a single vector address register,
3585 /// and replace the remaining operands with $noreg.
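/// For example, five s32 address operands are padded with undef to an
/// <8 x s32> build_vector placed in the first vaddr slot, and the remaining
/// vaddr operands are replaced with $noreg.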
3586 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3587                                      int DimIdx, int NumVAddrs) {
3588   const LLT S32 = LLT::scalar(32);
3589 
3590   SmallVector<Register, 8> AddrRegs;
3591   for (int I = 0; I != NumVAddrs; ++I) {
3592     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3593     if (SrcOp.isReg()) {
3594       AddrRegs.push_back(SrcOp.getReg());
3595       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3596     }
3597   }
3598 
3599   int NumAddrRegs = AddrRegs.size();
3600   if (NumAddrRegs != 1) {
3601     // Round up to 8 elements for v5-v7
3602     // FIXME: Missing intermediate sized register classes and instructions.
3603     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3604       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3605       auto Undef = B.buildUndef(S32);
3606       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3607       NumAddrRegs = RoundedNumRegs;
3608     }
3609 
3610     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3611     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3612   }
3613 
3614   for (int I = 1; I != NumVAddrs; ++I) {
3615     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3616     if (SrcOp.isReg())
3617       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3618   }
3619 }
3620 
3621 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3622 ///
3623 /// Depending on the subtarget, loads and stores with 16-bit element data need to be
3624 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3625 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3626 /// registers.
3627 ///
3628 /// We don't want to directly select image instructions just yet, but also want
3629 /// to expose all register repacking to the legalizer/combiners. We also don't
3630 /// want a selected instruction entering RegBankSelect. In order to avoid
3631 /// defining a multitude of intermediate image instructions, directly hack on
3632 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3633 /// now unnecessary arguments with $noreg.
3634 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3635     MachineInstr &MI, MachineIRBuilder &B,
3636     GISelChangeObserver &Observer,
3637     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3638   B.setInstr(MI);
3639 
3640   const int NumDefs = MI.getNumExplicitDefs();
3641   bool IsTFE = NumDefs == 2;
3642   // We are only processing the operands of d16 image operations on subtargets
3643   // that use the unpacked register layout, or need to repack the TFE result.
3644 
3645   // TODO: Do we need to guard against already legalized intrinsics?
3646   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3647     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3648 
3649   MachineRegisterInfo *MRI = B.getMRI();
3650   const LLT S32 = LLT::scalar(32);
3651   const LLT S16 = LLT::scalar(16);
3652   const LLT V2S16 = LLT::vector(2, 16);
3653 
3654   // Index of first address argument
3655   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3656 
3657   // Check for 16 bit addresses and pack if true.
3658   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3659   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3660   const bool IsA16 = AddrTy == S16;
3661 
3662   int NumVAddrs, NumGradients;
3663   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3664   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3665     getDMaskIdx(BaseOpcode, NumDefs);
3666   unsigned DMask = 0;
3667 
3668   int DMaskLanes = 0;
3669   if (!BaseOpcode->Atomic) {
3670     DMask = MI.getOperand(DMaskIdx).getImm();
3671     if (BaseOpcode->Gather4) {
3672       DMaskLanes = 4;
3673     } else if (DMask != 0) {
3674       DMaskLanes = countPopulation(DMask);
3675     } else if (!IsTFE && !BaseOpcode->Store) {
3676       // If dmask is 0, this is a no-op load. This can be eliminated.
3677       B.buildUndef(MI.getOperand(0));
3678       MI.eraseFromParent();
3679       return true;
3680     }
3681   }
3682 
3683   Observer.changingInstr(MI);
3684   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3685 
3686   unsigned NewOpcode = NumDefs == 0 ?
3687     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3688 
3689   // Track that we legalized this
3690   MI.setDesc(B.getTII().get(NewOpcode));
3691 
3692   // We expect to get an error flag since TFC is on and dmask is 0. Force
3693   // dmask to be at least 1, otherwise the instruction will fail.
3694   if (IsTFE && DMask == 0) {
3695     DMask = 0x1;
3696     DMaskLanes = 1;
3697     MI.getOperand(DMaskIdx).setImm(DMask);
3698   }
3699 
3700   if (BaseOpcode->Atomic) {
3701     Register VData0 = MI.getOperand(2).getReg();
3702     LLT Ty = MRI->getType(VData0);
3703 
3704     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3705     if (Ty.isVector())
3706       return false;
3707 
3708     if (BaseOpcode->AtomicX2) {
3709       Register VData1 = MI.getOperand(3).getReg();
3710       // The two values are packed in one register.
3711       LLT PackedTy = LLT::vector(2, Ty);
3712       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3713       MI.getOperand(2).setReg(Concat.getReg(0));
3714       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3715     }
3716   }
3717 
3718   int CorrectedNumVAddrs = NumVAddrs;
3719 
3720   // Optimize _L to _LZ when _L is zero
3721   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3722         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3723     const ConstantFP *ConstantLod;
3724     const int LodIdx = AddrIdx + NumVAddrs - 1;
3725 
3726     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3727       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3728         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3729         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3730           LZMappingInfo->LZ, ImageDimIntr->Dim);
3731 
3732         // The starting indexes should remain in the same place.
3733         --NumVAddrs;
3734         --CorrectedNumVAddrs;
3735 
3736         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3737           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3738         MI.RemoveOperand(LodIdx);
3739       }
3740     }
3741   }
3742 
3743   // Optimize _mip away, when 'lod' is zero
3744   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3745     int64_t ConstantLod;
3746     const int LodIdx = AddrIdx + NumVAddrs - 1;
3747 
3748     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3749       if (ConstantLod == 0) {
3750         // TODO: Change the intrinsic opcode and remove the operand instead of
3751         // replacing it with 0, as is done in the _L to _LZ handling above.
3752         MI.getOperand(LodIdx).ChangeToImmediate(0);
3753         --CorrectedNumVAddrs;
3754       }
3755     }
3756   }
3757 
3758   // If the register allocator cannot place the address registers contiguously
3759   // without introducing moves, then using the non-sequential address encoding
3760   // is always preferable, since it saves VALU instructions and is usually a
3761   // wash in terms of code size or even better.
3762   //
3763   // However, we currently have no way of hinting to the register allocator
3764   // that MIMG addresses should be placed contiguously when it is possible to
3765   // do so, so force non-NSA for the common 2-address case as a heuristic.
3766   //
3767   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3768   // allocation when possible.
3769   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3770 
3771   // Rewrite the addressing register layout before doing anything else.
3772   if (IsA16) {
3773     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3774     // should be introduced.
3775     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3776       return false;
3777 
3778     if (NumVAddrs > 1) {
3779       SmallVector<Register, 4> PackedRegs;
3780       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3781                                   NumGradients);
3782 
3783       if (!UseNSA && PackedRegs.size() > 1) {
3784         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3785         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3786         PackedRegs[0] = Concat.getReg(0);
3787         PackedRegs.resize(1);
3788       }
3789 
3790       const int NumPacked = PackedRegs.size();
3791       for (int I = 0; I != NumVAddrs; ++I) {
3792         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3793         if (!SrcOp.isReg()) {
3794           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3795           continue;
3796         }
3797 
3798         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3799 
3800         if (I < NumPacked)
3801           SrcOp.setReg(PackedRegs[I]);
3802         else
3803           SrcOp.setReg(AMDGPU::NoRegister);
3804       }
3805     }
3806   } else if (!UseNSA && NumVAddrs > 1) {
3807     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3808   }
3809 
3810 
3811   if (BaseOpcode->Store) { // No TFE for stores?
3812     // TODO: Handle dmask trim
3813     Register VData = MI.getOperand(1).getReg();
3814     LLT Ty = MRI->getType(VData);
3815     if (!Ty.isVector() || Ty.getElementType() != S16)
3816       return true;
3817 
3818     B.setInstr(MI);
3819 
3820     Register RepackedReg = handleD16VData(B, *MRI, VData);
3821     if (RepackedReg != VData) {
3822       MI.getOperand(1).setReg(RepackedReg);
3823     }
3824 
3825     return true;
3826   }
3827 
3828   Register DstReg = MI.getOperand(0).getReg();
3829   LLT Ty = MRI->getType(DstReg);
3830   const LLT EltTy = Ty.getScalarType();
3831   const bool IsD16 = Ty.getScalarType() == S16;
3832   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3833 
3834   // Confirm that the return type is large enough for the dmask specified
3835   if (NumElts < DMaskLanes)
3836     return false;
3837 
3838   if (NumElts > 4 || DMaskLanes > 4)
3839     return false;
3840 
3841   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3842   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3843 
3844   // The raw dword aligned data component of the load. The only legal cases
3845   // where this matters should be when using the packed D16 format, for
3846   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3847   LLT RoundedTy;
3848 
3849   // S32 vector to cover all data, plus TFE result element.
3850   LLT TFETy;
3851 
3852   // Register type to use for each loaded component. Will be S32 or V2S16.
3853   LLT RegTy;
3854 
3855   if (IsD16 && ST.hasUnpackedD16VMem()) {
3856     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3857     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3858     RegTy = S32;
3859   } else {
3860     unsigned EltSize = EltTy.getSizeInBits();
3861     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3862     unsigned RoundedSize = 32 * RoundedElts;
3863     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3864     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3865     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3866   }
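  // For example, a packed-d16 <3 x s16> result gives RoundedTy = <4 x s16>,
  // TFETy = <3 x s32>, and RegTy = <2 x s16> when TFE is not enabled.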
3867 
3868   // The return type does not need adjustment.
3869   // TODO: Should we change s16 case to s32 or <2 x s16>?
3870   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3871     return true;
3872 
3873   Register Dst1Reg;
3874 
3875   // Insert after the instruction.
3876   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3877 
3878   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3879   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3880   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3881   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3882 
3883   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3884 
3885   MI.getOperand(0).setReg(NewResultReg);
3886 
3887   // In the IR, TFE is supposed to be used with a 2 element struct return
3888   // type. The instruction really returns these two values in one contiguous
3889   // register, with one additional dword beyond the loaded data. Rewrite the
3890   // return type to use a single register result.
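  // e.g. a TFE <4 x s32> load is rewritten to define a single <5 x s32>
  // register, which is unmerged into the data elements and the status dword
  // below.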
3891 
3892   if (IsTFE) {
3893     Dst1Reg = MI.getOperand(1).getReg();
3894     if (MRI->getType(Dst1Reg) != S32)
3895       return false;
3896 
3897     // TODO: Make sure the TFE operand bit is set.
3898     MI.RemoveOperand(1);
3899 
3900     // Handle the easy case that requires no repack instructions.
3901     if (Ty == S32) {
3902       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3903       return true;
3904     }
3905   }
3906 
3907   // Now figure out how to copy the new result register back into the old
3908   // result.
3909   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3910 
3911   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3912 
3913   if (ResultNumRegs == 1) {
3914     assert(!IsTFE);
3915     ResultRegs[0] = NewResultReg;
3916   } else {
3917     // We have to repack into a new vector of some kind.
3918     for (int I = 0; I != NumDataRegs; ++I)
3919       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3920     B.buildUnmerge(ResultRegs, NewResultReg);
3921 
3922     // Drop the final TFE element to get the data part. The TFE result is
3923     // directly written to the right place already.
3924     if (IsTFE)
3925       ResultRegs.resize(NumDataRegs);
3926   }
3927 
3928   // For an s16 scalar result, we form an s32 result with a truncate regardless
3929   // of packed vs. unpacked.
3930   if (IsD16 && !Ty.isVector()) {
3931     B.buildTrunc(DstReg, ResultRegs[0]);
3932     return true;
3933   }
3934 
3935   // Avoid a build/concat_vector of 1 entry.
3936   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3937     B.buildBitcast(DstReg, ResultRegs[0]);
3938     return true;
3939   }
3940 
3941   assert(Ty.isVector());
3942 
3943   if (IsD16) {
3944     // For packed D16 results with TFE enabled, all the data components are
3945     // S32. Cast back to the expected type.
3946     //
3947     // TODO: We don't really need to load s32 elements. We would only need one
3948     // cast for the TFE result if a multiple of v2s16 was used.
3949     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3950       for (Register &Reg : ResultRegs)
3951         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3952     } else if (ST.hasUnpackedD16VMem()) {
3953       for (Register &Reg : ResultRegs)
3954         Reg = B.buildTrunc(S16, Reg).getReg(0);
3955     }
3956   }
3957 
3958   auto padWithUndef = [&](LLT Ty, int NumElts) {
3959     if (NumElts == 0)
3960       return;
3961     Register Undef = B.buildUndef(Ty).getReg(0);
3962     for (int I = 0; I != NumElts; ++I)
3963       ResultRegs.push_back(Undef);
3964   };
3965 
3966   // Pad out any elements eliminated due to the dmask.
3967   LLT ResTy = MRI->getType(ResultRegs[0]);
3968   if (!ResTy.isVector()) {
3969     padWithUndef(ResTy, NumElts - ResultRegs.size());
3970     B.buildBuildVector(DstReg, ResultRegs);
3971     return true;
3972   }
3973 
3974   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3975   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3976 
3977   // Deal with the one annoying legal case.
3978   const LLT V3S16 = LLT::vector(3, 16);
3979   if (Ty == V3S16) {
3980     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3981     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3982     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3983     return true;
3984   }
3985 
3986   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3987   B.buildConcatVectors(DstReg, ResultRegs);
3988   return true;
3989 }
3990 
3991 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3992   MachineInstr &MI, MachineIRBuilder &B,
3993   GISelChangeObserver &Observer) const {
3994   Register Dst = MI.getOperand(0).getReg();
3995   LLT Ty = B.getMRI()->getType(Dst);
3996   unsigned Size = Ty.getSizeInBits();
3997   MachineFunction &MF = B.getMF();
3998 
3999   Observer.changingInstr(MI);
4000 
4001   // FIXME: We don't really need this intermediate instruction. The intrinsic
4002   // should be fixed to have a memory operand. Since it's readnone, we're not
4003   // allowed to add one.
4004   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4005   MI.RemoveOperand(1); // Remove intrinsic ID
4006 
4007   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4008   // TODO: Should this use datalayout alignment?
4009   const unsigned MemSize = (Size + 7) / 8;
4010   const Align MemAlign(4);
4011   MachineMemOperand *MMO = MF.getMachineMemOperand(
4012       MachinePointerInfo(),
4013       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4014           MachineMemOperand::MOInvariant,
4015       MemSize, MemAlign);
4016   MI.addMemOperand(MF, MMO);
4017 
4018   // There are no 96-bit result scalar loads, but widening to 128-bit should
4019   // always be legal. We may need to restore this to a 96-bit result if it turns
4020   // out this needs to be converted to a vector load during RegBankSelect.
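  // e.g. an s96 result is widened to s128, and a <3 x s32> result is padded to
  // <4 x s32>.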
4021   if (!isPowerOf2_32(Size)) {
4022     LegalizerHelper Helper(MF, *this, Observer, B);
4023     B.setInstr(MI);
4024 
4025     if (Ty.isVector())
4026       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4027     else
4028       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4029   }
4030 
4031   Observer.changedInstr(MI);
4032   return true;
4033 }
4034 
4035 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4036                                                 MachineRegisterInfo &MRI,
4037                                                 MachineIRBuilder &B) const {
4038   B.setInstr(MI);
4039 
4040   // On a non-HSA path, or if the trap handler is disabled, just emit s_endpgm.
4041   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4042       !ST.isTrapHandlerEnabled()) {
4043     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4044   } else {
4045     // Pass queue pointer to trap handler as input, and insert trap instruction
4046     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4047     const ArgDescriptor *Arg =
4048         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4049     if (!Arg)
4050       return false;
4051     MachineRegisterInfo &MRI = *B.getMRI();
4052     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4053     Register LiveIn = getLiveInRegister(
4054         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4055         /*InsertLiveInCopy=*/false);
4056     if (!loadInputValue(LiveIn, B, Arg))
4057       return false;
4058     B.buildCopy(SGPR01, LiveIn);
4059     B.buildInstr(AMDGPU::S_TRAP)
4060         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4061         .addReg(SGPR01, RegState::Implicit);
4062   }
4063 
4064   MI.eraseFromParent();
4065   return true;
4066 }
4067 
4068 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4069     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4070   B.setInstr(MI);
4071 
4072   // On a non-HSA path, or if the trap handler is disabled, report a warning
4073   // instead.
4074   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4075       !ST.isTrapHandlerEnabled()) {
4076     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4077                                      "debugtrap handler not supported",
4078                                      MI.getDebugLoc(), DS_Warning);
4079     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4080     Ctx.diagnose(NoTrap);
4081   } else {
4082     // Insert debug-trap instruction
4083     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4084   }
4085 
4086   MI.eraseFromParent();
4087   return true;
4088 }
4089 
4090 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4091                                             MachineIRBuilder &B,
4092                                             GISelChangeObserver &Observer) const {
4093   MachineRegisterInfo &MRI = *B.getMRI();
4094 
4095   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4096   auto IntrID = MI.getIntrinsicID();
4097   switch (IntrID) {
4098   case Intrinsic::amdgcn_if:
4099   case Intrinsic::amdgcn_else: {
4100     MachineInstr *Br = nullptr;
4101     MachineBasicBlock *UncondBrTarget = nullptr;
4102     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4103       const SIRegisterInfo *TRI
4104         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4105 
4106       B.setInstr(*BrCond);
4107       Register Def = MI.getOperand(1).getReg();
4108       Register Use = MI.getOperand(3).getReg();
4109 
4110       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4111       if (IntrID == Intrinsic::amdgcn_if) {
4112         B.buildInstr(AMDGPU::SI_IF)
4113           .addDef(Def)
4114           .addUse(Use)
4115           .addMBB(UncondBrTarget);
4116       } else {
4117         B.buildInstr(AMDGPU::SI_ELSE)
4118           .addDef(Def)
4119           .addUse(Use)
4120           .addMBB(UncondBrTarget)
4121           .addImm(0);
4122       }
4123 
4124       if (Br) {
4125         Br->getOperand(0).setMBB(CondBrTarget);
4126       } else {
4127         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4128         // since we're swapping branch targets it needs to be reinserted.
4129         // FIXME: IRTranslator should probably not do this
4130         B.buildBr(*CondBrTarget);
4131       }
4132 
4133       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4134       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4135       MI.eraseFromParent();
4136       BrCond->eraseFromParent();
4137       return true;
4138     }
4139 
4140     return false;
4141   }
4142   case Intrinsic::amdgcn_loop: {
4143     MachineInstr *Br = nullptr;
4144     MachineBasicBlock *UncondBrTarget = nullptr;
4145     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4146       const SIRegisterInfo *TRI
4147         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4148 
4149       B.setInstr(*BrCond);
4150 
4151       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4152       Register Reg = MI.getOperand(2).getReg();
4153       B.buildInstr(AMDGPU::SI_LOOP)
4154         .addUse(Reg)
4155         .addMBB(UncondBrTarget);
4156 
4157       if (Br)
4158         Br->getOperand(0).setMBB(CondBrTarget);
4159       else
4160         B.buildBr(*CondBrTarget);
4161 
4162       MI.eraseFromParent();
4163       BrCond->eraseFromParent();
4164       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4165       return true;
4166     }
4167 
4168     return false;
4169   }
4170   case Intrinsic::amdgcn_kernarg_segment_ptr:
4171     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4172       B.setInstr(MI);
4173       // This only makes sense to call in a kernel, so just lower to null.
4174       B.buildConstant(MI.getOperand(0).getReg(), 0);
4175       MI.eraseFromParent();
4176       return true;
4177     }
4178 
4179     return legalizePreloadedArgIntrin(
4180       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4181   case Intrinsic::amdgcn_implicitarg_ptr:
4182     return legalizeImplicitArgPtr(MI, MRI, B);
4183   case Intrinsic::amdgcn_workitem_id_x:
4184     return legalizePreloadedArgIntrin(MI, MRI, B,
4185                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4186   case Intrinsic::amdgcn_workitem_id_y:
4187     return legalizePreloadedArgIntrin(MI, MRI, B,
4188                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4189   case Intrinsic::amdgcn_workitem_id_z:
4190     return legalizePreloadedArgIntrin(MI, MRI, B,
4191                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4192   case Intrinsic::amdgcn_workgroup_id_x:
4193     return legalizePreloadedArgIntrin(MI, MRI, B,
4194                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4195   case Intrinsic::amdgcn_workgroup_id_y:
4196     return legalizePreloadedArgIntrin(MI, MRI, B,
4197                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4198   case Intrinsic::amdgcn_workgroup_id_z:
4199     return legalizePreloadedArgIntrin(MI, MRI, B,
4200                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4201   case Intrinsic::amdgcn_dispatch_ptr:
4202     return legalizePreloadedArgIntrin(MI, MRI, B,
4203                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4204   case Intrinsic::amdgcn_queue_ptr:
4205     return legalizePreloadedArgIntrin(MI, MRI, B,
4206                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4207   case Intrinsic::amdgcn_implicit_buffer_ptr:
4208     return legalizePreloadedArgIntrin(
4209       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4210   case Intrinsic::amdgcn_dispatch_id:
4211     return legalizePreloadedArgIntrin(MI, MRI, B,
4212                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4213   case Intrinsic::amdgcn_fdiv_fast:
4214     return legalizeFDIVFastIntrin(MI, MRI, B);
4215   case Intrinsic::amdgcn_is_shared:
4216     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4217   case Intrinsic::amdgcn_is_private:
4218     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4219   case Intrinsic::amdgcn_wavefrontsize: {
4220     B.setInstr(MI);
4221     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4222     MI.eraseFromParent();
4223     return true;
4224   }
4225   case Intrinsic::amdgcn_s_buffer_load:
4226     return legalizeSBufferLoad(MI, B, Observer);
4227   case Intrinsic::amdgcn_raw_buffer_store:
4228   case Intrinsic::amdgcn_struct_buffer_store:
4229     return legalizeBufferStore(MI, MRI, B, false, false);
4230   case Intrinsic::amdgcn_raw_buffer_store_format:
4231   case Intrinsic::amdgcn_struct_buffer_store_format:
4232     return legalizeBufferStore(MI, MRI, B, false, true);
4233   case Intrinsic::amdgcn_raw_tbuffer_store:
4234   case Intrinsic::amdgcn_struct_tbuffer_store:
4235     return legalizeBufferStore(MI, MRI, B, true, true);
4236   case Intrinsic::amdgcn_raw_buffer_load:
4237   case Intrinsic::amdgcn_struct_buffer_load:
4238     return legalizeBufferLoad(MI, MRI, B, false, false);
4239   case Intrinsic::amdgcn_raw_buffer_load_format:
4240   case Intrinsic::amdgcn_struct_buffer_load_format:
4241     return legalizeBufferLoad(MI, MRI, B, true, false);
4242   case Intrinsic::amdgcn_raw_tbuffer_load:
4243   case Intrinsic::amdgcn_struct_tbuffer_load:
4244     return legalizeBufferLoad(MI, MRI, B, true, true);
4245   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4246   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4247   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4248   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4249   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4250   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4251   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4252   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4253   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4254   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4255   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4256   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4257   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4258   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4259   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4260   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4261   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4262   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4263   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4264   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4265   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4266   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4267   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4269   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4270   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4271     return legalizeBufferAtomic(MI, B, IntrID);
4272   case Intrinsic::amdgcn_atomic_inc:
4273     return legalizeAtomicIncDec(MI, B, true);
4274   case Intrinsic::amdgcn_atomic_dec:
4275     return legalizeAtomicIncDec(MI, B, false);
4276   case Intrinsic::trap:
4277     return legalizeTrapIntrinsic(MI, MRI, B);
4278   case Intrinsic::debugtrap:
4279     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4280   default: {
4281     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4282             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4283       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4284     return true;
4285   }
4286   }
4287 
4288   return true;
4289 }
4290