1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of elements up to the next power of two.
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the scalar size in bits up to the next power of two.
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
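// Return true if the type at TypeIdx is no wider than MaxSize bits and its
// scalar element size is a multiple of 32 bits.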
53 static LegalityPredicate isMultiple32(unsigned TypeIdx,
54                                       unsigned MaxSize = 1024) {
55   return [=](const LegalityQuery &Query) {
56     const LLT Ty = Query.Types[TypeIdx];
57     const LLT EltTy = Ty.getScalarType();
58     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
59   };
60 }
61 
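// Return true for vectors with an odd number of sub-32-bit elements whose
// total size is not a multiple of 32 bits (e.g. v3s16).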
62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
63   return [=](const LegalityQuery &Query) {
64     const LLT Ty = Query.Types[TypeIdx];
65     return Ty.isVector() &&
66            Ty.getNumElements() % 2 != 0 &&
67            Ty.getElementType().getSizeInBits() < 32 &&
68            Ty.getSizeInBits() % 32 != 0;
69   };
70 }
71 
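// Return true for vectors of 16-bit elements with more than two elements.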
72 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     const LLT EltTy = Ty.getScalarType();
76     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
77   };
78 }
79 
80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getElementType();
84     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
85   };
86 }
87 
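// Reduce the element count so that each resulting piece is roughly 64 bits
// wide.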
88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     unsigned Size = Ty.getSizeInBits();
93     unsigned Pieces = (Size + 63) / 64;
94     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
95     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
96   };
97 }
98 
// Increase the number of vector elements so the total size is the next
// multiple of 32 bits.
101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104 
105     const LLT EltTy = Ty.getElementType();
106     const int Size = Ty.getSizeInBits();
107     const int EltSize = EltTy.getSizeInBits();
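    // NextMul32 is the number of 32-bit chunks needed to hold Size bits; the
    // padded size is 32 * NextMul32 bits.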
108     const int NextMul32 = (Size + 31) / 32;
109 
110     assert(EltSize < 32);
111 
112     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
113     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
114   };
115 }
116 
117 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
118   return [=](const LegalityQuery &Query) {
119     const LLT QueryTy = Query.Types[TypeIdx];
120     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
121   };
122 }
123 
124 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
125   return [=](const LegalityQuery &Query) {
126     const LLT QueryTy = Query.Types[TypeIdx];
127     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
128   };
129 }
130 
131 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
132   return [=](const LegalityQuery &Query) {
133     const LLT QueryTy = Query.Types[TypeIdx];
134     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
135   };
136 }
137 
// Vectors of 32, 64, 128 or 256-bit elements, vectors of an even number of
// 16-bit elements, and non-vector types that are a multiple of 32 bits up to
// 1024 bits.
140 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
141   return [=](const LegalityQuery &Query) {
142     const LLT Ty = Query.Types[TypeIdx];
143     if (Ty.isVector()) {
144       const int EltSize = Ty.getElementType().getSizeInBits();
145       return EltSize == 32 || EltSize == 64 ||
146             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
147              EltSize == 128 || EltSize == 256;
148     }
149 
150     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
151   };
152 }
153 
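// Return true for vectors whose element type is s16 or at least 32 bits wide.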
154 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT QueryTy = Query.Types[TypeIdx];
157     if (!QueryTy.isVector())
158       return false;
159     const LLT EltTy = QueryTy.getElementType();
160     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
161   };
162 }
163 
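// Return true for truncating stores of a scalar wider than 32 bits.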
164 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
165   return [=](const LegalityQuery &Query) {
166     const LLT Ty = Query.Types[TypeIdx];
167     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
168            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
169   };
170 }
171 
172 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
173                                          const GCNTargetMachine &TM)
  : ST(ST_) {
175   using namespace TargetOpcode;
176 
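  // Build the LLT pointer type for an address space, using the pointer width
  // the target machine reports for it.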
177   auto GetAddrSpacePtr = [&TM](unsigned AS) {
178     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
179   };
180 
181   const LLT S1 = LLT::scalar(1);
182   const LLT S16 = LLT::scalar(16);
183   const LLT S32 = LLT::scalar(32);
184   const LLT S64 = LLT::scalar(64);
185   const LLT S128 = LLT::scalar(128);
186   const LLT S256 = LLT::scalar(256);
187   const LLT S512 = LLT::scalar(512);
188   const LLT S1024 = LLT::scalar(1024);
189 
190   const LLT V2S16 = LLT::vector(2, 16);
191   const LLT V4S16 = LLT::vector(4, 16);
192 
193   const LLT V2S32 = LLT::vector(2, 32);
194   const LLT V3S32 = LLT::vector(3, 32);
195   const LLT V4S32 = LLT::vector(4, 32);
196   const LLT V5S32 = LLT::vector(5, 32);
197   const LLT V6S32 = LLT::vector(6, 32);
198   const LLT V7S32 = LLT::vector(7, 32);
199   const LLT V8S32 = LLT::vector(8, 32);
200   const LLT V9S32 = LLT::vector(9, 32);
201   const LLT V10S32 = LLT::vector(10, 32);
202   const LLT V11S32 = LLT::vector(11, 32);
203   const LLT V12S32 = LLT::vector(12, 32);
204   const LLT V13S32 = LLT::vector(13, 32);
205   const LLT V14S32 = LLT::vector(14, 32);
206   const LLT V15S32 = LLT::vector(15, 32);
207   const LLT V16S32 = LLT::vector(16, 32);
208   const LLT V32S32 = LLT::vector(32, 32);
209 
210   const LLT V2S64 = LLT::vector(2, 64);
211   const LLT V3S64 = LLT::vector(3, 64);
212   const LLT V4S64 = LLT::vector(4, 64);
213   const LLT V5S64 = LLT::vector(5, 64);
214   const LLT V6S64 = LLT::vector(6, 64);
215   const LLT V7S64 = LLT::vector(7, 64);
216   const LLT V8S64 = LLT::vector(8, 64);
217   const LLT V16S64 = LLT::vector(16, 64);
218 
219   std::initializer_list<LLT> AllS32Vectors =
220     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
221      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
222   std::initializer_list<LLT> AllS64Vectors =
223     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
224 
225   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
226   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
227   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
228   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
229   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
230   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
231   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
232 
233   const LLT CodePtr = FlatPtr;
234 
235   const std::initializer_list<LLT> AddrSpaces64 = {
236     GlobalPtr, ConstantPtr, FlatPtr
237   };
238 
239   const std::initializer_list<LLT> AddrSpaces32 = {
240     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
241   };
242 
243   const std::initializer_list<LLT> FPTypesBase = {
244     S32, S64
245   };
246 
247   const std::initializer_list<LLT> FPTypes16 = {
248     S32, S64, S16
249   };
250 
251   const std::initializer_list<LLT> FPTypesPK16 = {
252     S32, S64, S16, V2S16
253   };
254 
255   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
256 
257   setAction({G_BRCOND, S1}, Legal); // VCC branches
258   setAction({G_BRCOND, S32}, Legal); // SCC branches
259 
260   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
261   // elements for v3s16
262   getActionDefinitionsBuilder(G_PHI)
263     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
264     .legalFor(AllS32Vectors)
265     .legalFor(AllS64Vectors)
266     .legalFor(AddrSpaces64)
267     .legalFor(AddrSpaces32)
268     .clampScalar(0, S32, S256)
269     .widenScalarToNextPow2(0, 32)
270     .clampMaxNumElements(0, S32, 16)
271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
272     .legalIf(isPointer(0));
273 
274   if (ST.hasVOP3PInsts()) {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32, S16, V2S16})
277       .clampScalar(0, S16, S32)
278       .clampMaxNumElements(0, S16, 2)
279       .scalarize(0)
280       .widenScalarToNextPow2(0, 32);
281   } else if (ST.has16BitInsts()) {
282     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
283       .legalFor({S32, S16})
284       .clampScalar(0, S16, S32)
285       .scalarize(0)
286       .widenScalarToNextPow2(0, 32);
287   } else {
288     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
289       .legalFor({S32})
290       .clampScalar(0, S32, S32)
291       .scalarize(0);
292   }
293 
294   // FIXME: Not really legal. Placeholder for custom lowering.
295   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
296     .customFor({S32, S64})
297     .clampScalar(0, S32, S64)
298     .widenScalarToNextPow2(0, 32)
299     .scalarize(0);
300 
301   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
302     .legalFor({S32})
303     .clampScalar(0, S32, S32)
304     .scalarize(0);
305 
306   // Report legal for any types we can handle anywhere. For the cases only legal
307   // on the SALU, RegBankSelect will be able to re-legalize.
308   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
309     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
310     .clampScalar(0, S32, S64)
311     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
312     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
313     .widenScalarToNextPow2(0)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
317                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
318     .legalFor({{S32, S1}, {S32, S32}})
319     .minScalar(0, S32)
320     // TODO: .scalarize(0)
321     .lower();
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
341       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344       .clampScalarOrElt(0, S32, S1024)
345       .legalIf(isMultiple32(0))
346       .widenScalarToNextPow2(0, 32)
347       .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350 
351   // If the amount is divergent, we have to do a wave reduction to get the
352   // maximum value, so this is expanded during RegBankSelect.
353   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
354     .legalFor({{PrivatePtr, S32}});
355 
356   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
357     .unsupportedFor({PrivatePtr})
358     .custom();
359   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
360 
361   auto &FPOpActions = getActionDefinitionsBuilder(
362     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
363     .legalFor({S32, S64});
364   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
365     .customFor({S32, S64});
366   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
367     .customFor({S32, S64});
368 
369   if (ST.has16BitInsts()) {
370     if (ST.hasVOP3PInsts())
371       FPOpActions.legalFor({S16, V2S16});
372     else
373       FPOpActions.legalFor({S16});
374 
375     TrigActions.customFor({S16});
376     FDIVActions.customFor({S16});
377   }
378 
379   auto &MinNumMaxNum = getActionDefinitionsBuilder({
380       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
381 
382   if (ST.hasVOP3PInsts()) {
383     MinNumMaxNum.customFor(FPTypesPK16)
384       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
385       .clampMaxNumElements(0, S16, 2)
386       .clampScalar(0, S16, S64)
387       .scalarize(0);
388   } else if (ST.has16BitInsts()) {
389     MinNumMaxNum.customFor(FPTypes16)
390       .clampScalar(0, S16, S64)
391       .scalarize(0);
392   } else {
393     MinNumMaxNum.customFor(FPTypesBase)
394       .clampScalar(0, S32, S64)
395       .scalarize(0);
396   }
397 
398   if (ST.hasVOP3PInsts())
399     FPOpActions.clampMaxNumElements(0, S16, 2);
400 
401   FPOpActions
402     .scalarize(0)
403     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
404 
405   TrigActions
406     .scalarize(0)
407     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
408 
409   FDIVActions
410     .scalarize(0)
411     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
412 
413   getActionDefinitionsBuilder({G_FNEG, G_FABS})
414     .legalFor(FPTypesPK16)
415     .clampMaxNumElements(0, S16, 2)
416     .scalarize(0)
417     .clampScalar(0, S16, S64);
418 
419   if (ST.has16BitInsts()) {
420     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
421       .legalFor({S32, S64, S16})
422       .scalarize(0)
423       .clampScalar(0, S16, S64);
424   } else {
425     getActionDefinitionsBuilder(G_FSQRT)
426       .legalFor({S32, S64})
427       .scalarize(0)
428       .clampScalar(0, S32, S64);
429 
430     if (ST.hasFractBug()) {
431       getActionDefinitionsBuilder(G_FFLOOR)
432         .customFor({S64})
433         .legalFor({S32, S64})
434         .scalarize(0)
435         .clampScalar(0, S32, S64);
436     } else {
437       getActionDefinitionsBuilder(G_FFLOOR)
438         .legalFor({S32, S64})
439         .scalarize(0)
440         .clampScalar(0, S32, S64);
441     }
442   }
443 
444   getActionDefinitionsBuilder(G_FPTRUNC)
445     .legalFor({{S32, S64}, {S16, S32}})
446     .scalarize(0)
447     .lower();
448 
449   getActionDefinitionsBuilder(G_FPEXT)
450     .legalFor({{S64, S32}, {S32, S16}})
451     .lowerFor({{S64, S16}}) // FIXME: Implement
452     .scalarize(0);
453 
454   getActionDefinitionsBuilder(G_FSUB)
455       // Use actual fsub instruction
456       .legalFor({S32})
457       // Must use fadd + fneg
458       .lowerFor({S64, S16, V2S16})
459       .scalarize(0)
460       .clampScalar(0, S32, S64);
461 
462   // Whether this is legal depends on the floating point mode for the function.
463   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
464   if (ST.hasMadF16())
465     FMad.customFor({S32, S16});
466   else
467     FMad.customFor({S32});
468   FMad.scalarize(0)
469       .lower();
470 
471   // TODO: Do we need to clamp maximum bitwidth?
472   getActionDefinitionsBuilder(G_TRUNC)
473     .legalIf(isScalar(0))
474     .legalFor({{V2S16, V2S32}})
475     .clampMaxNumElements(0, S16, 2)
476     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
477     // situations (like an invalid implicit use), we don't want to infinite loop
478     // in the legalizer.
479     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
480     .alwaysLegal();
481 
482   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
483     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
484                {S32, S1}, {S64, S1}, {S16, S1}})
485     .scalarize(0)
486     .clampScalar(0, S32, S64)
487     .widenScalarToNextPow2(1, 32);
488 
489   // TODO: Split s1->s64 during regbankselect for VALU.
490   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
491     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
492     .lowerFor({{S32, S64}})
493     .lowerIf(typeIs(1, S1))
494     .customFor({{S64, S64}});
495   if (ST.has16BitInsts())
496     IToFP.legalFor({{S16, S16}});
497   IToFP.clampScalar(1, S32, S64)
498        .scalarize(0)
499        .widenScalarToNextPow2(1);
500 
501   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
502     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
503     .customFor({{S64, S64}});
504   if (ST.has16BitInsts())
505     FPToI.legalFor({{S16, S16}});
506   else
507     FPToI.minScalar(1, S32);
508 
509   FPToI.minScalar(0, S32)
510        .scalarize(0)
511        .lower();
512 
513   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
514     .scalarize(0)
515     .lower();
516 
517   if (ST.has16BitInsts()) {
518     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
519       .legalFor({S16, S32, S64})
520       .clampScalar(0, S16, S64)
521       .scalarize(0);
522   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
523     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
524       .legalFor({S32, S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   } else {
528     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
529       .legalFor({S32})
530       .customFor({S64})
531       .clampScalar(0, S32, S64)
532       .scalarize(0);
533   }
534 
535   // FIXME: Clamp offset operand.
536   getActionDefinitionsBuilder(G_PTR_ADD)
537     .legalIf(isPointer(0))
538     .scalarize(0);
539 
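  // The mask operand of G_PTRMASK is legal as s32 or s64; widen small masks to
  // s32 and cap the mask width at the width of the pointer operand.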
540   getActionDefinitionsBuilder(G_PTRMASK)
541     .legalIf(typeInSet(1, {S64, S32}))
542     .minScalar(1, S32)
543     .maxScalarIf(sizeIs(0, 32), 1, S32)
544     .maxScalarIf(sizeIs(0, 64), 1, S64)
545     .scalarize(0);
546 
547   auto &CmpBuilder =
548     getActionDefinitionsBuilder(G_ICMP)
549     // The compare output type differs based on the register bank of the output,
550     // so make both s1 and s32 legal.
551     //
552     // Scalar compares producing output in scc will be promoted to s32, as that
553     // is the allocatable register type that will be needed for the copy from
554     // scc. This will be promoted during RegBankSelect, and we assume something
555     // before that won't try to use s32 result types.
556     //
557     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
558     // bank.
559     .legalForCartesianProduct(
560       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
561     .legalForCartesianProduct(
562       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
563   if (ST.has16BitInsts()) {
564     CmpBuilder.legalFor({{S1, S16}});
565   }
566 
567   CmpBuilder
568     .widenScalarToNextPow2(1)
569     .clampScalar(1, S32, S64)
570     .scalarize(0)
571     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
572 
573   getActionDefinitionsBuilder(G_FCMP)
574     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
575     .widenScalarToNextPow2(1)
576     .clampScalar(1, S32, S64)
577     .scalarize(0);
578 
579   // FIXME: fpow has a selection pattern that should move to custom lowering.
580   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
581   if (ST.has16BitInsts())
582     Exp2Ops.legalFor({S32, S16});
583   else
584     Exp2Ops.legalFor({S32});
585   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
586   Exp2Ops.scalarize(0);
587 
588   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
589   if (ST.has16BitInsts())
590     ExpOps.customFor({{S32}, {S16}});
591   else
592     ExpOps.customFor({S32});
593   ExpOps.clampScalar(0, MinScalarFPTy, S32)
594         .scalarize(0);
595 
596   // The 64-bit versions produce 32-bit results, but only on the SALU.
597   getActionDefinitionsBuilder(G_CTPOP)
598     .legalFor({{S32, S32}, {S32, S64}})
599     .clampScalar(0, S32, S32)
600     .clampScalar(1, S32, S64)
601     .scalarize(0)
602     .widenScalarToNextPow2(0, 32)
603     .widenScalarToNextPow2(1, 32);
604 
605   // The hardware instructions return a different result on 0 than the generic
606   // instructions expect. The hardware produces -1, but these produce the
607   // bitwidth.
608   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
609     .scalarize(0)
610     .clampScalar(0, S32, S32)
611     .clampScalar(1, S32, S64)
612     .widenScalarToNextPow2(0, 32)
613     .widenScalarToNextPow2(1, 32)
614     .lower();
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   getActionDefinitionsBuilder(G_BITREVERSE)
626     .legalFor({S32})
627     .clampScalar(0, S32, S32)
628     .scalarize(0);
629 
630   if (ST.has16BitInsts()) {
631     getActionDefinitionsBuilder(G_BSWAP)
632       .legalFor({S16, S32, V2S16})
633       .clampMaxNumElements(0, S16, 2)
634       // FIXME: Fixing non-power-of-2 before clamp is workaround for
635       // narrowScalar limitation.
636       .widenScalarToNextPow2(0)
637       .clampScalar(0, S16, S32)
638       .scalarize(0);
639 
640     if (ST.hasVOP3PInsts()) {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16, V2S16})
643         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644         .clampMaxNumElements(0, S16, 2)
645         .minScalar(0, S16)
646         .widenScalarToNextPow2(0)
647         .scalarize(0)
648         .lower();
649     } else {
650       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
651         .legalFor({S32, S16})
652         .widenScalarToNextPow2(0)
653         .minScalar(0, S16)
654         .scalarize(0)
655         .lower();
656     }
657   } else {
658     // TODO: Should have same legality without v_perm_b32
659     getActionDefinitionsBuilder(G_BSWAP)
660       .legalFor({S32})
661       .lowerIf(scalarNarrowerThan(0, 32))
662       // FIXME: Fixing non-power-of-2 before clamp is workaround for
663       // narrowScalar limitation.
664       .widenScalarToNextPow2(0)
665       .maxScalar(0, S32)
666       .scalarize(0)
667       .lower();
668 
669     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670       .legalFor({S32})
671       .minScalar(0, S32)
672       .widenScalarToNextPow2(0)
673       .scalarize(0)
674       .lower();
675   }
676 
677   getActionDefinitionsBuilder(G_INTTOPTR)
678     // List the common cases
679     .legalForCartesianProduct(AddrSpaces64, {S64})
680     .legalForCartesianProduct(AddrSpaces32, {S32})
681     .scalarize(0)
682     // Accept any address space as long as the size matches
683     .legalIf(sameSize(0, 1))
684     .widenScalarIf(smallerThan(1, 0),
685       [](const LegalityQuery &Query) {
686         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
687       })
688     .narrowScalarIf(largerThan(1, 0),
689       [](const LegalityQuery &Query) {
690         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
691       });
692 
693   getActionDefinitionsBuilder(G_PTRTOINT)
694     // List the common cases
695     .legalForCartesianProduct(AddrSpaces64, {S64})
696     .legalForCartesianProduct(AddrSpaces32, {S32})
697     .scalarize(0)
698     // Accept any address space as long as the size matches
699     .legalIf(sameSize(0, 1))
700     .widenScalarIf(smallerThan(0, 1),
701       [](const LegalityQuery &Query) {
702         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
703       })
704     .narrowScalarIf(
705       largerThan(0, 1),
706       [](const LegalityQuery &Query) {
707         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
708       });
709 
710   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
711     .scalarize(0)
712     .custom();
713 
714   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
715   // handle some operations by just promoting the register during
716   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
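  // The widest memory access (in bits) the legalizer will keep intact for a
  // given address space.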
717   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
718     switch (AS) {
719     // FIXME: Private element size.
720     case AMDGPUAS::PRIVATE_ADDRESS:
721       return 32;
722     // FIXME: Check subtarget
723     case AMDGPUAS::LOCAL_ADDRESS:
724       return ST.useDS128() ? 128 : 64;
725 
726     // Treat constant and global as identical. SMRD loads are sometimes usable
727     // for global loads (ideally constant address space should be eliminated)
728     // depending on the context. Legality cannot be context dependent, but
729     // RegBankSelect can split the load as necessary depending on the pointer
730     // register bank/uniformity and if the memory is invariant or not written in
731     // a kernel.
732     case AMDGPUAS::CONSTANT_ADDRESS:
733     case AMDGPUAS::GLOBAL_ADDRESS:
734       return IsLoad ? 512 : 128;
735     default:
736       return 128;
737     }
738   };
739 
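  // Return true if a load/store of this type must be broken into smaller
  // pieces: vector extloads, accesses wider than the address space allows,
  // unsupported register counts, or misalignment the target cannot handle.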
740   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
741                                     bool IsLoad) -> bool {
742     const LLT DstTy = Query.Types[0];
743 
744     // Split vector extloads.
745     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
746     unsigned Align = Query.MMODescrs[0].AlignInBits;
747 
748     if (MemSize < DstTy.getSizeInBits())
749       MemSize = std::max(MemSize, Align);
750 
751     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
752       return true;
753 
754     const LLT PtrTy = Query.Types[1];
755     unsigned AS = PtrTy.getAddressSpace();
756     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
757       return true;
758 
759     // Catch weird sized loads that don't evenly divide into the access sizes
760     // TODO: May be able to widen depending on alignment etc.
761     unsigned NumRegs = (MemSize + 31) / 32;
762     if (NumRegs == 3) {
763       if (!ST.hasDwordx3LoadStores())
764         return true;
765     } else {
766       // If the alignment allows, these should have been widened.
767       if (!isPowerOf2_32(NumRegs))
768         return true;
769     }
770 
771     if (Align < MemSize) {
772       const SITargetLowering *TLI = ST.getTargetLowering();
773       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
774     }
775 
776     return false;
777   };
778 
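  // Return true for a non-power-of-2 sized load that should be widened to the
  // next power of 2 rather than split, which is only safe when the known
  // alignment covers the widened access and it still fits the address space
  // limit.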
779   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
780     unsigned Size = Query.Types[0].getSizeInBits();
781     if (isPowerOf2_32(Size))
782       return false;
783 
784     if (Size == 96 && ST.hasDwordx3LoadStores())
785       return false;
786 
787     unsigned AddrSpace = Query.Types[1].getAddressSpace();
788     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
789       return false;
790 
791     unsigned Align = Query.MMODescrs[0].AlignInBits;
792     unsigned RoundedSize = NextPowerOf2(Size);
793     return (Align >= RoundedSize);
794   };
795 
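  // Minimum alignment (in bits) required for the global/flat accesses below; 0
  // when the subtarget supports unaligned buffer access, i.e. no restriction.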
796   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
797   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
798   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
799 
800   // TODO: Refine based on subtargets which support unaligned access or 128-bit
801   // LDS
802   // TODO: Unsupported flat for SI.
803 
804   for (unsigned Op : {G_LOAD, G_STORE}) {
805     const bool IsStore = Op == G_STORE;
806 
807     auto &Actions = getActionDefinitionsBuilder(Op);
808     // Whitelist the common cases.
809     // TODO: Loads to s16 on gfx9
810     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
811                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
812                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
813                                       {S128, GlobalPtr, 128, GlobalAlign32},
814                                       {S64, GlobalPtr, 64, GlobalAlign32},
815                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
816                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
817                                       {S32, GlobalPtr, 8, GlobalAlign8},
818                                       {S32, GlobalPtr, 16, GlobalAlign16},
819 
820                                       {S32, LocalPtr, 32, 32},
821                                       {S64, LocalPtr, 64, 32},
822                                       {V2S32, LocalPtr, 64, 32},
823                                       {S32, LocalPtr, 8, 8},
824                                       {S32, LocalPtr, 16, 16},
825                                       {V2S16, LocalPtr, 32, 32},
826 
827                                       {S32, PrivatePtr, 32, 32},
828                                       {S32, PrivatePtr, 8, 8},
829                                       {S32, PrivatePtr, 16, 16},
830                                       {V2S16, PrivatePtr, 32, 32},
831 
832                                       {S32, FlatPtr, 32, GlobalAlign32},
833                                       {S32, FlatPtr, 16, GlobalAlign16},
834                                       {S32, FlatPtr, 8, GlobalAlign8},
835                                       {V2S16, FlatPtr, 32, GlobalAlign32},
836 
837                                       {S32, ConstantPtr, 32, GlobalAlign32},
838                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
839                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
840                                       {S64, ConstantPtr, 64, GlobalAlign32},
841                                       {S128, ConstantPtr, 128, GlobalAlign32},
842                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
843     Actions
844         .customIf(typeIs(1, Constant32Ptr))
845         // Widen suitably aligned loads by loading extra elements.
846         .moreElementsIf([=](const LegalityQuery &Query) {
847             const LLT Ty = Query.Types[0];
848             return Op == G_LOAD && Ty.isVector() &&
849                    shouldWidenLoadResult(Query);
850           }, moreElementsToNextPow2(0))
851         .widenScalarIf([=](const LegalityQuery &Query) {
852             const LLT Ty = Query.Types[0];
853             return Op == G_LOAD && !Ty.isVector() &&
854                    shouldWidenLoadResult(Query);
855           }, widenScalarOrEltToNextPow2(0))
856         .narrowScalarIf(
857             [=](const LegalityQuery &Query) -> bool {
858               return !Query.Types[0].isVector() &&
859                      needToSplitMemOp(Query, Op == G_LOAD);
860             },
861             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
862               const LLT DstTy = Query.Types[0];
863               const LLT PtrTy = Query.Types[1];
864 
865               const unsigned DstSize = DstTy.getSizeInBits();
866               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
867 
868               // Split extloads.
869               if (DstSize > MemSize)
870                 return std::make_pair(0, LLT::scalar(MemSize));
871 
872               if (!isPowerOf2_32(DstSize)) {
873                 // We're probably decomposing an odd sized store. Try to split
874                 // to the widest type. TODO: Account for alignment. As-is it
875                 // should be OK, since the new parts will be further legalized.
876                 unsigned FloorSize = PowerOf2Floor(DstSize);
877                 return std::make_pair(0, LLT::scalar(FloorSize));
878               }
879 
880               if (DstSize > 32 && (DstSize % 32 != 0)) {
881                 // FIXME: Need a way to specify non-extload of larger size if
882                 // suitably aligned.
883                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
884               }
885 
886               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
887                                                      Op == G_LOAD);
888               if (MemSize > MaxSize)
889                 return std::make_pair(0, LLT::scalar(MaxSize));
890 
891               unsigned Align = Query.MMODescrs[0].AlignInBits;
892               return std::make_pair(0, LLT::scalar(Align));
893             })
894         .fewerElementsIf(
895             [=](const LegalityQuery &Query) -> bool {
896               return Query.Types[0].isVector() &&
897                      needToSplitMemOp(Query, Op == G_LOAD);
898             },
899             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
900               const LLT DstTy = Query.Types[0];
901               const LLT PtrTy = Query.Types[1];
902 
903               LLT EltTy = DstTy.getElementType();
904               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905                                                      Op == G_LOAD);
906 
907               // FIXME: Handle widened to power of 2 results better. This ends
908               // up scalarizing.
909               // FIXME: 3 element stores scalarized on SI
910 
911               // Split if it's too large for the address space.
912               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
913                 unsigned NumElts = DstTy.getNumElements();
914                 unsigned EltSize = EltTy.getSizeInBits();
915 
916                 if (MaxSize % EltSize == 0) {
917                   return std::make_pair(
918                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
919                 }
920 
921                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
922 
923                 // FIXME: Refine when odd breakdowns handled
924                 // The scalars will need to be re-legalized.
925                 if (NumPieces == 1 || NumPieces >= NumElts ||
926                     NumElts % NumPieces != 0)
927                   return std::make_pair(0, EltTy);
928 
929                 return std::make_pair(0,
930                                       LLT::vector(NumElts / NumPieces, EltTy));
931               }
932 
933               // FIXME: We could probably handle weird extending loads better.
934               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
935               if (DstTy.getSizeInBits() > MemSize)
936                 return std::make_pair(0, EltTy);
937 
938               unsigned EltSize = EltTy.getSizeInBits();
939               unsigned DstSize = DstTy.getSizeInBits();
940               if (!isPowerOf2_32(DstSize)) {
941                 // We're probably decomposing an odd sized store. Try to split
942                 // to the widest type. TODO: Account for alignment. As-is it
943                 // should be OK, since the new parts will be further legalized.
944                 unsigned FloorSize = PowerOf2Floor(DstSize);
945                 return std::make_pair(
946                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
947               }
948 
949               // Need to split because of alignment.
950               unsigned Align = Query.MMODescrs[0].AlignInBits;
951               if (EltSize > Align &&
952                   (EltSize / Align < DstTy.getNumElements())) {
953                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
954               }
955 
956               // May need relegalization for the scalars.
957               return std::make_pair(0, EltTy);
958             })
959         .minScalar(0, S32);
960 
961     if (IsStore)
962       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
963 
964     // TODO: Need a bitcast lower option?
965     Actions
966         .legalIf([=](const LegalityQuery &Query) {
967           const LLT Ty0 = Query.Types[0];
968           unsigned Size = Ty0.getSizeInBits();
969           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
970           unsigned Align = Query.MMODescrs[0].AlignInBits;
971 
972           // FIXME: Widening store from alignment not valid.
973           if (MemSize < Size)
974             MemSize = std::max(MemSize, Align);
975 
976           // No extending vector loads.
977           if (Size > MemSize && Ty0.isVector())
978             return false;
979 
980           switch (MemSize) {
981           case 8:
982           case 16:
983             return Size == 32;
984           case 32:
985           case 64:
986           case 128:
987             return true;
988           case 96:
989             return ST.hasDwordx3LoadStores();
990           case 256:
991           case 512:
992             return true;
993           default:
994             return false;
995           }
996         })
997         .widenScalarToNextPow2(0)
998         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
999   }
1000 
1001   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1002                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1003                                                   {S32, GlobalPtr, 16, 2 * 8},
1004                                                   {S32, LocalPtr, 8, 8},
1005                                                   {S32, LocalPtr, 16, 16},
1006                                                   {S32, PrivatePtr, 8, 8},
1007                                                   {S32, PrivatePtr, 16, 16},
1008                                                   {S32, ConstantPtr, 8, 8},
1009                                                   {S32, ConstantPtr, 16, 2 * 8}});
1010   if (ST.hasFlatAddressSpace()) {
1011     ExtLoads.legalForTypesWithMemDesc(
1012         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1013   }
1014 
1015   ExtLoads.clampScalar(0, S32, S32)
1016           .widenScalarToNextPow2(0)
1017           .unsupportedIfMemSizeNotPow2()
1018           .lower();
1019 
1020   auto &Atomics = getActionDefinitionsBuilder(
1021     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1022      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1023      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1024      G_ATOMICRMW_UMIN})
1025     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1026                {S64, GlobalPtr}, {S64, LocalPtr}});
1027   if (ST.hasFlatAddressSpace()) {
1028     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1029   }
1030 
1031   if (ST.hasLDSFPAtomics()) {
1032     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1033       .legalFor({{S32, LocalPtr}});
1034   }
1035 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1038   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1039     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1040                 {S32, FlatPtr}, {S64, FlatPtr}})
1041     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1042                {S32, RegionPtr}, {S64, RegionPtr}});
1043   // TODO: Pointer types, any 32-bit or 64-bit vector
1044 
1045   // Condition should be s32 for scalar, s1 for vector.
1046   getActionDefinitionsBuilder(G_SELECT)
1047     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1048           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1049           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1050     .clampScalar(0, S16, S64)
1051     .scalarize(1)
1052     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1053     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1054     .clampMaxNumElements(0, S32, 2)
1055     .clampMaxNumElements(0, LocalPtr, 2)
1056     .clampMaxNumElements(0, PrivatePtr, 2)
1057     .scalarize(0)
1058     .widenScalarToNextPow2(0)
1059     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1060 
1061   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1062   // be more flexible with the shift amount type.
1063   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1064     .legalFor({{S32, S32}, {S64, S32}});
1065   if (ST.has16BitInsts()) {
1066     if (ST.hasVOP3PInsts()) {
1067       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1068             .clampMaxNumElements(0, S16, 2);
1069     } else
1070       Shifts.legalFor({{S16, S16}});
1071 
1072     // TODO: Support 16-bit shift amounts for all types
1073     Shifts.widenScalarIf(
1074       [=](const LegalityQuery &Query) {
1075         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1076         // 32-bit amount.
1077         const LLT ValTy = Query.Types[0];
1078         const LLT AmountTy = Query.Types[1];
1079         return ValTy.getSizeInBits() <= 16 &&
1080                AmountTy.getSizeInBits() < 16;
1081       }, changeTo(1, S16));
1082     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1083     Shifts.clampScalar(1, S32, S32);
1084     Shifts.clampScalar(0, S16, S64);
1085     Shifts.widenScalarToNextPow2(0, 16);
1086   } else {
1087     // Make sure we legalize the shift amount type first, as the general
1088     // expansion for the shifted type will produce much worse code if it hasn't
1089     // been truncated already.
1090     Shifts.clampScalar(1, S32, S32);
1091     Shifts.clampScalar(0, S32, S64);
1092     Shifts.widenScalarToNextPow2(0, 32);
1093   }
1094   Shifts.scalarize(0);
1095 
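  // Element insert/extract is custom-legalized when the element is 16 bits or
  // a multiple of 32 bits, the vector fits in 1024 bits, and the index is a
  // 32-bit scalar.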
1096   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1097     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1098     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1099     unsigned IdxTypeIdx = 2;
1100 
1101     getActionDefinitionsBuilder(Op)
1102       .customIf([=](const LegalityQuery &Query) {
1103           const LLT EltTy = Query.Types[EltTypeIdx];
1104           const LLT VecTy = Query.Types[VecTypeIdx];
1105           const LLT IdxTy = Query.Types[IdxTypeIdx];
1106           return (EltTy.getSizeInBits() == 16 ||
1107                   EltTy.getSizeInBits() % 32 == 0) &&
1108                  VecTy.getSizeInBits() % 32 == 0 &&
1109                  VecTy.getSizeInBits() <= 1024 &&
1110                  IdxTy.getSizeInBits() == 32;
1111         })
1112       .clampScalar(EltTypeIdx, S32, S64)
1113       .clampScalar(VecTypeIdx, S32, S64)
1114       .clampScalar(IdxTypeIdx, S32, S32);
1115   }
1116 
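  // An extract whose result type differs from the vector's element type is
  // unsupported.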
1117   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1118     .unsupportedIf([=](const LegalityQuery &Query) {
1119         const LLT &EltTy = Query.Types[1].getElementType();
1120         return Query.Types[0] != EltTy;
1121       });
1122 
1123   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1124     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1125     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1126 
1127     // FIXME: Doesn't handle extract of illegal sizes.
1128     getActionDefinitionsBuilder(Op)
1129       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1130       // FIXME: Multiples of 16 should not be legal.
1131       .legalIf([=](const LegalityQuery &Query) {
1132           const LLT BigTy = Query.Types[BigTyIdx];
1133           const LLT LitTy = Query.Types[LitTyIdx];
1134           return (BigTy.getSizeInBits() % 32 == 0) &&
1135                  (LitTy.getSizeInBits() % 16 == 0);
1136         })
1137       .widenScalarIf(
1138         [=](const LegalityQuery &Query) {
1139           const LLT BigTy = Query.Types[BigTyIdx];
1140           return (BigTy.getScalarSizeInBits() < 16);
1141         },
1142         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1143       .widenScalarIf(
1144         [=](const LegalityQuery &Query) {
1145           const LLT LitTy = Query.Types[LitTyIdx];
1146           return (LitTy.getScalarSizeInBits() < 16);
1147         },
1148         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1149       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1150       .widenScalarToNextPow2(BigTyIdx, 32);
1151 
1152   }
1153 
1154   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1155     .legalForCartesianProduct(AllS32Vectors, {S32})
1156     .legalForCartesianProduct(AllS64Vectors, {S64})
1157     .clampNumElements(0, V16S32, V32S32)
1158     .clampNumElements(0, V2S64, V16S64)
1159     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1160 
1161   if (ST.hasScalarPackInsts()) {
1162     BuildVector
1163       // FIXME: Should probably widen s1 vectors straight to s32
1164       .minScalarOrElt(0, S16)
1165       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1166       .minScalar(1, S32);
1167 
1168     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1169       .legalFor({V2S16, S32})
1170       .lower();
1171     BuildVector.minScalarOrElt(0, S32);
1172   } else {
1173     BuildVector.customFor({V2S16, S16});
1174     BuildVector.minScalarOrElt(0, S32);
1175 
1176     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1177       .customFor({V2S16, S32})
1178       .lower();
1179   }
1180 
1181   BuildVector.legalIf(isRegisterType(0));
1182 
1183   // FIXME: Clamp maximum size
1184   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1185     .legalIf(isRegisterType(0));
1186 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1189   if (ST.hasVOP3PInsts()) {
1190     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1191       .customFor({V2S16, V2S16})
1192       .lower();
1193   } else
1194     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1195 
1196   // Merge/Unmerge
1197   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1198     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1199     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1200 
1201     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1202       const LLT Ty = Query.Types[TypeIdx];
1203       if (Ty.isVector()) {
1204         const LLT &EltTy = Ty.getElementType();
1205         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1206           return true;
1207         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1208           return true;
1209       }
1210       return false;
1211     };
1212 
1213     auto &Builder = getActionDefinitionsBuilder(Op)
1214       .lowerFor({{S16, V2S16}})
1215       .lowerIf([=](const LegalityQuery &Query) {
1216           const LLT BigTy = Query.Types[BigTyIdx];
1217           return BigTy.getSizeInBits() == 32;
1218         })
1219       // Try to widen to s16 first for small types.
1220       // TODO: Only do this on targets with legal s16 shifts
1221       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1222       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1223       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1224       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1225                            elementTypeIs(1, S16)),
1226                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1230       .clampScalar(LitTyIdx, S32, S512)
1231       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1232       // Break up vectors with weird elements into scalars
1233       .fewerElementsIf(
1234         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1235         scalarize(0))
1236       .fewerElementsIf(
1237         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1238         scalarize(1))
1239       .clampScalar(BigTyIdx, S32, S1024);
1240 
1241     if (Op == G_MERGE_VALUES) {
1242       Builder.widenScalarIf(
1243         // TODO: Use 16-bit shifts if legal for 8-bit values?
1244         [=](const LegalityQuery &Query) {
1245           const LLT Ty = Query.Types[LitTyIdx];
1246           return Ty.getSizeInBits() < 32;
1247         },
1248         changeTo(LitTyIdx, S32));
1249     }
1250 
1251     Builder.widenScalarIf(
1252       [=](const LegalityQuery &Query) {
1253         const LLT Ty = Query.Types[BigTyIdx];
1254         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1255           Ty.getSizeInBits() % 16 != 0;
1256       },
1257       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1260         const LLT &Ty = Query.Types[BigTyIdx];
1261         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1262         if (NewSizeInBits >= 256) {
1263           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1264           if (RoundedTo < NewSizeInBits)
1265             NewSizeInBits = RoundedTo;
1266         }
1267         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1268       })
1269       .legalIf([=](const LegalityQuery &Query) {
1270           const LLT &BigTy = Query.Types[BigTyIdx];
1271           const LLT &LitTy = Query.Types[LitTyIdx];
1272 
1273           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1274             return false;
1275           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1276             return false;
1277 
1278           return BigTy.getSizeInBits() % 16 == 0 &&
1279                  LitTy.getSizeInBits() % 16 == 0 &&
1280                  BigTy.getSizeInBits() <= 1024;
1281         })
1282       // Any vectors left are the wrong size. Scalarize them.
1283       .scalarize(0)
1284       .scalarize(1);
1285   }
1286 
1287   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1288   // RegBankSelect.
1289   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1290     .legalFor({{S32}, {S64}});
1291 
1292   if (ST.hasVOP3PInsts()) {
1293     SextInReg.lowerFor({{V2S16}})
1294       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1295       // get more vector shift opportunities, since we'll get those when
1296       // expanded.
1297       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1298   } else if (ST.has16BitInsts()) {
1299     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1300   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1303     SextInReg.lowerFor({{S32}, {S64}});
1304   }
1305 
1306   SextInReg
1307     .scalarize(0)
1308     .clampScalar(0, S32, S64)
1309     .lower();
1310 
1311   getActionDefinitionsBuilder(G_FSHR)
1312     .legalFor({{S32, S32}})
1313     .scalarize(0)
1314     .lower();
1315 
1316   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1317     .legalFor({S64});
1318 
1319   getActionDefinitionsBuilder({
1320       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1321       G_FCOPYSIGN,
1322 
1323       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1324       G_READ_REGISTER,
1325       G_WRITE_REGISTER,
1326 
1327       G_SADDO, G_SSUBO,
1328 
      // TODO: Implement
1330       G_FMINIMUM, G_FMAXIMUM,
1331       G_FSHL
1332     }).lower();
1333 
1334   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1335         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1336         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1337     .unsupported();
1338 
1339   computeTables();
1340   verify(*ST.getInstrInfo());
1341 }
1342 
1343 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1344                                          MachineRegisterInfo &MRI,
1345                                          MachineIRBuilder &B,
1346                                          GISelChangeObserver &Observer) const {
1347   switch (MI.getOpcode()) {
1348   case TargetOpcode::G_ADDRSPACE_CAST:
1349     return legalizeAddrSpaceCast(MI, MRI, B);
1350   case TargetOpcode::G_FRINT:
1351     return legalizeFrint(MI, MRI, B);
1352   case TargetOpcode::G_FCEIL:
1353     return legalizeFceil(MI, MRI, B);
1354   case TargetOpcode::G_INTRINSIC_TRUNC:
1355     return legalizeIntrinsicTrunc(MI, MRI, B);
1356   case TargetOpcode::G_SITOFP:
1357     return legalizeITOFP(MI, MRI, B, true);
1358   case TargetOpcode::G_UITOFP:
1359     return legalizeITOFP(MI, MRI, B, false);
1360   case TargetOpcode::G_FPTOSI:
1361     return legalizeFPTOI(MI, MRI, B, true);
1362   case TargetOpcode::G_FPTOUI:
1363     return legalizeFPTOI(MI, MRI, B, false);
1364   case TargetOpcode::G_FMINNUM:
1365   case TargetOpcode::G_FMAXNUM:
1366   case TargetOpcode::G_FMINNUM_IEEE:
1367   case TargetOpcode::G_FMAXNUM_IEEE:
1368     return legalizeMinNumMaxNum(MI, MRI, B);
1369   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1370     return legalizeExtractVectorElt(MI, MRI, B);
1371   case TargetOpcode::G_INSERT_VECTOR_ELT:
1372     return legalizeInsertVectorElt(MI, MRI, B);
1373   case TargetOpcode::G_SHUFFLE_VECTOR:
1374     return legalizeShuffleVector(MI, MRI, B);
1375   case TargetOpcode::G_FSIN:
1376   case TargetOpcode::G_FCOS:
1377     return legalizeSinCos(MI, MRI, B);
1378   case TargetOpcode::G_GLOBAL_VALUE:
1379     return legalizeGlobalValue(MI, MRI, B);
1380   case TargetOpcode::G_LOAD:
1381     return legalizeLoad(MI, MRI, B, Observer);
1382   case TargetOpcode::G_FMAD:
1383     return legalizeFMad(MI, MRI, B);
1384   case TargetOpcode::G_FDIV:
1385     return legalizeFDIV(MI, MRI, B);
1386   case TargetOpcode::G_UDIV:
1387   case TargetOpcode::G_UREM:
1388     return legalizeUDIV_UREM(MI, MRI, B);
1389   case TargetOpcode::G_SDIV:
1390   case TargetOpcode::G_SREM:
1391     return legalizeSDIV_SREM(MI, MRI, B);
1392   case TargetOpcode::G_ATOMIC_CMPXCHG:
1393     return legalizeAtomicCmpXChg(MI, MRI, B);
1394   case TargetOpcode::G_FLOG:
1395     return legalizeFlog(MI, B, numbers::ln2f);
1396   case TargetOpcode::G_FLOG10:
1397     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1398   case TargetOpcode::G_FEXP:
1399     return legalizeFExp(MI, B);
1400   case TargetOpcode::G_FPOW:
1401     return legalizeFPow(MI, B);
1402   case TargetOpcode::G_FFLOOR:
1403     return legalizeFFloor(MI, MRI, B);
1404   case TargetOpcode::G_BUILD_VECTOR:
1405     return legalizeBuildVector(MI, MRI, B);
1406   default:
1407     return false;
1408   }
1409 
1410   llvm_unreachable("expected switch to return");
1411 }
1412 
1413 Register AMDGPULegalizerInfo::getSegmentAperture(
1414   unsigned AS,
1415   MachineRegisterInfo &MRI,
1416   MachineIRBuilder &B) const {
1417   MachineFunction &MF = B.getMF();
1418   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1419   const LLT S32 = LLT::scalar(32);
1420 
1421   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1422 
1423   if (ST.hasApertureRegs()) {
1424     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1425     // getreg.
1426     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1427         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1428         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1429     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1430         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1431         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
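    // Pack the hwreg descriptor (register id, field offset, field width - 1)
    // into the immediate operand expected by S_GETREG_B32.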
1432     unsigned Encoding =
1433         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1434         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1435         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1436 
1437     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1438 
1439     B.buildInstr(AMDGPU::S_GETREG_B32)
1440       .addDef(GetReg)
1441       .addImm(Encoding);
1442     MRI.setType(GetReg, S32);
1443 
1444     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1445     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1446   }
1447 
1448   Register QueuePtr = MRI.createGenericVirtualRegister(
1449     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1450 
1451   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1452   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1453     return Register();
1454 
1455   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1456   // private_segment_aperture_base_hi.
1457   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1458 
1459   // TODO: can we be smarter about machine pointer info?
1460   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1461   MachineMemOperand *MMO = MF.getMachineMemOperand(
1462       PtrInfo,
1463       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464           MachineMemOperand::MOInvariant,
1465       4, commonAlignment(Align(64), StructOffset));
1466 
1467   Register LoadAddr;
1468 
1469   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1470   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1471 }
1472 
1473 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1474   MachineInstr &MI, MachineRegisterInfo &MRI,
1475   MachineIRBuilder &B) const {
1476   MachineFunction &MF = B.getMF();
1477 
1478   B.setInstr(MI);
1479 
1480   const LLT S32 = LLT::scalar(32);
1481   Register Dst = MI.getOperand(0).getReg();
1482   Register Src = MI.getOperand(1).getReg();
1483 
1484   LLT DstTy = MRI.getType(Dst);
1485   LLT SrcTy = MRI.getType(Src);
1486   unsigned DestAS = DstTy.getAddressSpace();
1487   unsigned SrcAS = SrcTy.getAddressSpace();
1488 
1489   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1490   // vector element.
1491   assert(!DstTy.isVector());
1492 
1493   const AMDGPUTargetMachine &TM
1494     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1495 
1496   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1497   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1498     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1499     return true;
1500   }
1501 
1502   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1503     // Truncate.
1504     B.buildExtract(Dst, Src, 0);
1505     MI.eraseFromParent();
1506     return true;
1507   }
1508 
1509   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1510     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1511     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1512 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1516     auto HighAddr = B.buildConstant(
1517       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1518     B.buildMerge(Dst, {Src, HighAddr});
1519     MI.eraseFromParent();
1520     return true;
1521   }
1522 
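  // Casting a flat pointer to LDS/private: the segment offset is the low 32
  // bits of the flat pointer, and a null flat pointer must map to the segment
  // null value.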
1523   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1524     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1525            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1526     unsigned NullVal = TM.getNullPointerValue(DestAS);
1527 
1528     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1529     auto FlatNull = B.buildConstant(SrcTy, 0);
1530 
1531     // Extract low 32-bits of the pointer.
1532     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1533 
1534     auto CmpRes =
1535         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1536     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1537 
1538     MI.eraseFromParent();
1539     return true;
1540   }
1541 
1542   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1543     return false;
1544 
1545   if (!ST.hasFlatAddressSpace())
1546     return false;
1547 
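  // Casting LDS/private to flat: combine the 32-bit segment offset with the
  // aperture base in the high half, and map the segment null value to the
  // flat null pointer.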
1548   auto SegmentNull =
1549       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1550   auto FlatNull =
1551       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1552 
1553   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1554   if (!ApertureReg.isValid())
1555     return false;
1556 
1557   auto CmpRes =
1558       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1559 
1560   // Coerce the type of the low half of the result so we can use merge_values.
1561   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1562 
1563   // TODO: Should we allow mismatched types but matching sizes in merges to
1564   // avoid the ptrtoint?
1565   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1566   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1567 
1568   MI.eraseFromParent();
1569   return true;
1570 }
1571 
1572 bool AMDGPULegalizerInfo::legalizeFrint(
1573   MachineInstr &MI, MachineRegisterInfo &MRI,
1574   MachineIRBuilder &B) const {
1575   B.setInstr(MI);
1576 
1577   Register Src = MI.getOperand(1).getReg();
1578   LLT Ty = MRI.getType(Src);
1579   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1580 
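  // Adding a sign-matched 2^52 pushes the fractional bits out of the
  // mantissa, rounding to the nearest integer; subtracting it back recovers
  // the rounded value. Inputs with a magnitude above 0x1.fffffffffffffp+51
  // are already integers and are passed through by the final select.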
1581   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1582   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1583 
1584   auto C1 = B.buildFConstant(Ty, C1Val);
1585   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1586 
1587   // TODO: Should this propagate fast-math-flags?
1588   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1589   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1590 
1591   auto C2 = B.buildFConstant(Ty, C2Val);
1592   auto Fabs = B.buildFAbs(Ty, Src);
1593 
1594   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1597 }
1598 
1599 bool AMDGPULegalizerInfo::legalizeFceil(
1600   MachineInstr &MI, MachineRegisterInfo &MRI,
1601   MachineIRBuilder &B) const {
1602   B.setInstr(MI);
1603 
1604   const LLT S1 = LLT::scalar(1);
1605   const LLT S64 = LLT::scalar(64);
1606 
1607   Register Src = MI.getOperand(1).getReg();
1608   assert(MRI.getType(Src) == S64);
1609 
1610   // result = trunc(src)
1611   // if (src > 0.0 && src != result)
1612   //   result += 1.0
1613 
1614   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1615 
1616   const auto Zero = B.buildFConstant(S64, 0.0);
1617   const auto One = B.buildFConstant(S64, 1.0);
1618   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1619   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1620   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1621   auto Add = B.buildSelect(S64, And, One, Zero);
1622 
1623   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1626 }
1627 
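// Extract the 11-bit biased exponent from the high 32 bits of an f64 value
// and subtract the exponent bias (1023).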
1628 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1629                                               MachineIRBuilder &B) {
1630   const unsigned FractBits = 52;
1631   const unsigned ExpBits = 11;
1632   LLT S32 = LLT::scalar(32);
1633 
1634   auto Const0 = B.buildConstant(S32, FractBits - 32);
1635   auto Const1 = B.buildConstant(S32, ExpBits);
1636 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1640 
1641   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1642 }
1643 
1644 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1645   MachineInstr &MI, MachineRegisterInfo &MRI,
1646   MachineIRBuilder &B) const {
1647   B.setInstr(MI);
1648 
1649   const LLT S1 = LLT::scalar(1);
1650   const LLT S32 = LLT::scalar(32);
1651   const LLT S64 = LLT::scalar(64);
1652 
1653   Register Src = MI.getOperand(1).getReg();
1654   assert(MRI.getType(Src) == S64);
1655 
1656   // TODO: Should this use extract since the low half is unused?
1657   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1658   Register Hi = Unmerge.getReg(1);
1659 
1660   // Extract the upper half, since this is where we will find the sign and
1661   // exponent.
1662   auto Exp = extractF64Exponent(Hi, B);
1663 
1664   const unsigned FractBits = 52;
1665 
1666   // Extract the sign bit.
1667   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1668   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1669 
1670   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1671 
1672   const auto Zero32 = B.buildConstant(S32, 0);
1673 
1674   // Extend back to 64-bits.
1675   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1676 
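  // Shift the fraction mask down by the exponent to find which fractional
  // bits remain, then clear them from the source to truncate toward zero.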
1677   auto Shr = B.buildAShr(S64, FractMask, Exp);
1678   auto Not = B.buildNot(S64, Shr);
1679   auto Tmp0 = B.buildAnd(S64, Src, Not);
1680   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1681 
1682   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1683   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1684 
1685   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1688 }
1689 
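// Expand a 64-bit integer to f64 conversion by converting each 32-bit half
// separately: the high half is scaled by 2^32 with ldexp and added to the
// unsigned conversion of the low half.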
1690 bool AMDGPULegalizerInfo::legalizeITOFP(
1691   MachineInstr &MI, MachineRegisterInfo &MRI,
1692   MachineIRBuilder &B, bool Signed) const {
1693   B.setInstr(MI);
1694 
1695   Register Dst = MI.getOperand(0).getReg();
1696   Register Src = MI.getOperand(1).getReg();
1697 
1698   const LLT S64 = LLT::scalar(64);
1699   const LLT S32 = LLT::scalar(32);
1700 
1701   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1702 
1703   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1704 
1705   auto CvtHi = Signed ?
1706     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1707     B.buildUITOFP(S64, Unmerge.getReg(1));
1708 
1709   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1710 
1711   auto ThirtyTwo = B.buildConstant(S32, 32);
1712   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1713     .addUse(CvtHi.getReg(0))
1714     .addUse(ThirtyTwo.getReg(0));
1715 
1716   // TODO: Should this propagate fast-math-flags?
1717   B.buildFAdd(Dst, LdExp, CvtLo);
1718   MI.eraseFromParent();
1719   return true;
1720 }
1721 
1722 // TODO: Copied from DAG implementation. Verify logic and document how this
1723 // actually works.
1724 bool AMDGPULegalizerInfo::legalizeFPTOI(
1725   MachineInstr &MI, MachineRegisterInfo &MRI,
1726   MachineIRBuilder &B, bool Signed) const {
1727   B.setInstr(MI);
1728 
1729   Register Dst = MI.getOperand(0).getReg();
1730   Register Src = MI.getOperand(1).getReg();
1731 
1732   const LLT S64 = LLT::scalar(64);
1733   const LLT S32 = LLT::scalar(32);
1734 
1735   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1736 
1737   unsigned Flags = MI.getFlags();
1738 
1739   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
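  // K0 = 2^-32 and K1 = -2^32: scaling the truncated value by K0 and flooring
  // yields the high 32 bits, and the fma with K1 subtracts that contribution
  // back out to leave the low 32 bits.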
1740   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1741   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1742 
1743   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1744   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1745   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1746 
1747   auto Hi = Signed ?
1748     B.buildFPTOSI(S32, FloorMul) :
1749     B.buildFPTOUI(S32, FloorMul);
1750   auto Lo = B.buildFPTOUI(S32, Fma);
1751 
1752   B.buildMerge(Dst, { Lo, Hi });
1753   MI.eraseFromParent();
1754 
1755   return true;
1756 }
1757 
1758 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1759   MachineInstr &MI, MachineRegisterInfo &MRI,
1760   MachineIRBuilder &B) const {
1761   MachineFunction &MF = B.getMF();
1762   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1763 
1764   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1765                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1766 
1767   // With ieee_mode disabled, the instructions have the correct behavior
1768   // already for G_FMINNUM/G_FMAXNUM
1769   if (!MFI->getMode().IEEE)
1770     return !IsIEEEOp;
1771 
1772   if (IsIEEEOp)
1773     return true;
1774 
1775   MachineIRBuilder HelperBuilder(MI);
1776   GISelObserverWrapper DummyObserver;
1777   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1778   HelperBuilder.setInstr(MI);
1779   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1780 }
1781 
1782 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1783   MachineInstr &MI, MachineRegisterInfo &MRI,
1784   MachineIRBuilder &B) const {
1785   // TODO: Should move some of this into LegalizerHelper.
1786 
1787   // TODO: Promote dynamic indexing of s16 to s32
1788 
1789   // FIXME: Artifact combiner probably should have replaced the truncated
1790   // constant before this, so we shouldn't need
1791   // getConstantVRegValWithLookThrough.
1792   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1793     MI.getOperand(2).getReg(), MRI);
1794   if (!IdxVal) // Dynamic case will be selected to register indexing.
1795     return true;
1796 
1797   Register Dst = MI.getOperand(0).getReg();
1798   Register Vec = MI.getOperand(1).getReg();
1799 
1800   LLT VecTy = MRI.getType(Vec);
1801   LLT EltTy = VecTy.getElementType();
1802   assert(EltTy == MRI.getType(Dst));
1803 
1804   B.setInstr(MI);
1805 
1806   if (IdxVal->Value < VecTy.getNumElements())
1807     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1808   else
1809     B.buildUndef(Dst);
1810 
1811   MI.eraseFromParent();
1812   return true;
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   // TODO: Should move some of this into LegalizerHelper.
1819 
1820   // TODO: Promote dynamic indexing of s16 to s32
1821 
1822   // FIXME: Artifact combiner probably should have replaced the truncated
1823   // constant before this, so we shouldn't need
1824   // getConstantVRegValWithLookThrough.
1825   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1826     MI.getOperand(3).getReg(), MRI);
1827   if (!IdxVal) // Dynamic case will be selected to register indexing.
1828     return true;
1829 
1830   Register Dst = MI.getOperand(0).getReg();
1831   Register Vec = MI.getOperand(1).getReg();
1832   Register Ins = MI.getOperand(2).getReg();
1833 
1834   LLT VecTy = MRI.getType(Vec);
1835   LLT EltTy = VecTy.getElementType();
1836   assert(EltTy == MRI.getType(Ins));
1837 
1838   B.setInstr(MI);
1839 
1840   if (IdxVal->Value < VecTy.getNumElements())
1841     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1842   else
1843     B.buildUndef(Dst);
1844 
1845   MI.eraseFromParent();
1846   return true;
1847 }
1848 
1849 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1850   MachineInstr &MI, MachineRegisterInfo &MRI,
1851   MachineIRBuilder &B) const {
1852   const LLT V2S16 = LLT::vector(2, 16);
1853 
1854   Register Dst = MI.getOperand(0).getReg();
1855   Register Src0 = MI.getOperand(1).getReg();
1856   LLT DstTy = MRI.getType(Dst);
1857   LLT SrcTy = MRI.getType(Src0);
1858 
1859   if (SrcTy == V2S16 && DstTy == V2S16 &&
1860       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1861     return true;
1862 
1863   MachineIRBuilder HelperBuilder(MI);
1864   GISelObserverWrapper DummyObserver;
1865   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1866   HelperBuilder.setInstr(MI);
1867   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1868 }
1869 
1870 bool AMDGPULegalizerInfo::legalizeSinCos(
1871   MachineInstr &MI, MachineRegisterInfo &MRI,
1872   MachineIRBuilder &B) const {
1873   B.setInstr(MI);
1874 
1875   Register DstReg = MI.getOperand(0).getReg();
1876   Register SrcReg = MI.getOperand(1).getReg();
1877   LLT Ty = MRI.getType(DstReg);
1878   unsigned Flags = MI.getFlags();
1879 
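  // Scale the input by 1/(2*pi) for the hardware sin/cos intrinsics;
  // subtargets with a reduced input range also need the scaled value wrapped
  // into [0, 1) with fract first.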
1880   Register TrigVal;
1881   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1882   if (ST.hasTrigReducedRange()) {
1883     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1884     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1885       .addUse(MulVal.getReg(0))
1886       .setMIFlags(Flags).getReg(0);
1887   } else
1888     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1889 
1890   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1891     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1892   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1893     .addUse(TrigVal)
1894     .setMIFlags(Flags);
1895   MI.eraseFromParent();
1896   return true;
1897 }
1898 
1899 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1900   Register DstReg, LLT PtrTy,
1901   MachineIRBuilder &B, const GlobalValue *GV,
1902   unsigned Offset, unsigned GAFlags) const {
1903   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1904   // to the following code sequence:
1905   //
1906   // For constant address space:
1907   //   s_getpc_b64 s[0:1]
1908   //   s_add_u32 s0, s0, $symbol
1909   //   s_addc_u32 s1, s1, 0
1910   //
1911   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1912   //   a fixup or relocation is emitted to replace $symbol with a literal
1913   //   constant, which is a pc-relative offset from the encoding of the $symbol
1914   //   operand to the global variable.
1915   //
1916   // For global address space:
1917   //   s_getpc_b64 s[0:1]
1918   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1919   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1920   //
1921   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1922   //   fixups or relocations are emitted to replace $symbol@*@lo and
1923   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1924   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1925   //   operand to the global variable.
1926   //
1927   // What we want here is an offset from the value returned by s_getpc
1928   // (which is the address of the s_add_u32 instruction) to the global
1929   // variable, but since the encoding of $symbol starts 4 bytes after the start
1930   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1931   // small. This requires us to add 4 to the global variable offset in order to
1932   // compute the correct address.
1933 
1934   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1935 
1936   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1937     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1938 
1939   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1940     .addDef(PCReg);
1941 
1942   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1943   if (GAFlags == SIInstrInfo::MO_NONE)
1944     MIB.addImm(0);
1945   else
1946     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1947 
1948   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1949 
1950   if (PtrTy.getSizeInBits() == 32)
1951     B.buildExtract(DstReg, PCReg, 0);
1952   return true;
}
1954 
1955 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1956   MachineInstr &MI, MachineRegisterInfo &MRI,
1957   MachineIRBuilder &B) const {
1958   Register DstReg = MI.getOperand(0).getReg();
1959   LLT Ty = MRI.getType(DstReg);
1960   unsigned AS = Ty.getAddressSpace();
1961 
1962   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1963   MachineFunction &MF = B.getMF();
1964   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1965   B.setInstr(MI);
1966 
1967   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1968     if (!MFI->isEntryFunction()) {
1969       const Function &Fn = MF.getFunction();
1970       DiagnosticInfoUnsupported BadLDSDecl(
1971         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1972         DS_Warning);
1973       Fn.getContext().diagnose(BadLDSDecl);
1974 
1975       // We currently don't have a way to correctly allocate LDS objects that
1976       // aren't directly associated with a kernel. We do force inlining of
1977       // functions that use local objects. However, if these dead functions are
1978       // not eliminated, we don't want a compile time error. Just emit a warning
1979       // and a trap, since there should be no callable path here.
1980       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1981       B.buildUndef(DstReg);
1982       MI.eraseFromParent();
1983       return true;
1984     }
1985 
1986     // TODO: We could emit code to handle the initialization somewhere.
1987     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1988       const SITargetLowering *TLI = ST.getTargetLowering();
1989       if (!TLI->shouldUseLDSConstAddress(GV)) {
1990         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1991         return true; // Leave in place;
1992       }
1993 
1994       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1995       MI.eraseFromParent();
1996       return true;
1997     }
1998 
1999     const Function &Fn = MF.getFunction();
2000     DiagnosticInfoUnsupported BadInit(
2001       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2002     Fn.getContext().diagnose(BadInit);
2003     return true;
2004   }
2005 
2006   const SITargetLowering *TLI = ST.getTargetLowering();
2007 
2008   if (TLI->shouldEmitFixup(GV)) {
2009     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2010     MI.eraseFromParent();
2011     return true;
2012   }
2013 
2014   if (TLI->shouldEmitPCReloc(GV)) {
2015     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2016     MI.eraseFromParent();
2017     return true;
2018   }
2019 
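  // Otherwise, load the 64-bit address of the global from the GOT, which is
  // itself addressed pc-relatively.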
2020   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2021   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2022 
2023   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2024       MachinePointerInfo::getGOT(MF),
2025       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2026           MachineMemOperand::MOInvariant,
2027       8 /*Size*/, Align(8));
2028 
2029   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2030 
2031   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2033     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2034     B.buildExtract(DstReg, Load, 0);
2035   } else
2036     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2037 
2038   MI.eraseFromParent();
2039   return true;
2040 }
2041 
2042 bool AMDGPULegalizerInfo::legalizeLoad(
2043   MachineInstr &MI, MachineRegisterInfo &MRI,
2044   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
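  // Widen the pointer operand to the 64-bit constant address space with an
  // addrspacecast and update the load in place.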
2045   B.setInstr(MI);
2046   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2047   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2048   Observer.changingInstr(MI);
2049   MI.getOperand(1).setReg(Cast.getReg(0));
2050   Observer.changedInstr(MI);
2051   return true;
2052 }
2053 
2054 bool AMDGPULegalizerInfo::legalizeFMad(
2055   MachineInstr &MI, MachineRegisterInfo &MRI,
2056   MachineIRBuilder &B) const {
2057   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2058   assert(Ty.isScalar());
2059 
2060   MachineFunction &MF = B.getMF();
2061   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2062 
2063   // TODO: Always legal with future ftz flag.
  // FIXME: Do we need to check just the output type?
2065   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2066     return true;
2067   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2068     return true;
2069 
2070   MachineIRBuilder HelperBuilder(MI);
2071   GISelObserverWrapper DummyObserver;
2072   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2073   HelperBuilder.setInstr(MI);
2074   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2075 }
2076 
2077 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2078   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2079   Register DstReg = MI.getOperand(0).getReg();
2080   Register PtrReg = MI.getOperand(1).getReg();
2081   Register CmpVal = MI.getOperand(2).getReg();
2082   Register NewVal = MI.getOperand(3).getReg();
2083 
2084   assert(SITargetLowering::isFlatGlobalAddrSpace(
2085            MRI.getType(PtrReg).getAddressSpace()) &&
2086          "this should not have been custom lowered");
2087 
2088   LLT ValTy = MRI.getType(CmpVal);
2089   LLT VecTy = LLT::vector(2, ValTy);
2090 
2091   B.setInstr(MI);
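  // The target cmpxchg instruction expects the new value and the compare
  // value packed together into a single vector operand.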
2092   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2093 
2094   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2095     .addDef(DstReg)
2096     .addUse(PtrReg)
2097     .addUse(PackedVal)
2098     .setMemRefs(MI.memoperands());
2099 
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
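// Lower log_b(x) as log2(x) * log_b(2); the caller passes log_b(2) as
// Log2BaseInverted.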
2104 bool AMDGPULegalizerInfo::legalizeFlog(
2105   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2106   Register Dst = MI.getOperand(0).getReg();
2107   Register Src = MI.getOperand(1).getReg();
2108   LLT Ty = B.getMRI()->getType(Dst);
2109   unsigned Flags = MI.getFlags();
2110   B.setInstr(MI);
2111 
2112   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2113   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2114 
2115   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119 
2120 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2121                                        MachineIRBuilder &B) const {
2122   Register Dst = MI.getOperand(0).getReg();
2123   Register Src = MI.getOperand(1).getReg();
2124   unsigned Flags = MI.getFlags();
2125   LLT Ty = B.getMRI()->getType(Dst);
2126   B.setInstr(MI);
2127 
2128   auto K = B.buildFConstant(Ty, numbers::log2e);
2129   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2130   B.buildFExp2(Dst, Mul, Flags);
2131   MI.eraseFromParent();
2132   return true;
2133 }
2134 
2135 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2136                                        MachineIRBuilder &B) const {
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Src0 = MI.getOperand(1).getReg();
2139   Register Src1 = MI.getOperand(2).getReg();
2140   unsigned Flags = MI.getFlags();
2141   LLT Ty = B.getMRI()->getType(Dst);
2142   B.setInstr(MI);
2143   const LLT S16 = LLT::scalar(16);
2144   const LLT S32 = LLT::scalar(32);
2145 
2146   if (Ty == S32) {
2147     auto Log = B.buildFLog2(S32, Src0, Flags);
2148     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2149       .addUse(Log.getReg(0))
2150       .addUse(Src1)
2151       .setMIFlags(Flags);
2152     B.buildFExp2(Dst, Mul, Flags);
2153   } else if (Ty == S16) {
2154     // There's no f16 fmul_legacy, so we need to convert for it.
2155     auto Log = B.buildFLog2(S16, Src0, Flags);
2156     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2157     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2158     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2159       .addUse(Ext0.getReg(0))
2160       .addUse(Ext1.getReg(0))
2161       .setMIFlags(Flags);
2162 
2163     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2164   } else
2165     return false;
2166 
2167   MI.eraseFromParent();
2168   return true;
2169 }
2170 
2171 // Find a source register, ignoring any possible source modifiers.
2172 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2173   Register ModSrc = OrigSrc;
2174   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2175     ModSrc = SrcFNeg->getOperand(1).getReg();
2176     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2177       ModSrc = SrcFAbs->getOperand(1).getReg();
2178   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2179     ModSrc = SrcFAbs->getOperand(1).getReg();
2180   return ModSrc;
2181 }
2182 
2183 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2184                                          MachineRegisterInfo &MRI,
2185                                          MachineIRBuilder &B) const {
2186   B.setInstr(MI);
2187 
2188   const LLT S1 = LLT::scalar(1);
2189   const LLT S64 = LLT::scalar(64);
2190   Register Dst = MI.getOperand(0).getReg();
2191   Register OrigSrc = MI.getOperand(1).getReg();
2192   unsigned Flags = MI.getFlags();
2193   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2194          "this should not have been custom lowered");
2195 
2196   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2197   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2198   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2199   // V_FRACT bug is:
2200   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2201   //
2202   // Convert floor(x) to (x - fract(x))
2203 
2204   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2205     .addUse(OrigSrc)
2206     .setMIFlags(Flags);
2207 
2208   // Give source modifier matching some assistance before obscuring a foldable
2209   // pattern.
2210 
2211   // TODO: We can avoid the neg on the fract? The input sign to fract
2212   // shouldn't matter?
2213   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2214 
2215   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2216 
2217   Register Min = MRI.createGenericVirtualRegister(S64);
2218 
2219   // We don't need to concern ourselves with the snan handling difference, so
2220   // use the one which will directly select.
2221   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2222   if (MFI->getMode().IEEE)
2223     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2224   else
2225     B.buildFMinNum(Min, Fract, Const, Flags);
2226 
2227   Register CorrectedFract = Min;
2228   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2230     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2231   }
2232 
2233   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2234   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2235 
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239 
2240 // Turn an illegal packed v2s16 build vector into bit operations.
2241 // TODO: This should probably be a bitcast action in LegalizerHelper.
2242 bool AMDGPULegalizerInfo::legalizeBuildVector(
2243   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2244   Register Dst = MI.getOperand(0).getReg();
2245   const LLT S32 = LLT::scalar(32);
2246   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2247 
2248   Register Src0 = MI.getOperand(1).getReg();
2249   Register Src1 = MI.getOperand(2).getReg();
2250   assert(MRI.getType(Src0) == LLT::scalar(16));
2251 
2252   B.setInstr(MI);
2253   auto Merge = B.buildMerge(S32, {Src0, Src1});
2254   B.buildBitcast(Dst, Merge);
2255 
2256   MI.eraseFromParent();
2257   return true;
2258 }
2259 
2260 // Return the use branch instruction, otherwise null if the usage is invalid.
2261 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2262                                        MachineRegisterInfo &MRI,
2263                                        MachineInstr *&Br,
2264                                        MachineBasicBlock *&UncondBrTarget) {
2265   Register CondDef = MI.getOperand(0).getReg();
2266   if (!MRI.hasOneNonDBGUse(CondDef))
2267     return nullptr;
2268 
2269   MachineBasicBlock *Parent = MI.getParent();
2270   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2271   if (UseMI.getParent() != Parent ||
2272       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2273     return nullptr;
2274 
2275   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2276   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2277   if (Next == Parent->end()) {
2278     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2279     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2280       return nullptr;
2281     UncondBrTarget = &*NextMBB;
2282   } else {
2283     if (Next->getOpcode() != AMDGPU::G_BR)
2284       return nullptr;
2285     Br = &*Next;
2286     UncondBrTarget = Br->getOperand(0).getMBB();
2287   }
2288 
2289   return &UseMI;
2290 }
2291 
2292 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2293                                                MachineRegisterInfo &MRI,
2294                                                Register LiveIn,
2295                                                Register PhyReg) const {
2296   assert(PhyReg.isPhysical() && "Physical register expected");
2297 
  // Insert the live-in copy, if required, by defining the destination
  // virtual register.
2300   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2301   if (!MRI.getVRegDef(LiveIn)) {
2302     // FIXME: Should have scoped insert pt
2303     MachineBasicBlock &OrigInsBB = B.getMBB();
2304     auto OrigInsPt = B.getInsertPt();
2305 
2306     MachineBasicBlock &EntryMBB = B.getMF().front();
2307     EntryMBB.addLiveIn(PhyReg);
2308     B.setInsertPt(EntryMBB, EntryMBB.begin());
2309     B.buildCopy(LiveIn, PhyReg);
2310 
2311     B.setInsertPt(OrigInsBB, OrigInsPt);
2312   }
2313 
2314   return LiveIn;
2315 }
2316 
2317 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2318                                                 MachineRegisterInfo &MRI,
2319                                                 Register PhyReg, LLT Ty,
2320                                                 bool InsertLiveInCopy) const {
2321   assert(PhyReg.isPhysical() && "Physical register expected");
2322 
  // Get or create the virtual live-in register.
2324   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2325   if (!LiveIn) {
2326     LiveIn = MRI.createGenericVirtualRegister(Ty);
2327     MRI.addLiveIn(PhyReg, LiveIn);
2328   }
2329 
  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), the live-in copy from physical register
  // to virtual register is not required here.
2333   if (!InsertLiveInCopy)
2334     return LiveIn;
2335 
2336   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2337 }
2338 
2339 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2340     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2341   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2342   const ArgDescriptor *Arg;
2343   const TargetRegisterClass *RC;
2344   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2345   if (!Arg) {
2346     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2347     return nullptr;
2348   }
2349   return Arg;
2350 }
2351 
2352 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2353                                          const ArgDescriptor *Arg) const {
2354   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2355     return false; // TODO: Handle these
2356 
2357   Register SrcReg = Arg->getRegister();
2358   assert(SrcReg.isPhysical() && "Physical register expected");
2359   assert(DstReg.isVirtual() && "Virtual register expected");
2360 
2361   MachineRegisterInfo &MRI = *B.getMRI();
2362 
2363   LLT Ty = MRI.getType(DstReg);
2364   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2365 
2366   if (Arg->isMasked()) {
2367     // TODO: Should we try to emit this once in the entry block?
2368     const LLT S32 = LLT::scalar(32);
2369     const unsigned Mask = Arg->getMask();
2370     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2371 
2372     Register AndMaskSrc = LiveIn;
2373 
2374     if (Shift != 0) {
2375       auto ShiftAmt = B.buildConstant(S32, Shift);
2376       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2377     }
2378 
2379     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2380   } else {
2381     B.buildCopy(DstReg, LiveIn);
2382   }
2383 
2384   return true;
2385 }
2386 
2387 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2388     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2389     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2390   B.setInstr(MI);
2391 
2392   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2393   if (!Arg)
2394     return false;
2395 
2396   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2397     return false;
2398 
2399   MI.eraseFromParent();
2400   return true;
2401 }
2402 
2403 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2404                                        MachineRegisterInfo &MRI,
2405                                        MachineIRBuilder &B) const {
2406   B.setInstr(MI);
2407   Register Dst = MI.getOperand(0).getReg();
2408   LLT DstTy = MRI.getType(Dst);
2409   LLT S16 = LLT::scalar(16);
2410   LLT S32 = LLT::scalar(32);
2411   LLT S64 = LLT::scalar(64);
2412 
2413   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2414     return true;
2415 
2416   if (DstTy == S16)
2417     return legalizeFDIV16(MI, MRI, B);
2418   if (DstTy == S32)
2419     return legalizeFDIV32(MI, MRI, B);
2420   if (DstTy == S64)
2421     return legalizeFDIV64(MI, MRI, B);
2422 
2423   return false;
2424 }
2425 
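// Build a 32-bit integer reciprocal estimate: convert the denominator to
// float, take the fast reciprocal, scale by 2^32 and convert back to an
// integer.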
2426 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2427   const LLT S32 = LLT::scalar(32);
2428 
2429   auto Cvt0 = B.buildUITOFP(S32, Src);
2430   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2431   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2432   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2433   return B.buildFPTOUI(S32, Mul).getReg(0);
2434 }
2435 
2436 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2437                                                   Register DstReg,
2438                                                   Register Num,
2439                                                   Register Den,
2440                                                   bool IsRem) const {
2441   const LLT S1 = LLT::scalar(1);
2442   const LLT S32 = LLT::scalar(32);
2443 
2444   // RCP =  URECIP(Den) = 2^32 / Den + e
2445   // e is rounding error.
2446   auto RCP = buildDivRCP(B, Den);
2447 
2448   // RCP_LO = mul(RCP, Den)
2449   auto RCP_LO = B.buildMul(S32, RCP, Den);
2450 
  // RCP_HI = mulhu(RCP, Den)
2452   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2453 
2454   // NEG_RCP_LO = -RCP_LO
2455   auto Zero = B.buildConstant(S32, 0);
2456   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2457 
2458   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2459   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2460   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2461 
2462   // Calculate the rounding error from the URECIP instruction
2463   // E = mulhu(ABS_RCP_LO, RCP)
2464   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2465 
2466   // RCP_A_E = RCP + E
2467   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2468 
2469   // RCP_S_E = RCP - E
2470   auto RCP_S_E = B.buildSub(S32, RCP, E);
2471 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2473   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2474 
  // Quotient = mulhu(Tmp0, Num)
2476   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2477 
2478   // Num_S_Remainder = Quotient * Den
2479   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2480 
2481   // Remainder = Num - Num_S_Remainder
2482   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2483 
2484   // Remainder_GE_Den = Remainder >= Den
2485   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2486 
2487   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2488   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2489                                        Num, Num_S_Remainder);
2490 
2491   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2492   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2493 
2494   // Calculate Division result:
2495 
2496   // Quotient_A_One = Quotient + 1
2497   auto One = B.buildConstant(S32, 1);
2498   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2499 
2500   // Quotient_S_One = Quotient - 1
2501   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2502 
2503   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2504   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2505 
2506   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2507   if (IsRem) {
2508     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2509 
2510     // Calculate Rem result:
2511     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2512 
2513     // Remainder_A_Den = Remainder + Den
2514     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2515 
2516     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2517     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2518 
2519     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2520     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2521   } else {
2522     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2523   }
2524 }
2525 
2526 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2527                                               MachineRegisterInfo &MRI,
2528                                               MachineIRBuilder &B) const {
2529   B.setInstr(MI);
2530   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2531   Register DstReg = MI.getOperand(0).getReg();
2532   Register Num = MI.getOperand(1).getReg();
2533   Register Den = MI.getOperand(2).getReg();
2534   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2535   MI.eraseFromParent();
2536   return true;
2537 }
2538 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2540 //
2541 // Return lo, hi of result
2542 //
2543 // %cvt.lo = G_UITOFP Val.lo
2544 // %cvt.hi = G_UITOFP Val.hi
2545 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2546 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2547 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2548 // %mul2 = G_FMUL %mul1, 2**(-32)
2549 // %trunc = G_INTRINSIC_TRUNC %mul2
2550 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2551 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2552 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2553                                                        Register Val) {
2554   const LLT S32 = LLT::scalar(32);
2555   auto Unmerge = B.buildUnmerge(S32, Val);
2556 
2557   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2558   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2559 
2560   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2561                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2562 
2563   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2564   auto Mul1 =
2565       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2566 
2567   // 2**(-32)
2568   auto Mul2 =
2569       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2570   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2571 
2572   // -(2**32)
2573   auto Mad2 = B.buildFMAD(S32, Trunc,
2574                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2575 
2576   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2577   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2578 
2579   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2580 }
2581 
2582 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2583                                               MachineRegisterInfo &MRI,
2584                                               MachineIRBuilder &B) const {
2585   B.setInstr(MI);
2586 
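  // Compute a fixed-point reciprocal estimate of the denominator, form an
  // initial quotient with a 64-bit high multiply, then apply up to two
  // conditional correction steps selected below.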
2587   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2588   const LLT S32 = LLT::scalar(32);
2589   const LLT S64 = LLT::scalar(64);
2590   const LLT S1 = LLT::scalar(1);
2591   Register Numer = MI.getOperand(1).getReg();
2592   Register Denom = MI.getOperand(2).getReg();
2593   Register RcpLo, RcpHi;
2594 
2595   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2596 
2597   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2598 
2599   auto Zero64 = B.buildConstant(S64, 0);
2600   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2601 
2602   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2603   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2604 
2605   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2606   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2607   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2608 
2609   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2610   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2611   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2612   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2613 
2614   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2615   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2616   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2617   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2618   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2619 
2620   auto Zero32 = B.buildConstant(S32, 0);
2621   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2622   auto Add2_HiC =
2623       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2624   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2625   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2626 
2627   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2628   Register NumerLo = UnmergeNumer.getReg(0);
2629   Register NumerHi = UnmergeNumer.getReg(1);
2630 
2631   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2632   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2633   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2634   Register Mul3_Lo = UnmergeMul3.getReg(0);
2635   Register Mul3_Hi = UnmergeMul3.getReg(1);
2636   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2637   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2638   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2639   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2640 
2641   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2642   Register DenomLo = UnmergeDenom.getReg(0);
2643   Register DenomHi = UnmergeDenom.getReg(1);
2644 
2645   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2646   auto C1 = B.buildSExt(S32, CmpHi);
2647 
2648   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2649   auto C2 = B.buildSExt(S32, CmpLo);
2650 
2651   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2652   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2653 
2654   // TODO: Here and below portions of the code can be enclosed into if/endif.
2655   // Currently control flow is unconditional and we have 4 selects after
2656   // potential endif to substitute PHIs.
2657 
2658   // if C3 != 0 ...
2659   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2660   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2661   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2662   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2663 
2664   auto One64 = B.buildConstant(S64, 1);
2665   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2666 
2667   auto C4 =
2668       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2669   auto C5 =
2670       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2671   auto C6 = B.buildSelect(
2672       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2673 
2674   // if (C6 != 0)
2675   auto Add4 = B.buildAdd(S64, Add3, One64);
2676   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2677 
2678   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2679   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2680   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2681 
2682   // endif C6
2683   // endif C3
2684 
2685   if (IsDiv) {
2686     auto Sel1 = B.buildSelect(
2687         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2688     B.buildSelect(MI.getOperand(0),
2689                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2690   } else {
2691     auto Sel2 = B.buildSelect(
2692         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2693     B.buildSelect(MI.getOperand(0),
2694                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2695   }
2696 
2697   MI.eraseFromParent();
2698   return true;
2699 }
2700 
2701 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2702                                             MachineRegisterInfo &MRI,
2703                                             MachineIRBuilder &B) const {
2704   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2705   if (Ty == LLT::scalar(32))
2706     return legalizeUDIV_UREM32(MI, MRI, B);
2707   if (Ty == LLT::scalar(64))
2708     return legalizeUDIV_UREM64(MI, MRI, B);
2709   return false;
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2713                                               MachineRegisterInfo &MRI,
2714                                               MachineIRBuilder &B) const {
2715   B.setInstr(MI);
2716   const LLT S32 = LLT::scalar(32);
2717 
2718   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2719   Register DstReg = MI.getOperand(0).getReg();
2720   Register LHS = MI.getOperand(1).getReg();
2721   Register RHS = MI.getOperand(2).getReg();
2722 
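  // Take absolute values (add the sign and xor with it), divide unsigned,
  // then restore the sign: the remainder takes the sign of the LHS and the
  // quotient the xor of the two operand signs.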
2723   auto ThirtyOne = B.buildConstant(S32, 31);
2724   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2726 
2727   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2728   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2729 
2730   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2731   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2732 
2733   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2734   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2735 
2736   if (IsRem) {
2737     auto RSign = LHSign; // Remainder sign is the same as LHS
2738     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2739     B.buildSub(DstReg, UDivRem, RSign);
2740   } else {
2741     auto DSign = B.buildXor(S32, LHSign, RHSign);
2742     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2743     B.buildSub(DstReg, UDivRem, DSign);
2744   }
2745 
2746   MI.eraseFromParent();
2747   return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2751                                             MachineRegisterInfo &MRI,
2752                                             MachineIRBuilder &B) const {
2753   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2754     return legalizeSDIV_SREM32(MI, MRI, B);
2755   return false;
2756 }
2757 
2758 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2759                                                  MachineRegisterInfo &MRI,
2760                                                  MachineIRBuilder &B) const {
2761   Register Res = MI.getOperand(0).getReg();
2762   Register LHS = MI.getOperand(1).getReg();
2763   Register RHS = MI.getOperand(2).getReg();
2764 
2765   uint16_t Flags = MI.getFlags();
2766 
2767   LLT ResTy = MRI.getType(Res);
2768   LLT S32 = LLT::scalar(32);
2769   LLT S64 = LLT::scalar(64);
2770 
2771   const MachineFunction &MF = B.getMF();
2772   bool Unsafe =
2773     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2774 
2775   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2776     return false;
2777 
2778   if (!Unsafe && ResTy == S32 &&
2779       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2780     return false;
2781 
2782   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2783     // 1 / x -> RCP(x)
2784     if (CLHS->isExactlyValue(1.0)) {
2785       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2786         .addUse(RHS)
2787         .setMIFlags(Flags);
2788 
2789       MI.eraseFromParent();
2790       return true;
2791     }
2792 
2793     // -1 / x -> RCP( FNEG(x) )
2794     if (CLHS->isExactlyValue(-1.0)) {
2795       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2796       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2797         .addUse(FNeg.getReg(0))
2798         .setMIFlags(Flags);
2799 
2800       MI.eraseFromParent();
2801       return true;
2802     }
2803   }
2804 
2805   // x / y -> x * (1.0 / y)
2806   if (Unsafe) {
2807     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2808       .addUse(RHS)
2809       .setMIFlags(Flags);
2810     B.buildFMul(Res, LHS, RCP, Flags);
2811 
2812     MI.eraseFromParent();
2813     return true;
2814   }
2815 
2816   return false;
2817 }
2818 
2819 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2820                                          MachineRegisterInfo &MRI,
2821                                          MachineIRBuilder &B) const {
2822   B.setInstr(MI);
2823   Register Res = MI.getOperand(0).getReg();
2824   Register LHS = MI.getOperand(1).getReg();
2825   Register RHS = MI.getOperand(2).getReg();
2826 
2827   uint16_t Flags = MI.getFlags();
2828 
2829   LLT S16 = LLT::scalar(16);
2830   LLT S32 = LLT::scalar(32);
2831 
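  // Expand f16 division in f32: extend both operands, multiply the numerator
  // by the f32 reciprocal of the denominator, truncate back to f16 and let
  // div_fixup produce the final result.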
2832   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2833   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2834 
2835   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2836     .addUse(RHSExt.getReg(0))
2837     .setMIFlags(Flags);
2838 
2839   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2840   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2841 
2842   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2843     .addUse(RDst.getReg(0))
2844     .addUse(RHS)
2845     .addUse(LHS)
2846     .setMIFlags(Flags);
2847 
2848   MI.eraseFromParent();
2849   return true;
2850 }
2851 
2852 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2853 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2854 static void toggleSPDenormMode(bool Enable,
2855                                MachineIRBuilder &B,
2856                                const GCNSubtarget &ST,
2857                                AMDGPU::SIModeRegisterDefaults Mode) {
2858   // Set SP denorm mode to this value.
2859   unsigned SPDenormMode =
2860     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2861 
2862   if (ST.hasDenormModeInst()) {
2863     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2864     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2865 
2866     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2867     B.buildInstr(AMDGPU::S_DENORM_MODE)
2868       .addImm(NewDenormModeValue);
2869 
2870   } else {
2871     // Select FP32 bit field in mode register.
2872     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2873                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2874                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2875 
2876     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2877       .addImm(SPDenormMode)
2878       .addImm(SPDenormModeBitField);
2879   }
2880 }
2881 
2882 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2883                                          MachineRegisterInfo &MRI,
2884                                          MachineIRBuilder &B) const {
2885   B.setInstr(MI);
2886   Register Res = MI.getOperand(0).getReg();
2887   Register LHS = MI.getOperand(1).getReg();
2888   Register RHS = MI.getOperand(2).getReg();
2889   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2890   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2891 
2892   uint16_t Flags = MI.getFlags();
2893 
2894   LLT S32 = LLT::scalar(32);
2895   LLT S1 = LLT::scalar(1);
2896 
2897   auto One = B.buildFConstant(S32, 1.0f);
2898 
2899   auto DenominatorScaled =
2900     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2901       .addUse(LHS)
2902       .addUse(RHS)
2903       .addImm(0)
2904       .setMIFlags(Flags);
2905   auto NumeratorScaled =
2906     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2907       .addUse(LHS)
2908       .addUse(RHS)
2909       .addImm(1)
2910       .setMIFlags(Flags);
2911 
2912   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2913     .addUse(DenominatorScaled.getReg(0))
2914     .setMIFlags(Flags);
2915   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2916 
2917   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2918   // aren't modeled as reading it.
2919   if (!Mode.allFP32Denormals())
2920     toggleSPDenormMode(true, B, ST, Mode);
2921 
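  // Refine the reciprocal estimate with a Newton-Raphson step
  // (Fma0 = 1 - den*rcp, Fma1 = rcp + rcp*Fma0), then form the scaled quotient
  // and residual-correction FMAs whose results feed div_fmas.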
2922   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2923   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2924   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2925   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2926   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2927   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2928 
2929   if (!Mode.allFP32Denormals())
2930     toggleSPDenormMode(false, B, ST, Mode);
2931 
2932   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2933     .addUse(Fma4.getReg(0))
2934     .addUse(Fma1.getReg(0))
2935     .addUse(Fma3.getReg(0))
2936     .addUse(NumeratorScaled.getReg(1))
2937     .setMIFlags(Flags);
2938 
2939   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2940     .addUse(Fmas.getReg(0))
2941     .addUse(RHS)
2942     .addUse(LHS)
2943     .setMIFlags(Flags);
2944 
2945   MI.eraseFromParent();
2946   return true;
2947 }
2948 
2949 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2950                                          MachineRegisterInfo &MRI,
2951                                          MachineIRBuilder &B) const {
2952   B.setInstr(MI);
2953   Register Res = MI.getOperand(0).getReg();
2954   Register LHS = MI.getOperand(1).getReg();
2955   Register RHS = MI.getOperand(2).getReg();
2956 
2957   uint16_t Flags = MI.getFlags();
2958 
2959   LLT S64 = LLT::scalar(64);
2960   LLT S1 = LLT::scalar(1);
2961 
2962   auto One = B.buildFConstant(S64, 1.0);
2963 
2964   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2965     .addUse(LHS)
2966     .addUse(RHS)
2967     .addImm(0)
2968     .setMIFlags(Flags);
2969 
2970   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2971 
2972   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2973     .addUse(DivScale0.getReg(0))
2974     .setMIFlags(Flags);
2975 
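  // Two Newton-Raphson refinement steps on the reciprocal estimate (Fma0..Fma3),
  // followed by a quotient estimate and a final residual term for div_fmas.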
2976   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2977   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2978   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2979 
2980   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2981     .addUse(LHS)
2982     .addUse(RHS)
2983     .addImm(1)
2984     .setMIFlags(Flags);
2985 
2986   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2987   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2988   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2989 
2990   Register Scale;
2991   if (!ST.hasUsableDivScaleConditionOutput()) {
2992     // Work around a hardware bug on SI where the condition output from div_scale
2993     // is not usable.
2994 
2995     LLT S32 = LLT::scalar(32);
2996 
2997     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2998     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2999     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3000     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3001 
3002     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3003                               Scale1Unmerge.getReg(1));
3004     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3005                               Scale0Unmerge.getReg(1));
3006     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3007   } else {
3008     Scale = DivScale1.getReg(1);
3009   }
3010 
3011   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3012     .addUse(Fma4.getReg(0))
3013     .addUse(Fma3.getReg(0))
3014     .addUse(Mul.getReg(0))
3015     .addUse(Scale)
3016     .setMIFlags(Flags);
3017 
3018   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3019     .addUse(Fmas.getReg(0))
3020     .addUse(RHS)
3021     .addUse(LHS)
3022     .setMIFlags(Flags);
3023 
3024   MI.eraseFromParent();
3025   return true;
3026 }
3027 
3028 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3029                                                  MachineRegisterInfo &MRI,
3030                                                  MachineIRBuilder &B) const {
3031   B.setInstr(MI);
3032   Register Res = MI.getOperand(0).getReg();
3033   Register LHS = MI.getOperand(2).getReg();
3034   Register RHS = MI.getOperand(3).getReg();
3035   uint16_t Flags = MI.getFlags();
3036 
3037   LLT S32 = LLT::scalar(32);
3038   LLT S1 = LLT::scalar(1);
3039 
3040   auto Abs = B.buildFAbs(S32, RHS, Flags);
3041   const APFloat C0Val(1.0f);
3042 
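  // 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32. If |rhs| is very large,
  // pre-scale it by 2^-32 so the rcp approximation stays in range, then apply
  // the same 2^-32 scale to the final product to compensate.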
3043   auto C0 = B.buildConstant(S32, 0x6f800000);
3044   auto C1 = B.buildConstant(S32, 0x2f800000);
3045   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3046 
3047   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3048   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3049 
3050   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3051 
3052   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3053     .addUse(Mul0.getReg(0))
3054     .setMIFlags(Flags);
3055 
3056   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3057 
3058   B.buildFMul(Res, Sel, Mul1, Flags);
3059 
3060   MI.eraseFromParent();
3061   return true;
3062 }
3063 
3064 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3065                                                  MachineRegisterInfo &MRI,
3066                                                  MachineIRBuilder &B) const {
3067   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3068   if (!MFI->isEntryFunction()) {
3069     return legalizePreloadedArgIntrin(MI, MRI, B,
3070                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3071   }
3072 
3073   B.setInstr(MI);
3074 
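  // The implicit arguments are laid out directly after the kernel's explicit
  // arguments, so the pointer is the kernarg segment pointer plus a fixed
  // offset.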
3075   uint64_t Offset =
3076     ST.getTargetLowering()->getImplicitParameterOffset(
3077       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3078   Register DstReg = MI.getOperand(0).getReg();
3079   LLT DstTy = MRI.getType(DstReg);
3080   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3081 
3082   const ArgDescriptor *Arg;
3083   const TargetRegisterClass *RC;
3084   std::tie(Arg, RC)
3085     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3086   if (!Arg)
3087     return false;
3088 
3089   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3090   if (!loadInputValue(KernargPtrReg, B, Arg))
3091     return false;
3092 
3093   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3094   MI.eraseFromParent();
3095   return true;
3096 }
3097 
3098 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3099                                               MachineRegisterInfo &MRI,
3100                                               MachineIRBuilder &B,
3101                                               unsigned AddrSpace) const {
3102   B.setInstr(MI);
3103   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
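  // A flat pointer lies in the given segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.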
3104   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3105   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3106   MI.eraseFromParent();
3107   return true;
3108 }
3109 
3110 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3111 // offset (the offset that is included in bounds checking and swizzling, to be
3112 // split between the instruction's voffset and immoffset fields) and soffset
3113 // (the offset that is excluded from bounds checking and swizzling, to go in
3114 // the instruction's soffset field).  This function takes the first kind of
3115 // offset and figures out how to split it between voffset and immoffset.
3116 std::tuple<Register, unsigned, unsigned>
3117 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3118                                         Register OrigOffset) const {
3119   const unsigned MaxImm = 4095;
3120   Register BaseReg;
3121   unsigned TotalConstOffset;
3122   MachineInstr *OffsetDef;
3123   const LLT S32 = LLT::scalar(32);
3124 
3125   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3126     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3127 
3128   unsigned ImmOffset = TotalConstOffset;
3129 
3130   // If the immediate value is too big for the immoffset field, keep only the
3131   // low 12 bits in the immoffset field so that the value that is copied/added
3132   // for the voffset field is a multiple of 4096, and it stands a better chance
3133   // of being CSEd with the copy/add for another similar load/store.
3134   // However, do not do that rounding down to a multiple of 4096 if that is a
3135   // negative number, as it appears to be illegal to have a negative offset
3136   // in the vgpr, even if adding the immediate offset makes it positive.
3137   unsigned Overflow = ImmOffset & ~MaxImm;
3138   ImmOffset -= Overflow;
3139   if ((int32_t)Overflow < 0) {
3140     Overflow += ImmOffset;
3141     ImmOffset = 0;
3142   }
3143 
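  // For example, an offset of 4100 yields Overflow = 4096 (added into the
  // voffset register below) and ImmOffset = 4 (kept as the immediate).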
3144   if (Overflow != 0) {
3145     if (!BaseReg) {
3146       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3147     } else {
3148       auto OverflowVal = B.buildConstant(S32, Overflow);
3149       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3150     }
3151   }
3152 
3153   if (!BaseReg)
3154     BaseReg = B.buildConstant(S32, 0).getReg(0);
3155 
3156   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3157 }
3158 
3159 /// Handle register layout difference for f16 images for some subtargets.
3160 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3161                                              MachineRegisterInfo &MRI,
3162                                              Register Reg) const {
3163   if (!ST.hasUnpackedD16VMem())
3164     return Reg;
3165 
3166   const LLT S16 = LLT::scalar(16);
3167   const LLT S32 = LLT::scalar(32);
3168   LLT StoreVT = MRI.getType(Reg);
3169   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3170 
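  // On unpacked subtargets each 16-bit element occupies the low half of a
  // 32-bit register, so e.g. <4 x s16> data is widened to <4 x s32>.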
3171   auto Unmerge = B.buildUnmerge(S16, Reg);
3172 
3173   SmallVector<Register, 4> WideRegs;
3174   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3175     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3176 
3177   int NumElts = StoreVT.getNumElements();
3178 
3179   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3180 }
3181 
3182 Register AMDGPULegalizerInfo::fixStoreSourceType(
3183   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3184   MachineRegisterInfo *MRI = B.getMRI();
3185   LLT Ty = MRI->getType(VData);
3186 
3187   const LLT S16 = LLT::scalar(16);
3188 
3189   // Fix up illegal register types for i8 and i16 stores by any-extending to i32.
3190   if (Ty == LLT::scalar(8) || Ty == S16) {
3191     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3192     return AnyExt;
3193   }
3194 
3195   if (Ty.isVector()) {
3196     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3197       if (IsFormat)
3198         return handleD16VData(B, *MRI, VData);
3199     }
3200   }
3201 
3202   return VData;
3203 }
3204 
3205 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3206                                               MachineRegisterInfo &MRI,
3207                                               MachineIRBuilder &B,
3208                                               bool IsTyped,
3209                                               bool IsFormat) const {
3210   B.setInstr(MI);
3211 
3212   Register VData = MI.getOperand(1).getReg();
3213   LLT Ty = MRI.getType(VData);
3214   LLT EltTy = Ty.getScalarType();
3215   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3216   const LLT S32 = LLT::scalar(32);
3217 
3218   VData = fixStoreSourceType(B, VData, IsFormat);
3219   Register RSrc = MI.getOperand(2).getReg();
3220 
3221   MachineMemOperand *MMO = *MI.memoperands_begin();
3222   const int MemSize = MMO->getSize();
3223 
3224   unsigned ImmOffset;
3225   unsigned TotalOffset;
3226 
3227   // The typed intrinsics add an immediate after the registers.
3228   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3229 
3230   // The struct intrinsic variants add one additional operand over raw.
3231   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3232   Register VIndex;
3233   int OpOffset = 0;
3234   if (HasVIndex) {
3235     VIndex = MI.getOperand(3).getReg();
3236     OpOffset = 1;
3237   }
3238 
3239   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3240   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3241 
3242   unsigned Format = 0;
3243   if (IsTyped) {
3244     Format = MI.getOperand(5 + OpOffset).getImm();
3245     ++OpOffset;
3246   }
3247 
3248   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3249 
3250   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3251   if (TotalOffset != 0)
3252     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3253 
3254   unsigned Opc;
3255   if (IsTyped) {
3256     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3257                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3258   } else if (IsFormat) {
3259     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3260                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3261   } else {
3262     switch (MemSize) {
3263     case 1:
3264       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3265       break;
3266     case 2:
3267       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3268       break;
3269     default:
3270       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3271       break;
3272     }
3273   }
3274 
3275   if (!VIndex)
3276     VIndex = B.buildConstant(S32, 0).getReg(0);
3277 
3278   auto MIB = B.buildInstr(Opc)
3279     .addUse(VData)              // vdata
3280     .addUse(RSrc)               // rsrc
3281     .addUse(VIndex)             // vindex
3282     .addUse(VOffset)            // voffset
3283     .addUse(SOffset)            // soffset
3284     .addImm(ImmOffset);         // offset(imm)
3285 
3286   if (IsTyped)
3287     MIB.addImm(Format);
3288 
3289   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3290      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3291      .addMemOperand(MMO);
3292 
3293   MI.eraseFromParent();
3294   return true;
3295 }
3296 
3297 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3298                                              MachineRegisterInfo &MRI,
3299                                              MachineIRBuilder &B,
3300                                              bool IsFormat,
3301                                              bool IsTyped) const {
3302   B.setInstr(MI);
3303 
3304   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3305   MachineMemOperand *MMO = *MI.memoperands_begin();
3306   const int MemSize = MMO->getSize();
3307   const LLT S32 = LLT::scalar(32);
3308 
3309   Register Dst = MI.getOperand(0).getReg();
3310   Register RSrc = MI.getOperand(2).getReg();
3311 
3312   // The typed intrinsics add an immediate after the registers.
3313   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3314 
3315   // The struct intrinsic variants add one additional operand over raw.
3316   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3317   Register VIndex;
3318   int OpOffset = 0;
3319   if (HasVIndex) {
3320     VIndex = MI.getOperand(3).getReg();
3321     OpOffset = 1;
3322   }
3323 
3324   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3325   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3326 
3327   unsigned Format = 0;
3328   if (IsTyped) {
3329     Format = MI.getOperand(5 + OpOffset).getImm();
3330     ++OpOffset;
3331   }
3332 
3333   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3334   unsigned ImmOffset;
3335   unsigned TotalOffset;
3336 
3337   LLT Ty = MRI.getType(Dst);
3338   LLT EltTy = Ty.getScalarType();
3339   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3340   const bool Unpacked = ST.hasUnpackedD16VMem();
3341 
3342   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3343   if (TotalOffset != 0)
3344     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3345 
3346   unsigned Opc;
3347 
3348   if (IsTyped) {
3349     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3350                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3351   } else if (IsFormat) {
3352     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3353                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3354   } else {
3355     switch (MemSize) {
3356     case 1:
3357       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3358       break;
3359     case 2:
3360       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3361       break;
3362     default:
3363       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3364       break;
3365     }
3366   }
3367 
3368   Register LoadDstReg;
3369 
3370   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3371   LLT UnpackedTy = Ty.changeElementSize(32);
3372 
3373   if (IsExtLoad)
3374     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3375   else if (Unpacked && IsD16 && Ty.isVector())
3376     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3377   else
3378     LoadDstReg = Dst;
3379 
3380   if (!VIndex)
3381     VIndex = B.buildConstant(S32, 0).getReg(0);
3382 
3383   auto MIB = B.buildInstr(Opc)
3384     .addDef(LoadDstReg)         // vdata
3385     .addUse(RSrc)               // rsrc
3386     .addUse(VIndex)             // vindex
3387     .addUse(VOffset)            // voffset
3388     .addUse(SOffset)            // soffset
3389     .addImm(ImmOffset);         // offset(imm)
3390 
3391   if (IsTyped)
3392     MIB.addImm(Format);
3393 
3394   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3395      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3396      .addMemOperand(MMO);
3397 
3398   if (LoadDstReg != Dst) {
3399     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3400 
3401     // The result was widened for the extending load; truncate back to Dst.
3402     if (IsExtLoad)
3403       B.buildTrunc(Dst, LoadDstReg);
3404     else {
3405       // Repack to original 16-bit vector result
3406       // FIXME: G_TRUNC should work, but legalization currently fails
3407       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3408       SmallVector<Register, 4> Repack;
3409       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3410         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3411       B.buildMerge(Dst, Repack);
3412     }
3413   }
3414 
3415   MI.eraseFromParent();
3416   return true;
3417 }
3418 
3419 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3420                                                MachineIRBuilder &B,
3421                                                bool IsInc) const {
3422   B.setInstr(MI);
3423   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3424                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3425   B.buildInstr(Opc)
3426     .addDef(MI.getOperand(0).getReg())
3427     .addUse(MI.getOperand(2).getReg())
3428     .addUse(MI.getOperand(3).getReg())
3429     .cloneMemRefs(MI);
3430   MI.eraseFromParent();
3431   return true;
3432 }
3433 
3434 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3435   switch (IntrID) {
3436   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3437   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3438     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3439   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3440   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3441     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3442   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3443   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3444     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3445   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3446   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3447     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3448   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3449   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3450     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3451   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3452   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3453     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3454   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3455   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3456     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3457   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3458   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3459     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3460   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3461   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3462     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3463   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3464   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3465     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3466   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3467   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3468     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3469   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3470   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3471     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3472   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3473   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3474     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3475   default:
3476     llvm_unreachable("unhandled atomic opcode");
3477   }
3478 }
3479 
3480 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3481                                                MachineIRBuilder &B,
3482                                                Intrinsic::ID IID) const {
3483   B.setInstr(MI);
3484 
3485   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3486                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3487 
3488   Register Dst = MI.getOperand(0).getReg();
3489   Register VData = MI.getOperand(2).getReg();
3490 
3491   Register CmpVal;
3492   int OpOffset = 0;
3493 
3494   if (IsCmpSwap) {
3495     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3496     ++OpOffset;
3497   }
3498 
3499   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3500   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3501 
3502   // The struct intrinsic variants add one additional operand over raw.
3503   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3504   Register VIndex;
3505   if (HasVIndex) {
3506     VIndex = MI.getOperand(4 + OpOffset).getReg();
3507     ++OpOffset;
3508   }
3509 
3510   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3511   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3512   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3513 
3514   MachineMemOperand *MMO = *MI.memoperands_begin();
3515 
3516   unsigned ImmOffset;
3517   unsigned TotalOffset;
3518   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3519   if (TotalOffset != 0)
3520     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3521 
3522   if (!VIndex)
3523     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3524 
3525   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3526     .addDef(Dst)
3527     .addUse(VData); // vdata
3528 
3529   if (IsCmpSwap)
3530     MIB.addReg(CmpVal);
3531 
3532   MIB.addUse(RSrc)               // rsrc
3533      .addUse(VIndex)             // vindex
3534      .addUse(VOffset)            // voffset
3535      .addUse(SOffset)            // soffset
3536      .addImm(ImmOffset)          // offset(imm)
3537      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3538      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3539      .addMemOperand(MMO);
3540 
3541   MI.eraseFromParent();
3542   return true;
3543 }
3544 
3545 /// Pack the s16 typed address operands of \p MI, starting at \p AddrIdx, into
3546 /// dword sized v2s16 registers appended to \p PackedAddrs.
3547 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3548                                         SmallVectorImpl<Register> &PackedAddrs,
3549                                         int AddrIdx, int DimIdx, int NumVAddrs,
3550                                         int NumGradients) {
3551   const LLT S16 = LLT::scalar(16);
3552   const LLT V2S16 = LLT::vector(2, 16);
3553 
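  // For example, s16 2D coordinates (u, v) pack into a single v2s16 vaddr
  // register; an odd trailing coordinate is paired with undef.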
3554   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3555     MachineOperand &SrcOp = MI.getOperand(I);
3556     if (!SrcOp.isReg())
3557       continue; // _L to _LZ may have eliminated this.
3558 
3559     Register AddrReg = SrcOp.getReg();
3560 
3561     if (I < DimIdx) {
3562       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3563       PackedAddrs.push_back(AddrReg);
3564     } else {
3565       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3566       // derivatives dx/dh and dx/dv are packed with undef.
3567       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3568           ((NumGradients / 2) % 2 == 1 &&
3569            (I == DimIdx + (NumGradients / 2) - 1 ||
3570             I == DimIdx + NumGradients - 1)) ||
3571           // Check for _L to _LZ optimization
3572           !MI.getOperand(I + 1).isReg()) {
3573         PackedAddrs.push_back(
3574             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3575                 .getReg(0));
3576       } else {
3577         PackedAddrs.push_back(
3578             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3579                 .getReg(0));
3580         ++I;
3581       }
3582     }
3583   }
3584 }
3585 
3586 /// Convert from separate vaddr components to a single vector address register,
3587 /// and replace the remaining operands with $noreg.
3588 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3589                                      int DimIdx, int NumVAddrs) {
3590   const LLT S32 = LLT::scalar(32);
3591 
3592   SmallVector<Register, 8> AddrRegs;
3593   for (int I = 0; I != NumVAddrs; ++I) {
3594     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3595     if (SrcOp.isReg()) {
3596       AddrRegs.push_back(SrcOp.getReg());
3597       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3598     }
3599   }
3600 
3601   int NumAddrRegs = AddrRegs.size();
3602   if (NumAddrRegs != 1) {
3603     // Round up to 8 elements for v5-v7
3604     // FIXME: Missing intermediate sized register classes and instructions.
3605     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3606       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3607       auto Undef = B.buildUndef(S32);
3608       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3609       NumAddrRegs = RoundedNumRegs;
3610     }
3611 
3612     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3613     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3614   }
3615 
3616   for (int I = 1; I != NumVAddrs; ++I) {
3617     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3618     if (SrcOp.isReg())
3619       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3620   }
3621 }
3622 
3623 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3624 ///
3625 /// Depending on the subtarget, load/store with 16-bit element data need to be
3626 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3627 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3628 /// registers.
3629 ///
3630 /// We don't want to directly select image instructions just yet, but we also
3631 /// want to expose all register repacking to the legalizer/combiners. We also
3632 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3633 /// defining a multitude of intermediate image instructions, directly hack on
3634 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3635 /// now-unnecessary arguments with $noreg.
3636 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3637     MachineInstr &MI, MachineIRBuilder &B,
3638     GISelChangeObserver &Observer,
3639     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3640   B.setInstr(MI);
3641 
3642   const int NumDefs = MI.getNumExplicitDefs();
3643   bool IsTFE = NumDefs == 2;
3644   // We are only processing the operands of d16 image operations on subtargets
3645   // that use the unpacked register layout, or need to repack the TFE result.
3646 
3647   // TODO: Do we need to guard against already legalized intrinsics?
3648   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3649     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3650 
3651   MachineRegisterInfo *MRI = B.getMRI();
3652   const LLT S32 = LLT::scalar(32);
3653   const LLT S16 = LLT::scalar(16);
3654   const LLT V2S16 = LLT::vector(2, 16);
3655 
3656   // Index of first address argument
3657   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3658 
3659   // Check for 16-bit addresses and pack them into dwords if so.
3660   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3661   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3662   const bool IsA16 = AddrTy == S16;
3663 
3664   int NumVAddrs, NumGradients;
3665   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3666   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3667     getDMaskIdx(BaseOpcode, NumDefs);
3668   unsigned DMask = 0;
3669 
3670   int DMaskLanes = 0;
3671   if (!BaseOpcode->Atomic) {
3672     DMask = MI.getOperand(DMaskIdx).getImm();
3673     if (BaseOpcode->Gather4) {
3674       DMaskLanes = 4;
3675     } else if (DMask != 0) {
3676       DMaskLanes = countPopulation(DMask);
3677     } else if (!IsTFE && !BaseOpcode->Store) {
3678       // If dmask is 0, this is a no-op load. This can be eliminated.
3679       B.buildUndef(MI.getOperand(0));
3680       MI.eraseFromParent();
3681       return true;
3682     }
3683   }
3684 
3685   Observer.changingInstr(MI);
3686   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3687 
3688   unsigned NewOpcode = NumDefs == 0 ?
3689     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3690 
3691   // Track that we legalized this
3692   MI.setDesc(B.getTII().get(NewOpcode));
3693 
3694   // We still expect an error flag since TFE is on and dmask is 0. Force dmask
3695   // to be at least 1, otherwise the instruction will fail.
3696   if (IsTFE && DMask == 0) {
3697     DMask = 0x1;
3698     DMaskLanes = 1;
3699     MI.getOperand(DMaskIdx).setImm(DMask);
3700   }
3701 
3702   if (BaseOpcode->Atomic) {
3703     Register VData0 = MI.getOperand(2).getReg();
3704     LLT Ty = MRI->getType(VData0);
3705 
3706     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3707     if (Ty.isVector())
3708       return false;
3709 
3710     if (BaseOpcode->AtomicX2) {
3711       Register VData1 = MI.getOperand(3).getReg();
3712       // The two values are packed in one register.
3713       LLT PackedTy = LLT::vector(2, Ty);
3714       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3715       MI.getOperand(2).setReg(Concat.getReg(0));
3716       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3717     }
3718   }
3719 
3720   int CorrectedNumVAddrs = NumVAddrs;
3721 
3722   // Optimize _L to _LZ when _L is zero
3723   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3724         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3725     const ConstantFP *ConstantLod;
3726     const int LodIdx = AddrIdx + NumVAddrs - 1;
3727 
3728     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3729       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3730         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3731         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3732           LZMappingInfo->LZ, ImageDimIntr->Dim);
3733 
3734         // The starting indexes should remain in the same place.
3735         --NumVAddrs;
3736         --CorrectedNumVAddrs;
3737 
3738         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3739           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3740         MI.RemoveOperand(LodIdx);
3741       }
3742     }
3743   }
3744 
3745   // Optimize _mip away, when 'lod' is zero
3746   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3747     int64_t ConstantLod;
3748     const int LodIdx = AddrIdx + NumVAddrs - 1;
3749 
3750     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3751       if (ConstantLod == 0) {
3752         // TODO: Change the intrinsic opcode and remove the operand instead of
3753         // replacing it with 0, as the _L to _LZ handling is done above.
3754         MI.getOperand(LodIdx).ChangeToImmediate(0);
3755         --CorrectedNumVAddrs;
3756       }
3757     }
3758   }
3759 
3760   // If the register allocator cannot place the address registers contiguously
3761   // without introducing moves, then using the non-sequential address encoding
3762   // is always preferable, since it saves VALU instructions and is usually a
3763   // wash in terms of code size or even better.
3764   //
3765   // However, we currently have no way of hinting to the register allocator
3766   // that MIMG addresses should be placed contiguously when it is possible to
3767   // do so, so force non-NSA for the common 2-address case as a heuristic.
3768   //
3769   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3770   // allocation when possible.
3771   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3772 
3773   // Rewrite the addressing register layout before doing anything else.
3774   if (IsA16) {
3775     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3776     // should be introduced.
3777     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3778       return false;
3779 
3780     if (NumVAddrs > 1) {
3781       SmallVector<Register, 4> PackedRegs;
3782       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3783                                   NumGradients);
3784 
3785       if (!UseNSA && PackedRegs.size() > 1) {
3786         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3787         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3788         PackedRegs[0] = Concat.getReg(0);
3789         PackedRegs.resize(1);
3790       }
3791 
3792       const int NumPacked = PackedRegs.size();
3793       for (int I = 0; I != NumVAddrs; ++I) {
3794         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3795         if (!SrcOp.isReg()) {
3796           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3797           continue;
3798         }
3799 
3800         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3801 
3802         if (I < NumPacked)
3803           SrcOp.setReg(PackedRegs[I]);
3804         else
3805           SrcOp.setReg(AMDGPU::NoRegister);
3806       }
3807     }
3808   } else if (!UseNSA && NumVAddrs > 1) {
3809     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3810   }
3811 
3812 
3813   if (BaseOpcode->Store) { // No TFE for stores?
3814     // TODO: Handle dmask trim
3815     Register VData = MI.getOperand(1).getReg();
3816     LLT Ty = MRI->getType(VData);
3817     if (!Ty.isVector() || Ty.getElementType() != S16)
3818       return true;
3819 
3820     B.setInstr(MI);
3821 
3822     Register RepackedReg = handleD16VData(B, *MRI, VData);
3823     if (RepackedReg != VData) {
3824       MI.getOperand(1).setReg(RepackedReg);
3825     }
3826 
3827     return true;
3828   }
3829 
3830   Register DstReg = MI.getOperand(0).getReg();
3831   LLT Ty = MRI->getType(DstReg);
3832   const LLT EltTy = Ty.getScalarType();
3833   const bool IsD16 = Ty.getScalarType() == S16;
3834   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3835 
3836   // Confirm that the return type is large enough for the dmask specified
3837   if (NumElts < DMaskLanes)
3838     return false;
3839 
3840   if (NumElts > 4 || DMaskLanes > 4)
3841     return false;
3842 
3843   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3844   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3845 
3846   // The raw dword aligned data component of the load. The only legal cases
3847   // where this matters should be when using the packed D16 format, for
3848   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3849   LLT RoundedTy;
3850 
3851   // S32 vector to cover all data, plus the TFE result element.
3852   LLT TFETy;
3853 
3854   // Register type to use for each loaded component. Will be S32 or V2S16.
3855   LLT RegTy;
3856 
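  // For example, a packed D16 load of <3 x s16> with three dmask lanes ends up
  // with RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = v2s16 (s32 when
  // TFE is enabled).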
3857   if (IsD16 && ST.hasUnpackedD16VMem()) {
3858     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3859     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3860     RegTy = S32;
3861   } else {
3862     unsigned EltSize = EltTy.getSizeInBits();
3863     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3864     unsigned RoundedSize = 32 * RoundedElts;
3865     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3866     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3867     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3868   }
3869 
3870   // The return type does not need adjustment.
3871   // TODO: Should we change s16 case to s32 or <2 x s16>?
3872   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3873     return true;
3874 
3875   Register Dst1Reg;
3876 
3877   // Insert after the instruction.
3878   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3879 
3880   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3881   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3882   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3883   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3884 
3885   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3886 
3887   MI.getOperand(0).setReg(NewResultReg);
3888 
3889   // In the IR, TFE is supposed to be used with a 2 element struct return
3890   // type. The instruction really returns these two values in one contiguous
3891   // register, with one additional dword beyond the loaded data. Rewrite the
3892   // return type to use a single register result.
3893 
3894   if (IsTFE) {
3895     Dst1Reg = MI.getOperand(1).getReg();
3896     if (MRI->getType(Dst1Reg) != S32)
3897       return false;
3898 
3899     // TODO: Make sure the TFE operand bit is set.
3900     MI.RemoveOperand(1);
3901 
3902     // Handle the easy case that requires no repack instructions.
3903     if (Ty == S32) {
3904       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3905       return true;
3906     }
3907   }
3908 
3909   // Now figure out how to copy the new result register back into the old
3910   // result.
3911   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3912 
3913   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3914 
3915   if (ResultNumRegs == 1) {
3916     assert(!IsTFE);
3917     ResultRegs[0] = NewResultReg;
3918   } else {
3919     // We have to repack into a new vector of some kind.
3920     for (int I = 0; I != NumDataRegs; ++I)
3921       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3922     B.buildUnmerge(ResultRegs, NewResultReg);
3923 
3924     // Drop the final TFE element to get the data part. The TFE result is
3925     // directly written to the right place already.
3926     if (IsTFE)
3927       ResultRegs.resize(NumDataRegs);
3928   }
3929 
3930   // For an s16 scalar result, we form an s32 result with a truncate regardless
3931   // of packed vs. unpacked.
3932   if (IsD16 && !Ty.isVector()) {
3933     B.buildTrunc(DstReg, ResultRegs[0]);
3934     return true;
3935   }
3936 
3937   // Avoid a build/concat_vector of 1 entry.
3938   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3939     B.buildBitcast(DstReg, ResultRegs[0]);
3940     return true;
3941   }
3942 
3943   assert(Ty.isVector());
3944 
3945   if (IsD16) {
3946     // For packed D16 results with TFE enabled, all the data components are
3947     // S32. Cast back to the expected type.
3948     //
3949     // TODO: We don't really need to load s32 elements. We would only need one
3950     // cast for the TFE result if a multiple of v2s16 was used.
3951     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3952       for (Register &Reg : ResultRegs)
3953         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3954     } else if (ST.hasUnpackedD16VMem()) {
3955       for (Register &Reg : ResultRegs)
3956         Reg = B.buildTrunc(S16, Reg).getReg(0);
3957     }
3958   }
3959 
3960   auto padWithUndef = [&](LLT Ty, int NumElts) {
3961     if (NumElts == 0)
3962       return;
3963     Register Undef = B.buildUndef(Ty).getReg(0);
3964     for (int I = 0; I != NumElts; ++I)
3965       ResultRegs.push_back(Undef);
3966   };
3967 
3968   // Pad out any elements eliminated due to the dmask.
3969   LLT ResTy = MRI->getType(ResultRegs[0]);
3970   if (!ResTy.isVector()) {
3971     padWithUndef(ResTy, NumElts - ResultRegs.size());
3972     B.buildBuildVector(DstReg, ResultRegs);
3973     return true;
3974   }
3975 
3976   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3977   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3978 
3979   // Deal with the one annoying legal case.
3980   const LLT V3S16 = LLT::vector(3, 16);
3981   if (Ty == V3S16) {
3982     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3983     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3984     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3985     return true;
3986   }
3987 
3988   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3989   B.buildConcatVectors(DstReg, ResultRegs);
3990   return true;
3991 }
3992 
3993 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3994   MachineInstr &MI, MachineIRBuilder &B,
3995   GISelChangeObserver &Observer) const {
3996   Register Dst = MI.getOperand(0).getReg();
3997   LLT Ty = B.getMRI()->getType(Dst);
3998   unsigned Size = Ty.getSizeInBits();
3999   MachineFunction &MF = B.getMF();
4000 
4001   Observer.changingInstr(MI);
4002 
4003   // FIXME: We don't really need this intermediate instruction. The intrinsic
4004   // should be fixed to have a memory operand. Since it's readnone, we're not
4005   // allowed to add one.
4006   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4007   MI.RemoveOperand(1); // Remove intrinsic ID
4008 
4009   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4010   // TODO: Should this use datalayout alignment?
4011   const unsigned MemSize = (Size + 7) / 8;
4012   const Align MemAlign(4);
4013   MachineMemOperand *MMO = MF.getMachineMemOperand(
4014       MachinePointerInfo(),
4015       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4016           MachineMemOperand::MOInvariant,
4017       MemSize, MemAlign);
4018   MI.addMemOperand(MF, MMO);
4019 
4020   // There are no 96-bit result scalar loads, but widening to 128-bit should
4021   // always be legal. We may need to restore this to a 96-bit result if it turns
4022   // out this needs to be converted to a vector load during RegBankSelect.
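  // For example, a <3 x s32> result is widened to <4 x s32>, and an s96 result
  // to s128, here.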
4023   if (!isPowerOf2_32(Size)) {
4024     LegalizerHelper Helper(MF, *this, Observer, B);
4025     B.setInstr(MI);
4026 
4027     if (Ty.isVector())
4028       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4029     else
4030       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4031   }
4032 
4033   Observer.changedInstr(MI);
4034   return true;
4035 }
4036 
4037 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4038                                                 MachineRegisterInfo &MRI,
4039                                                 MachineIRBuilder &B) const {
4040   B.setInstr(MI);
4041 
4042   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4043   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4044       !ST.isTrapHandlerEnabled()) {
4045     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4046   } else {
4047     // Pass queue pointer to trap handler as input, and insert trap instruction
4048     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4049     const ArgDescriptor *Arg =
4050         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4051     if (!Arg)
4052       return false;
4053     MachineRegisterInfo &MRI = *B.getMRI();
4054     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4055     Register LiveIn = getLiveInRegister(
4056         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4057         /*InsertLiveInCopy=*/false);
4058     if (!loadInputValue(LiveIn, B, Arg))
4059       return false;
4060     B.buildCopy(SGPR01, LiveIn);
4061     B.buildInstr(AMDGPU::S_TRAP)
4062         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4063         .addReg(SGPR01, RegState::Implicit);
4064   }
4065 
4066   MI.eraseFromParent();
4067   return true;
4068 }
4069 
4070 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4071     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4072   B.setInstr(MI);
4073 
4074   // If this is a non-HSA path or the trap handler is disabled, report a warning
4075   // accordingly.
4076   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4077       !ST.isTrapHandlerEnabled()) {
4078     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4079                                      "debugtrap handler not supported",
4080                                      MI.getDebugLoc(), DS_Warning);
4081     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4082     Ctx.diagnose(NoTrap);
4083   } else {
4084     // Insert debug-trap instruction
4085     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4086   }
4087 
4088   MI.eraseFromParent();
4089   return true;
4090 }
4091 
4092 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4093                                             MachineIRBuilder &B,
4094                                             GISelChangeObserver &Observer) const {
4095   MachineRegisterInfo &MRI = *B.getMRI();
4096 
4097   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4098   auto IntrID = MI.getIntrinsicID();
4099   switch (IntrID) {
4100   case Intrinsic::amdgcn_if:
4101   case Intrinsic::amdgcn_else: {
4102     MachineInstr *Br = nullptr;
4103     MachineBasicBlock *UncondBrTarget = nullptr;
4104     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4105       const SIRegisterInfo *TRI
4106         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4107 
4108       B.setInstr(*BrCond);
4109       Register Def = MI.getOperand(1).getReg();
4110       Register Use = MI.getOperand(3).getReg();
4111 
4112       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4113       if (IntrID == Intrinsic::amdgcn_if) {
4114         B.buildInstr(AMDGPU::SI_IF)
4115           .addDef(Def)
4116           .addUse(Use)
4117           .addMBB(UncondBrTarget);
4118       } else {
4119         B.buildInstr(AMDGPU::SI_ELSE)
4120           .addDef(Def)
4121           .addUse(Use)
4122           .addMBB(UncondBrTarget)
4123           .addImm(0);
4124       }
4125 
4126       if (Br) {
4127         Br->getOperand(0).setMBB(CondBrTarget);
4128       } else {
4129         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4130         // since we're swapping branch targets it needs to be reinserted.
4131         // FIXME: IRTranslator should probably not do this
4132         B.buildBr(*CondBrTarget);
4133       }
4134 
4135       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4136       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4137       MI.eraseFromParent();
4138       BrCond->eraseFromParent();
4139       return true;
4140     }
4141 
4142     return false;
4143   }
4144   case Intrinsic::amdgcn_loop: {
4145     MachineInstr *Br = nullptr;
4146     MachineBasicBlock *UncondBrTarget = nullptr;
4147     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4148       const SIRegisterInfo *TRI
4149         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4150 
4151       B.setInstr(*BrCond);
4152 
4153       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4154       Register Reg = MI.getOperand(2).getReg();
4155       B.buildInstr(AMDGPU::SI_LOOP)
4156         .addUse(Reg)
4157         .addMBB(UncondBrTarget);
4158 
4159       if (Br)
4160         Br->getOperand(0).setMBB(CondBrTarget);
4161       else
4162         B.buildBr(*CondBrTarget);
4163 
4164       MI.eraseFromParent();
4165       BrCond->eraseFromParent();
4166       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4167       return true;
4168     }
4169 
4170     return false;
4171   }
4172   case Intrinsic::amdgcn_kernarg_segment_ptr:
4173     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4174       B.setInstr(MI);
4175       // This only makes sense to call in a kernel, so just lower to null.
4176       B.buildConstant(MI.getOperand(0).getReg(), 0);
4177       MI.eraseFromParent();
4178       return true;
4179     }
4180 
4181     return legalizePreloadedArgIntrin(
4182       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4183   case Intrinsic::amdgcn_implicitarg_ptr:
4184     return legalizeImplicitArgPtr(MI, MRI, B);
4185   case Intrinsic::amdgcn_workitem_id_x:
4186     return legalizePreloadedArgIntrin(MI, MRI, B,
4187                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4188   case Intrinsic::amdgcn_workitem_id_y:
4189     return legalizePreloadedArgIntrin(MI, MRI, B,
4190                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4191   case Intrinsic::amdgcn_workitem_id_z:
4192     return legalizePreloadedArgIntrin(MI, MRI, B,
4193                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4194   case Intrinsic::amdgcn_workgroup_id_x:
4195     return legalizePreloadedArgIntrin(MI, MRI, B,
4196                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4197   case Intrinsic::amdgcn_workgroup_id_y:
4198     return legalizePreloadedArgIntrin(MI, MRI, B,
4199                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4200   case Intrinsic::amdgcn_workgroup_id_z:
4201     return legalizePreloadedArgIntrin(MI, MRI, B,
4202                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4203   case Intrinsic::amdgcn_dispatch_ptr:
4204     return legalizePreloadedArgIntrin(MI, MRI, B,
4205                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4206   case Intrinsic::amdgcn_queue_ptr:
4207     return legalizePreloadedArgIntrin(MI, MRI, B,
4208                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4209   case Intrinsic::amdgcn_implicit_buffer_ptr:
4210     return legalizePreloadedArgIntrin(
4211       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4212   case Intrinsic::amdgcn_dispatch_id:
4213     return legalizePreloadedArgIntrin(MI, MRI, B,
4214                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4215   case Intrinsic::amdgcn_fdiv_fast:
4216     return legalizeFDIVFastIntrin(MI, MRI, B);
4217   case Intrinsic::amdgcn_is_shared:
4218     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4219   case Intrinsic::amdgcn_is_private:
4220     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4221   case Intrinsic::amdgcn_wavefrontsize: {
4222     B.setInstr(MI);
4223     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4224     MI.eraseFromParent();
4225     return true;
4226   }
4227   case Intrinsic::amdgcn_s_buffer_load:
4228     return legalizeSBufferLoad(MI, B, Observer);
4229   case Intrinsic::amdgcn_raw_buffer_store:
4230   case Intrinsic::amdgcn_struct_buffer_store:
4231     return legalizeBufferStore(MI, MRI, B, false, false);
4232   case Intrinsic::amdgcn_raw_buffer_store_format:
4233   case Intrinsic::amdgcn_struct_buffer_store_format:
4234     return legalizeBufferStore(MI, MRI, B, false, true);
4235   case Intrinsic::amdgcn_raw_tbuffer_store:
4236   case Intrinsic::amdgcn_struct_tbuffer_store:
4237     return legalizeBufferStore(MI, MRI, B, true, true);
4238   case Intrinsic::amdgcn_raw_buffer_load:
4239   case Intrinsic::amdgcn_struct_buffer_load:
4240     return legalizeBufferLoad(MI, MRI, B, false, false);
4241   case Intrinsic::amdgcn_raw_buffer_load_format:
4242   case Intrinsic::amdgcn_struct_buffer_load_format:
4243     return legalizeBufferLoad(MI, MRI, B, true, false);
4244   case Intrinsic::amdgcn_raw_tbuffer_load:
4245   case Intrinsic::amdgcn_struct_tbuffer_load:
4246     return legalizeBufferLoad(MI, MRI, B, true, true);
4247   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4248   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4249   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4250   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4251   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4252   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4253   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4254   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4255   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4256   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4257   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4258   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4259   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4260   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4261   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4262   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4263   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4264   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4265   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4266   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4267   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4269   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4270   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4271   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4272   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4273     return legalizeBufferAtomic(MI, B, IntrID);
4274   case Intrinsic::amdgcn_atomic_inc:
4275     return legalizeAtomicIncDec(MI, B, true);
4276   case Intrinsic::amdgcn_atomic_dec:
4277     return legalizeAtomicIncDec(MI, B, false);
4278   case Intrinsic::trap:
4279     return legalizeTrapIntrinsic(MI, MRI, B);
4280   case Intrinsic::debugtrap:
4281     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4282   default: {
4283     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4284             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4285       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4286     return true;
4287   }
4288   }
4289 
4290   return true;
4291 }
4292