1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of vector elements up to the next power of two.
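// For example, <3 x s16> becomes <4 x s16>.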
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the scalar size in bits up to the next power of two.
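// For example, s24 becomes s32.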
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
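// True if the scalar element size is a multiple of 32 bits and the whole type
// is no wider than MaxSize bits.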
53 static LegalityPredicate isMultiple32(unsigned TypeIdx,
54                                       unsigned MaxSize = 1024) {
55   return [=](const LegalityQuery &Query) {
56     const LLT Ty = Query.Types[TypeIdx];
57     const LLT EltTy = Ty.getScalarType();
58     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
59   };
60 }
61 
62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
63   return [=](const LegalityQuery &Query) {
64     const LLT Ty = Query.Types[TypeIdx];
65     return Ty.isVector() &&
66            Ty.getNumElements() % 2 != 0 &&
67            Ty.getElementType().getSizeInBits() < 32 &&
68            Ty.getSizeInBits() % 32 != 0;
69   };
70 }
71 
72 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     const LLT EltTy = Ty.getScalarType();
76     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
77   };
78 }
79 
80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getElementType();
84     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
85   };
86 }
87 
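// Reduce the number of elements so each resulting piece is roughly 64 bits
// wide; e.g. for a 160-bit v5s32 this mutation requests v2s32.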
88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     unsigned Size = Ty.getSizeInBits();
93     unsigned Pieces = (Size + 63) / 64;
94     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
95     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
96   };
97 }
98 
99 // Increase the number of vector elements to reach the next multiple of 32-bit
100 // type.
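// For example, a 48-bit v3s16 is padded out to v4s16 (64 bits).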
101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104 
105     const LLT EltTy = Ty.getElementType();
106     const int Size = Ty.getSizeInBits();
107     const int EltSize = EltTy.getSizeInBits();
108     const int NextMul32 = (Size + 31) / 32;
109 
110     assert(EltSize < 32);
111 
112     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
113     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
114   };
115 }
116 
117 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
118   return [=](const LegalityQuery &Query) {
119     const LLT QueryTy = Query.Types[TypeIdx];
120     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
121   };
122 }
123 
124 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
125   return [=](const LegalityQuery &Query) {
126     const LLT QueryTy = Query.Types[TypeIdx];
127     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
128   };
129 }
130 
131 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
132   return [=](const LegalityQuery &Query) {
133     const LLT QueryTy = Query.Types[TypeIdx];
134     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
135   };
136 }
137 
138 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
139 // v2s16.
140 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
141   return [=](const LegalityQuery &Query) {
142     const LLT Ty = Query.Types[TypeIdx];
143     if (Ty.isVector()) {
144       const int EltSize = Ty.getElementType().getSizeInBits();
145       return EltSize == 32 || EltSize == 64 ||
146             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
147              EltSize == 128 || EltSize == 256;
148     }
149 
150     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
151   };
152 }
153 
154 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT QueryTy = Query.Types[TypeIdx];
157     if (!QueryTy.isVector())
158       return false;
159     const LLT EltTy = QueryTy.getElementType();
160     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
161   };
162 }
163 
164 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
165   return [=](const LegalityQuery &Query) {
166     const LLT Ty = Query.Types[TypeIdx];
167     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
168            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
169   };
170 }
171 
172 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
173                                          const GCNTargetMachine &TM)
  : ST(ST_) {
175   using namespace TargetOpcode;
176 
177   auto GetAddrSpacePtr = [&TM](unsigned AS) {
178     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
179   };
180 
181   const LLT S1 = LLT::scalar(1);
182   const LLT S16 = LLT::scalar(16);
183   const LLT S32 = LLT::scalar(32);
184   const LLT S64 = LLT::scalar(64);
185   const LLT S128 = LLT::scalar(128);
186   const LLT S256 = LLT::scalar(256);
187   const LLT S512 = LLT::scalar(512);
188   const LLT S1024 = LLT::scalar(1024);
189 
190   const LLT V2S16 = LLT::vector(2, 16);
191   const LLT V4S16 = LLT::vector(4, 16);
192 
193   const LLT V2S32 = LLT::vector(2, 32);
194   const LLT V3S32 = LLT::vector(3, 32);
195   const LLT V4S32 = LLT::vector(4, 32);
196   const LLT V5S32 = LLT::vector(5, 32);
197   const LLT V6S32 = LLT::vector(6, 32);
198   const LLT V7S32 = LLT::vector(7, 32);
199   const LLT V8S32 = LLT::vector(8, 32);
200   const LLT V9S32 = LLT::vector(9, 32);
201   const LLT V10S32 = LLT::vector(10, 32);
202   const LLT V11S32 = LLT::vector(11, 32);
203   const LLT V12S32 = LLT::vector(12, 32);
204   const LLT V13S32 = LLT::vector(13, 32);
205   const LLT V14S32 = LLT::vector(14, 32);
206   const LLT V15S32 = LLT::vector(15, 32);
207   const LLT V16S32 = LLT::vector(16, 32);
208   const LLT V32S32 = LLT::vector(32, 32);
209 
210   const LLT V2S64 = LLT::vector(2, 64);
211   const LLT V3S64 = LLT::vector(3, 64);
212   const LLT V4S64 = LLT::vector(4, 64);
213   const LLT V5S64 = LLT::vector(5, 64);
214   const LLT V6S64 = LLT::vector(6, 64);
215   const LLT V7S64 = LLT::vector(7, 64);
216   const LLT V8S64 = LLT::vector(8, 64);
217   const LLT V16S64 = LLT::vector(16, 64);
218 
219   std::initializer_list<LLT> AllS32Vectors =
220     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
221      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
222   std::initializer_list<LLT> AllS64Vectors =
223     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
224 
225   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
226   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
227   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
228   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
229   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
230   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
231   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
232 
233   const LLT CodePtr = FlatPtr;
234 
235   const std::initializer_list<LLT> AddrSpaces64 = {
236     GlobalPtr, ConstantPtr, FlatPtr
237   };
238 
239   const std::initializer_list<LLT> AddrSpaces32 = {
240     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
241   };
242 
243   const std::initializer_list<LLT> FPTypesBase = {
244     S32, S64
245   };
246 
247   const std::initializer_list<LLT> FPTypes16 = {
248     S32, S64, S16
249   };
250 
251   const std::initializer_list<LLT> FPTypesPK16 = {
252     S32, S64, S16, V2S16
253   };
254 
255   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
256 
257   setAction({G_BRCOND, S1}, Legal); // VCC branches
258   setAction({G_BRCOND, S32}, Legal); // SCC branches
259 
260   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
261   // elements for v3s16
262   getActionDefinitionsBuilder(G_PHI)
263     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
264     .legalFor(AllS32Vectors)
265     .legalFor(AllS64Vectors)
266     .legalFor(AddrSpaces64)
267     .legalFor(AddrSpaces32)
268     .clampScalar(0, S32, S256)
269     .widenScalarToNextPow2(0, 32)
270     .clampMaxNumElements(0, S32, 16)
271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
272     .legalIf(isPointer(0));
273 
274   if (ST.hasVOP3PInsts()) {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32, S16, V2S16})
277       .clampScalar(0, S16, S32)
278       .clampMaxNumElements(0, S16, 2)
279       .scalarize(0)
280       .widenScalarToNextPow2(0, 32);
281   } else if (ST.has16BitInsts()) {
282     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
283       .legalFor({S32, S16})
284       .clampScalar(0, S16, S32)
285       .scalarize(0)
286       .widenScalarToNextPow2(0, 32);
287   } else {
288     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
289       .legalFor({S32})
290       .clampScalar(0, S32, S32)
291       .scalarize(0);
292   }
293 
294   // FIXME: Not really legal. Placeholder for custom lowering.
295   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
296     .customFor({S32, S64})
297     .clampScalar(0, S32, S64)
298     .widenScalarToNextPow2(0, 32)
299     .scalarize(0);
300 
301   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
302     .legalFor({S32})
303     .clampScalar(0, S32, S32)
304     .scalarize(0);
305 
306   // Report legal for any types we can handle anywhere. For the cases only legal
307   // on the SALU, RegBankSelect will be able to re-legalize.
308   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
309     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
310     .clampScalar(0, S32, S64)
311     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
312     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
313     .widenScalarToNextPow2(0)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
317                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
318     .legalFor({{S32, S1}, {S32, S32}})
319     .minScalar(0, S32)
320     // TODO: .scalarize(0)
321     .lower();
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
341       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344       .clampScalarOrElt(0, S32, S1024)
345       .legalIf(isMultiple32(0))
346       .widenScalarToNextPow2(0, 32)
347       .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
351     .unsupportedFor({PrivatePtr})
352     .custom();
353   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
354 
355   auto &FPOpActions = getActionDefinitionsBuilder(
356     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
357     .legalFor({S32, S64});
358   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
359     .customFor({S32, S64});
360   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
361     .customFor({S32, S64});
362 
363   if (ST.has16BitInsts()) {
364     if (ST.hasVOP3PInsts())
365       FPOpActions.legalFor({S16, V2S16});
366     else
367       FPOpActions.legalFor({S16});
368 
369     TrigActions.customFor({S16});
370     FDIVActions.customFor({S16});
371   }
372 
373   auto &MinNumMaxNum = getActionDefinitionsBuilder({
374       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
375 
376   if (ST.hasVOP3PInsts()) {
377     MinNumMaxNum.customFor(FPTypesPK16)
378       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
379       .clampMaxNumElements(0, S16, 2)
380       .clampScalar(0, S16, S64)
381       .scalarize(0);
382   } else if (ST.has16BitInsts()) {
383     MinNumMaxNum.customFor(FPTypes16)
384       .clampScalar(0, S16, S64)
385       .scalarize(0);
386   } else {
387     MinNumMaxNum.customFor(FPTypesBase)
388       .clampScalar(0, S32, S64)
389       .scalarize(0);
390   }
391 
392   if (ST.hasVOP3PInsts())
393     FPOpActions.clampMaxNumElements(0, S16, 2);
394 
395   FPOpActions
396     .scalarize(0)
397     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
398 
399   TrigActions
400     .scalarize(0)
401     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
402 
403   FDIVActions
404     .scalarize(0)
405     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
406 
407   getActionDefinitionsBuilder({G_FNEG, G_FABS})
408     .legalFor(FPTypesPK16)
409     .clampMaxNumElements(0, S16, 2)
410     .scalarize(0)
411     .clampScalar(0, S16, S64);
412 
413   if (ST.has16BitInsts()) {
414     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
415       .legalFor({S32, S64, S16})
416       .scalarize(0)
417       .clampScalar(0, S16, S64);
418   } else {
419     getActionDefinitionsBuilder(G_FSQRT)
420       .legalFor({S32, S64})
421       .scalarize(0)
422       .clampScalar(0, S32, S64);
423 
424     if (ST.hasFractBug()) {
425       getActionDefinitionsBuilder(G_FFLOOR)
426         .customFor({S64})
427         .legalFor({S32, S64})
428         .scalarize(0)
429         .clampScalar(0, S32, S64);
430     } else {
431       getActionDefinitionsBuilder(G_FFLOOR)
432         .legalFor({S32, S64})
433         .scalarize(0)
434         .clampScalar(0, S32, S64);
435     }
436   }
437 
438   getActionDefinitionsBuilder(G_FPTRUNC)
439     .legalFor({{S32, S64}, {S16, S32}})
440     .scalarize(0)
441     .lower();
442 
443   getActionDefinitionsBuilder(G_FPEXT)
444     .legalFor({{S64, S32}, {S32, S16}})
445     .lowerFor({{S64, S16}}) // FIXME: Implement
446     .scalarize(0);
447 
448   getActionDefinitionsBuilder(G_FSUB)
449       // Use actual fsub instruction
450       .legalFor({S32})
451       // Must use fadd + fneg
452       .lowerFor({S64, S16, V2S16})
453       .scalarize(0)
454       .clampScalar(0, S32, S64);
455 
456   // Whether this is legal depends on the floating point mode for the function.
457   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
458   if (ST.hasMadF16())
459     FMad.customFor({S32, S16});
460   else
461     FMad.customFor({S32});
462   FMad.scalarize(0)
463       .lower();
464 
465   // TODO: Do we need to clamp maximum bitwidth?
466   getActionDefinitionsBuilder(G_TRUNC)
467     .legalIf(isScalar(0))
468     .legalFor({{V2S16, V2S32}})
469     .clampMaxNumElements(0, S16, 2)
470     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
471     // situations (like an invalid implicit use), we don't want to infinite loop
472     // in the legalizer.
473     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
474     .alwaysLegal();
475 
476   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
477     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
478                {S32, S1}, {S64, S1}, {S16, S1}})
479     .scalarize(0)
480     .clampScalar(0, S32, S64)
481     .widenScalarToNextPow2(1, 32);
482 
483   // TODO: Split s1->s64 during regbankselect for VALU.
484   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
485     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
486     .lowerFor({{S32, S64}})
487     .lowerIf(typeIs(1, S1))
488     .customFor({{S64, S64}});
489   if (ST.has16BitInsts())
490     IToFP.legalFor({{S16, S16}});
491   IToFP.clampScalar(1, S32, S64)
492        .scalarize(0)
493        .widenScalarToNextPow2(1);
494 
495   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
496     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
497     .customFor({{S64, S64}});
498   if (ST.has16BitInsts())
499     FPToI.legalFor({{S16, S16}});
500   else
501     FPToI.minScalar(1, S32);
502 
503   FPToI.minScalar(0, S32)
504        .scalarize(0)
505        .lower();
506 
507   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
508     .scalarize(0)
509     .lower();
510 
511   if (ST.has16BitInsts()) {
512     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
513       .legalFor({S16, S32, S64})
514       .clampScalar(0, S16, S64)
515       .scalarize(0);
516   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
517     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
518       .legalFor({S32, S64})
519       .clampScalar(0, S32, S64)
520       .scalarize(0);
521   } else {
522     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
523       .legalFor({S32})
524       .customFor({S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   }
528 
529   // FIXME: Clamp offset operand.
530   getActionDefinitionsBuilder(G_PTR_ADD)
531     .legalIf(isPointer(0))
532     .scalarize(0);
533 
534   getActionDefinitionsBuilder(G_PTRMASK)
535     .legalIf(typeInSet(1, {S64, S32}))
536     .minScalar(1, S32)
537     .maxScalarIf(sizeIs(0, 32), 1, S32)
538     .maxScalarIf(sizeIs(0, 64), 1, S64)
539     .scalarize(0);
540 
541   auto &CmpBuilder =
542     getActionDefinitionsBuilder(G_ICMP)
543     // The compare output type differs based on the register bank of the output,
544     // so make both s1 and s32 legal.
545     //
546     // Scalar compares producing output in scc will be promoted to s32, as that
547     // is the allocatable register type that will be needed for the copy from
548     // scc. This will be promoted during RegBankSelect, and we assume something
549     // before that won't try to use s32 result types.
550     //
551     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
552     // bank.
553     .legalForCartesianProduct(
554       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
555     .legalForCartesianProduct(
556       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
557   if (ST.has16BitInsts()) {
558     CmpBuilder.legalFor({{S1, S16}});
559   }
560 
561   CmpBuilder
562     .widenScalarToNextPow2(1)
563     .clampScalar(1, S32, S64)
564     .scalarize(0)
565     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
566 
567   getActionDefinitionsBuilder(G_FCMP)
568     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
569     .widenScalarToNextPow2(1)
570     .clampScalar(1, S32, S64)
571     .scalarize(0);
572 
573   // FIXME: fpow has a selection pattern that should move to custom lowering.
574   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
575   if (ST.has16BitInsts())
576     Exp2Ops.legalFor({S32, S16});
577   else
578     Exp2Ops.legalFor({S32});
579   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
580   Exp2Ops.scalarize(0);
581 
582   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
583   if (ST.has16BitInsts())
584     ExpOps.customFor({{S32}, {S16}});
585   else
586     ExpOps.customFor({S32});
587   ExpOps.clampScalar(0, MinScalarFPTy, S32)
588         .scalarize(0);
589 
590   // The 64-bit versions produce 32-bit results, but only on the SALU.
591   getActionDefinitionsBuilder(G_CTPOP)
592     .legalFor({{S32, S32}, {S32, S64}})
593     .clampScalar(0, S32, S32)
594     .clampScalar(1, S32, S64)
595     .scalarize(0)
596     .widenScalarToNextPow2(0, 32)
597     .widenScalarToNextPow2(1, 32);
598 
599   // The hardware instructions return a different result on 0 than the generic
600   // instructions expect. The hardware produces -1, but these produce the
601   // bitwidth.
602   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
603     .scalarize(0)
604     .clampScalar(0, S32, S32)
605     .clampScalar(1, S32, S64)
606     .widenScalarToNextPow2(0, 32)
607     .widenScalarToNextPow2(1, 32)
608     .lower();
609 
610   // The 64-bit versions produce 32-bit results, but only on the SALU.
611   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
612     .legalFor({{S32, S32}, {S32, S64}})
613     .clampScalar(0, S32, S32)
614     .clampScalar(1, S32, S64)
615     .scalarize(0)
616     .widenScalarToNextPow2(0, 32)
617     .widenScalarToNextPow2(1, 32);
618 
619   getActionDefinitionsBuilder(G_BITREVERSE)
620     .legalFor({S32})
621     .clampScalar(0, S32, S32)
622     .scalarize(0);
623 
624   if (ST.has16BitInsts()) {
625     getActionDefinitionsBuilder(G_BSWAP)
626       .legalFor({S16, S32, V2S16})
627       .clampMaxNumElements(0, S16, 2)
628       // FIXME: Fixing non-power-of-2 before clamp is workaround for
629       // narrowScalar limitation.
630       .widenScalarToNextPow2(0)
631       .clampScalar(0, S16, S32)
632       .scalarize(0);
633 
634     if (ST.hasVOP3PInsts()) {
635       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
636         .legalFor({S32, S16, V2S16})
637         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
638         .clampMaxNumElements(0, S16, 2)
639         .minScalar(0, S16)
640         .widenScalarToNextPow2(0)
641         .scalarize(0)
642         .lower();
643     } else {
644       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
645         .legalFor({S32, S16})
646         .widenScalarToNextPow2(0)
647         .minScalar(0, S16)
648         .scalarize(0)
649         .lower();
650     }
651   } else {
652     // TODO: Should have same legality without v_perm_b32
653     getActionDefinitionsBuilder(G_BSWAP)
654       .legalFor({S32})
655       .lowerIf(scalarNarrowerThan(0, 32))
656       // FIXME: Fixing non-power-of-2 before clamp is workaround for
657       // narrowScalar limitation.
658       .widenScalarToNextPow2(0)
659       .maxScalar(0, S32)
660       .scalarize(0)
661       .lower();
662 
663     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
664       .legalFor({S32})
665       .minScalar(0, S32)
666       .widenScalarToNextPow2(0)
667       .scalarize(0)
668       .lower();
669   }
670 
671   getActionDefinitionsBuilder(G_INTTOPTR)
672     // List the common cases
673     .legalForCartesianProduct(AddrSpaces64, {S64})
674     .legalForCartesianProduct(AddrSpaces32, {S32})
675     .scalarize(0)
676     // Accept any address space as long as the size matches
677     .legalIf(sameSize(0, 1))
678     .widenScalarIf(smallerThan(1, 0),
679       [](const LegalityQuery &Query) {
680         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
681       })
682     .narrowScalarIf(largerThan(1, 0),
683       [](const LegalityQuery &Query) {
684         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
685       });
686 
687   getActionDefinitionsBuilder(G_PTRTOINT)
688     // List the common cases
689     .legalForCartesianProduct(AddrSpaces64, {S64})
690     .legalForCartesianProduct(AddrSpaces32, {S32})
691     .scalarize(0)
692     // Accept any address space as long as the size matches
693     .legalIf(sameSize(0, 1))
694     .widenScalarIf(smallerThan(0, 1),
695       [](const LegalityQuery &Query) {
696         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
697       })
698     .narrowScalarIf(
699       largerThan(0, 1),
700       [](const LegalityQuery &Query) {
701         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
702       });
703 
704   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
705     .scalarize(0)
706     .custom();
707 
708   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
709   // handle some operations by just promoting the register during
710   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
711   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
712     switch (AS) {
713     // FIXME: Private element size.
714     case AMDGPUAS::PRIVATE_ADDRESS:
715       return 32;
716     // FIXME: Check subtarget
717     case AMDGPUAS::LOCAL_ADDRESS:
718       return ST.useDS128() ? 128 : 64;
719 
720     // Treat constant and global as identical. SMRD loads are sometimes usable
721     // for global loads (ideally constant address space should be eliminated)
722     // depending on the context. Legality cannot be context dependent, but
723     // RegBankSelect can split the load as necessary depending on the pointer
724     // register bank/uniformity and if the memory is invariant or not written in
725     // a kernel.
726     case AMDGPUAS::CONSTANT_ADDRESS:
727     case AMDGPUAS::GLOBAL_ADDRESS:
728       return IsLoad ? 512 : 128;
729     default:
730       return 128;
731     }
732   };
733 
734   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
735                                     bool IsLoad) -> bool {
736     const LLT DstTy = Query.Types[0];
737 
738     // Split vector extloads.
739     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
740     unsigned Align = Query.MMODescrs[0].AlignInBits;
741 
742     if (MemSize < DstTy.getSizeInBits())
743       MemSize = std::max(MemSize, Align);
744 
745     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
746       return true;
747 
748     const LLT PtrTy = Query.Types[1];
749     unsigned AS = PtrTy.getAddressSpace();
750     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
751       return true;
752 
753     // Catch weird sized loads that don't evenly divide into the access sizes
754     // TODO: May be able to widen depending on alignment etc.
755     unsigned NumRegs = (MemSize + 31) / 32;
756     if (NumRegs == 3) {
757       if (!ST.hasDwordx3LoadStores())
758         return true;
759     } else {
760       // If the alignment allows, these should have been widened.
761       if (!isPowerOf2_32(NumRegs))
762         return true;
763     }
764 
765     if (Align < MemSize) {
766       const SITargetLowering *TLI = ST.getTargetLowering();
767       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
768     }
769 
770     return false;
771   };
772 
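  // Widen a load with a non-power-of-2 size when the known alignment covers
  // the rounded-up size; e.g. a 96-bit load aligned to 128 bits may be widened
  // to 128 bits (unless 96-bit dwordx3 accesses are available).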
773   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
774     unsigned Size = Query.Types[0].getSizeInBits();
775     if (isPowerOf2_32(Size))
776       return false;
777 
778     if (Size == 96 && ST.hasDwordx3LoadStores())
779       return false;
780 
781     unsigned AddrSpace = Query.Types[1].getAddressSpace();
782     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
783       return false;
784 
785     unsigned Align = Query.MMODescrs[0].AlignInBits;
786     unsigned RoundedSize = NextPowerOf2(Size);
787     return (Align >= RoundedSize);
788   };
789 
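  // An alignment requirement of 0 in the tables below means any alignment is
  // acceptable.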
790   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
791   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
792   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
793 
794   // TODO: Refine based on subtargets which support unaligned access or 128-bit
795   // LDS
796   // TODO: Unsupported flat for SI.
797 
798   for (unsigned Op : {G_LOAD, G_STORE}) {
799     const bool IsStore = Op == G_STORE;
800 
801     auto &Actions = getActionDefinitionsBuilder(Op);
802     // Whitelist the common cases.
803     // TODO: Loads to s16 on gfx9
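    // Each entry is {value type, pointer type, memory size in bits, minimum
    // alignment in bits}.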
804     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
805                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
806                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
807                                       {S128, GlobalPtr, 128, GlobalAlign32},
808                                       {S64, GlobalPtr, 64, GlobalAlign32},
809                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
810                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
811                                       {S32, GlobalPtr, 8, GlobalAlign8},
812                                       {S32, GlobalPtr, 16, GlobalAlign16},
813 
814                                       {S32, LocalPtr, 32, 32},
815                                       {S64, LocalPtr, 64, 32},
816                                       {V2S32, LocalPtr, 64, 32},
817                                       {S32, LocalPtr, 8, 8},
818                                       {S32, LocalPtr, 16, 16},
819                                       {V2S16, LocalPtr, 32, 32},
820 
821                                       {S32, PrivatePtr, 32, 32},
822                                       {S32, PrivatePtr, 8, 8},
823                                       {S32, PrivatePtr, 16, 16},
824                                       {V2S16, PrivatePtr, 32, 32},
825 
826                                       {S32, FlatPtr, 32, GlobalAlign32},
827                                       {S32, FlatPtr, 16, GlobalAlign16},
828                                       {S32, FlatPtr, 8, GlobalAlign8},
829                                       {V2S16, FlatPtr, 32, GlobalAlign32},
830 
831                                       {S32, ConstantPtr, 32, GlobalAlign32},
832                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
833                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
834                                       {S64, ConstantPtr, 64, GlobalAlign32},
835                                       {S128, ConstantPtr, 128, GlobalAlign32},
836                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
837     Actions
838         .customIf(typeIs(1, Constant32Ptr))
839         // Widen suitably aligned loads by loading extra elements.
840         .moreElementsIf([=](const LegalityQuery &Query) {
841             const LLT Ty = Query.Types[0];
842             return Op == G_LOAD && Ty.isVector() &&
843                    shouldWidenLoadResult(Query);
844           }, moreElementsToNextPow2(0))
845         .widenScalarIf([=](const LegalityQuery &Query) {
846             const LLT Ty = Query.Types[0];
847             return Op == G_LOAD && !Ty.isVector() &&
848                    shouldWidenLoadResult(Query);
849           }, widenScalarOrEltToNextPow2(0))
850         .narrowScalarIf(
851             [=](const LegalityQuery &Query) -> bool {
852               return !Query.Types[0].isVector() &&
853                      needToSplitMemOp(Query, Op == G_LOAD);
854             },
855             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
856               const LLT DstTy = Query.Types[0];
857               const LLT PtrTy = Query.Types[1];
858 
859               const unsigned DstSize = DstTy.getSizeInBits();
860               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
861 
862               // Split extloads.
863               if (DstSize > MemSize)
864                 return std::make_pair(0, LLT::scalar(MemSize));
865 
866               if (!isPowerOf2_32(DstSize)) {
867                 // We're probably decomposing an odd sized store. Try to split
868                 // to the widest type. TODO: Account for alignment. As-is it
869                 // should be OK, since the new parts will be further legalized.
870                 unsigned FloorSize = PowerOf2Floor(DstSize);
871                 return std::make_pair(0, LLT::scalar(FloorSize));
872               }
873 
874               if (DstSize > 32 && (DstSize % 32 != 0)) {
875                 // FIXME: Need a way to specify non-extload of larger size if
876                 // suitably aligned.
877                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
878               }
879 
880               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
881                                                      Op == G_LOAD);
882               if (MemSize > MaxSize)
883                 return std::make_pair(0, LLT::scalar(MaxSize));
884 
885               unsigned Align = Query.MMODescrs[0].AlignInBits;
886               return std::make_pair(0, LLT::scalar(Align));
887             })
888         .fewerElementsIf(
889             [=](const LegalityQuery &Query) -> bool {
890               return Query.Types[0].isVector() &&
891                      needToSplitMemOp(Query, Op == G_LOAD);
892             },
893             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
894               const LLT DstTy = Query.Types[0];
895               const LLT PtrTy = Query.Types[1];
896 
897               LLT EltTy = DstTy.getElementType();
898               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
899                                                      Op == G_LOAD);
900 
901               // FIXME: Handle widened to power of 2 results better. This ends
902               // up scalarizing.
903               // FIXME: 3 element stores scalarized on SI
904 
905               // Split if it's too large for the address space.
906               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
907                 unsigned NumElts = DstTy.getNumElements();
908                 unsigned EltSize = EltTy.getSizeInBits();
909 
910                 if (MaxSize % EltSize == 0) {
911                   return std::make_pair(
912                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
913                 }
914 
915                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
916 
917                 // FIXME: Refine when odd breakdowns handled
918                 // The scalars will need to be re-legalized.
919                 if (NumPieces == 1 || NumPieces >= NumElts ||
920                     NumElts % NumPieces != 0)
921                   return std::make_pair(0, EltTy);
922 
923                 return std::make_pair(0,
924                                       LLT::vector(NumElts / NumPieces, EltTy));
925               }
926 
927               // FIXME: We could probably handle weird extending loads better.
928               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
929               if (DstTy.getSizeInBits() > MemSize)
930                 return std::make_pair(0, EltTy);
931 
932               unsigned EltSize = EltTy.getSizeInBits();
933               unsigned DstSize = DstTy.getSizeInBits();
934               if (!isPowerOf2_32(DstSize)) {
935                 // We're probably decomposing an odd sized store. Try to split
936                 // to the widest type. TODO: Account for alignment. As-is it
937                 // should be OK, since the new parts will be further legalized.
938                 unsigned FloorSize = PowerOf2Floor(DstSize);
939                 return std::make_pair(
940                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
941               }
942 
943               // Need to split because of alignment.
944               unsigned Align = Query.MMODescrs[0].AlignInBits;
945               if (EltSize > Align &&
946                   (EltSize / Align < DstTy.getNumElements())) {
947                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
948               }
949 
950               // May need relegalization for the scalars.
951               return std::make_pair(0, EltTy);
952             })
953         .minScalar(0, S32);
954 
955     if (IsStore)
956       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
957 
958     // TODO: Need a bitcast lower option?
959     Actions
960         .legalIf([=](const LegalityQuery &Query) {
961           const LLT Ty0 = Query.Types[0];
962           unsigned Size = Ty0.getSizeInBits();
963           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
964           unsigned Align = Query.MMODescrs[0].AlignInBits;
965 
966           // FIXME: Widening store from alignment not valid.
967           if (MemSize < Size)
968             MemSize = std::max(MemSize, Align);
969 
970           // No extending vector loads.
971           if (Size > MemSize && Ty0.isVector())
972             return false;
973 
974           switch (MemSize) {
975           case 8:
976           case 16:
977             return Size == 32;
978           case 32:
979           case 64:
980           case 128:
981             return true;
982           case 96:
983             return ST.hasDwordx3LoadStores();
984           case 256:
985           case 512:
986             return true;
987           default:
988             return false;
989           }
990         })
991         .widenScalarToNextPow2(0)
992         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
993   }
994 
995   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
996                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
997                                                   {S32, GlobalPtr, 16, 2 * 8},
998                                                   {S32, LocalPtr, 8, 8},
999                                                   {S32, LocalPtr, 16, 16},
1000                                                   {S32, PrivatePtr, 8, 8},
1001                                                   {S32, PrivatePtr, 16, 16},
1002                                                   {S32, ConstantPtr, 8, 8},
1003                                                   {S32, ConstantPtr, 16, 2 * 8}});
1004   if (ST.hasFlatAddressSpace()) {
1005     ExtLoads.legalForTypesWithMemDesc(
1006         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1007   }
1008 
1009   ExtLoads.clampScalar(0, S32, S32)
1010           .widenScalarToNextPow2(0)
1011           .unsupportedIfMemSizeNotPow2()
1012           .lower();
1013 
1014   auto &Atomics = getActionDefinitionsBuilder(
1015     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1016      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1017      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1018      G_ATOMICRMW_UMIN})
1019     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1020                {S64, GlobalPtr}, {S64, LocalPtr}});
1021   if (ST.hasFlatAddressSpace()) {
1022     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1023   }
1024 
1025   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1026     .legalFor({{S32, LocalPtr}});
1027 
1028   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1029   // demarshalling
1030   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1031     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1032                 {S32, FlatPtr}, {S64, FlatPtr}})
1033     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1034                {S32, RegionPtr}, {S64, RegionPtr}});
1035   // TODO: Pointer types, any 32-bit or 64-bit vector
1036 
1037   // Condition should be s32 for scalar, s1 for vector.
1038   getActionDefinitionsBuilder(G_SELECT)
1039     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1040           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1041           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1042     .clampScalar(0, S16, S64)
1043     .scalarize(1)
1044     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1045     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1046     .clampMaxNumElements(0, S32, 2)
1047     .clampMaxNumElements(0, LocalPtr, 2)
1048     .clampMaxNumElements(0, PrivatePtr, 2)
1049     .scalarize(0)
1050     .widenScalarToNextPow2(0)
1051     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1052 
1053   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1054   // be more flexible with the shift amount type.
1055   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1056     .legalFor({{S32, S32}, {S64, S32}});
1057   if (ST.has16BitInsts()) {
1058     if (ST.hasVOP3PInsts()) {
1059       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1060             .clampMaxNumElements(0, S16, 2);
1061     } else
1062       Shifts.legalFor({{S16, S16}});
1063 
1064     // TODO: Support 16-bit shift amounts for all types
1065     Shifts.widenScalarIf(
1066       [=](const LegalityQuery &Query) {
1067         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1068         // 32-bit amount.
1069         const LLT ValTy = Query.Types[0];
1070         const LLT AmountTy = Query.Types[1];
1071         return ValTy.getSizeInBits() <= 16 &&
1072                AmountTy.getSizeInBits() < 16;
1073       }, changeTo(1, S16));
1074     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1075     Shifts.clampScalar(1, S32, S32);
1076     Shifts.clampScalar(0, S16, S64);
1077     Shifts.widenScalarToNextPow2(0, 16);
1078   } else {
1079     // Make sure we legalize the shift amount type first, as the general
1080     // expansion for the shifted type will produce much worse code if it hasn't
1081     // been truncated already.
1082     Shifts.clampScalar(1, S32, S32);
1083     Shifts.clampScalar(0, S32, S64);
1084     Shifts.widenScalarToNextPow2(0, 32);
1085   }
1086   Shifts.scalarize(0);
1087 
1088   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1089     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1090     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1091     unsigned IdxTypeIdx = 2;
1092 
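    // Use custom lowering when the element type is 16 bits or a multiple of
    // 32 bits, the vector size is a dword multiple of at most 1024 bits, and
    // the index is s32.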
1093     getActionDefinitionsBuilder(Op)
1094       .customIf([=](const LegalityQuery &Query) {
1095           const LLT EltTy = Query.Types[EltTypeIdx];
1096           const LLT VecTy = Query.Types[VecTypeIdx];
1097           const LLT IdxTy = Query.Types[IdxTypeIdx];
1098           return (EltTy.getSizeInBits() == 16 ||
1099                   EltTy.getSizeInBits() % 32 == 0) &&
1100                  VecTy.getSizeInBits() % 32 == 0 &&
1101                  VecTy.getSizeInBits() <= 1024 &&
1102                  IdxTy.getSizeInBits() == 32;
1103         })
1104       .clampScalar(EltTypeIdx, S32, S64)
1105       .clampScalar(VecTypeIdx, S32, S64)
1106       .clampScalar(IdxTypeIdx, S32, S32);
1107   }
1108 
1109   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1110     .unsupportedIf([=](const LegalityQuery &Query) {
1111         const LLT &EltTy = Query.Types[1].getElementType();
1112         return Query.Types[0] != EltTy;
1113       });
1114 
1115   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1116     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1117     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1118 
1119     // FIXME: Doesn't handle extract of illegal sizes.
1120     getActionDefinitionsBuilder(Op)
1121       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1122       // FIXME: Multiples of 16 should not be legal.
1123       .legalIf([=](const LegalityQuery &Query) {
1124           const LLT BigTy = Query.Types[BigTyIdx];
1125           const LLT LitTy = Query.Types[LitTyIdx];
1126           return (BigTy.getSizeInBits() % 32 == 0) &&
1127                  (LitTy.getSizeInBits() % 16 == 0);
1128         })
1129       .widenScalarIf(
1130         [=](const LegalityQuery &Query) {
1131           const LLT BigTy = Query.Types[BigTyIdx];
1132           return (BigTy.getScalarSizeInBits() < 16);
1133         },
1134         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1135       .widenScalarIf(
1136         [=](const LegalityQuery &Query) {
1137           const LLT LitTy = Query.Types[LitTyIdx];
1138           return (LitTy.getScalarSizeInBits() < 16);
1139         },
1140         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1141       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1142       .widenScalarToNextPow2(BigTyIdx, 32);
1143 
1144   }
1145 
1146   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1147     .legalForCartesianProduct(AllS32Vectors, {S32})
1148     .legalForCartesianProduct(AllS64Vectors, {S64})
1149     .clampNumElements(0, V16S32, V32S32)
1150     .clampNumElements(0, V2S64, V16S64)
1151     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1152 
1153   if (ST.hasScalarPackInsts()) {
1154     BuildVector
1155       // FIXME: Should probably widen s1 vectors straight to s32
1156       .minScalarOrElt(0, S16)
1157       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1158       .minScalar(1, S32);
1159 
1160     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1161       .legalFor({V2S16, S32})
1162       .lower();
1163     BuildVector.minScalarOrElt(0, S32);
1164   } else {
1165     BuildVector.customFor({V2S16, S16});
1166     BuildVector.minScalarOrElt(0, S32);
1167 
1168     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1169       .customFor({V2S16, S32})
1170       .lower();
1171   }
1172 
1173   BuildVector.legalIf(isRegisterType(0));
1174 
1175   // FIXME: Clamp maximum size
1176   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1177     .legalIf(isRegisterType(0));
1178 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1180   // pre-legalize.
1181   if (ST.hasVOP3PInsts()) {
1182     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1183       .customFor({V2S16, V2S16})
1184       .lower();
1185   } else
1186     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1187 
1188   // Merge/Unmerge
1189   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1190     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1191     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1192 
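    // Vector element sizes that are not a power of 2 in the range [8, 512]
    // bits are treated as invalid here and scalarized below.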
1193     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1194       const LLT Ty = Query.Types[TypeIdx];
1195       if (Ty.isVector()) {
1196         const LLT &EltTy = Ty.getElementType();
1197         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1198           return true;
1199         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1200           return true;
1201       }
1202       return false;
1203     };
1204 
1205     auto &Builder = getActionDefinitionsBuilder(Op)
1206       .lowerFor({{S16, V2S16}})
1207       .lowerIf([=](const LegalityQuery &Query) {
1208           const LLT BigTy = Query.Types[BigTyIdx];
1209           return BigTy.getSizeInBits() == 32;
1210         })
1211       // Try to widen to s16 first for small types.
1212       // TODO: Only do this on targets with legal s16 shifts
1213       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1214       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1215       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1216       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1217                            elementTypeIs(1, S16)),
1218                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1220       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1221       // valid.
1222       .clampScalar(LitTyIdx, S32, S512)
1223       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1224       // Break up vectors with weird elements into scalars
1225       .fewerElementsIf(
1226         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1227         scalarize(0))
1228       .fewerElementsIf(
1229         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1230         scalarize(1))
1231       .clampScalar(BigTyIdx, S32, S1024);
1232 
1233     if (Op == G_MERGE_VALUES) {
1234       Builder.widenScalarIf(
1235         // TODO: Use 16-bit shifts if legal for 8-bit values?
1236         [=](const LegalityQuery &Query) {
1237           const LLT Ty = Query.Types[LitTyIdx];
1238           return Ty.getSizeInBits() < 32;
1239         },
1240         changeTo(LitTyIdx, S32));
1241     }
1242 
1243     Builder.widenScalarIf(
1244       [=](const LegalityQuery &Query) {
1245         const LLT Ty = Query.Types[BigTyIdx];
1246         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1247           Ty.getSizeInBits() % 16 != 0;
1248       },
1249       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
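        // For example, s65 is widened to s128, while s257 is widened to s320
        // rather than s512.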
1252         const LLT &Ty = Query.Types[BigTyIdx];
1253         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1254         if (NewSizeInBits >= 256) {
1255           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1256           if (RoundedTo < NewSizeInBits)
1257             NewSizeInBits = RoundedTo;
1258         }
1259         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1260       })
1261       .legalIf([=](const LegalityQuery &Query) {
1262           const LLT &BigTy = Query.Types[BigTyIdx];
1263           const LLT &LitTy = Query.Types[LitTyIdx];
1264 
1265           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1266             return false;
1267           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1268             return false;
1269 
1270           return BigTy.getSizeInBits() % 16 == 0 &&
1271                  LitTy.getSizeInBits() % 16 == 0 &&
1272                  BigTy.getSizeInBits() <= 1024;
1273         })
1274       // Any vectors left are the wrong size. Scalarize them.
1275       .scalarize(0)
1276       .scalarize(1);
1277   }
1278 
1279   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1280   // RegBankSelect.
1281   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1282     .legalFor({{S32}, {S64}});
1283 
1284   if (ST.hasVOP3PInsts()) {
1285     SextInReg.lowerFor({{V2S16}})
1286       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1287       // get more vector shift opportunities, since we'll get those when
1288       // expanded.
1289       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1290   } else if (ST.has16BitInsts()) {
1291     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1292   } else {
1293     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1295     SextInReg.lowerFor({{S32}, {S64}});
1296   }
1297 
1298   SextInReg
1299     .scalarize(0)
1300     .clampScalar(0, S32, S64)
1301     .lower();
1302 
1303   getActionDefinitionsBuilder(G_FSHR)
1304     .legalFor({{S32, S32}})
1305     .scalarize(0)
1306     .lower();
1307 
1308   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1309     .legalFor({S64});
1310 
1311   getActionDefinitionsBuilder({
1312       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1313       G_FCOPYSIGN,
1314 
1315       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1316       G_READ_REGISTER,
1317       G_WRITE_REGISTER,
1318 
1319       G_SADDO, G_SSUBO,
1320 
1321        // TODO: Implement
1322       G_FMINIMUM, G_FMAXIMUM,
1323       G_FSHL
1324     }).lower();
1325 
1326   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1327         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1328         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1329     .unsupported();
1330 
1331   computeTables();
1332   verify(*ST.getInstrInfo());
1333 }
1334 
1335 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1336                                          MachineRegisterInfo &MRI,
1337                                          MachineIRBuilder &B,
1338                                          GISelChangeObserver &Observer) const {
1339   switch (MI.getOpcode()) {
1340   case TargetOpcode::G_ADDRSPACE_CAST:
1341     return legalizeAddrSpaceCast(MI, MRI, B);
1342   case TargetOpcode::G_FRINT:
1343     return legalizeFrint(MI, MRI, B);
1344   case TargetOpcode::G_FCEIL:
1345     return legalizeFceil(MI, MRI, B);
1346   case TargetOpcode::G_INTRINSIC_TRUNC:
1347     return legalizeIntrinsicTrunc(MI, MRI, B);
1348   case TargetOpcode::G_SITOFP:
1349     return legalizeITOFP(MI, MRI, B, true);
1350   case TargetOpcode::G_UITOFP:
1351     return legalizeITOFP(MI, MRI, B, false);
1352   case TargetOpcode::G_FPTOSI:
1353     return legalizeFPTOI(MI, MRI, B, true);
1354   case TargetOpcode::G_FPTOUI:
1355     return legalizeFPTOI(MI, MRI, B, false);
1356   case TargetOpcode::G_FMINNUM:
1357   case TargetOpcode::G_FMAXNUM:
1358   case TargetOpcode::G_FMINNUM_IEEE:
1359   case TargetOpcode::G_FMAXNUM_IEEE:
1360     return legalizeMinNumMaxNum(MI, MRI, B);
1361   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1362     return legalizeExtractVectorElt(MI, MRI, B);
1363   case TargetOpcode::G_INSERT_VECTOR_ELT:
1364     return legalizeInsertVectorElt(MI, MRI, B);
1365   case TargetOpcode::G_SHUFFLE_VECTOR:
1366     return legalizeShuffleVector(MI, MRI, B);
1367   case TargetOpcode::G_FSIN:
1368   case TargetOpcode::G_FCOS:
1369     return legalizeSinCos(MI, MRI, B);
1370   case TargetOpcode::G_GLOBAL_VALUE:
1371     return legalizeGlobalValue(MI, MRI, B);
1372   case TargetOpcode::G_LOAD:
1373     return legalizeLoad(MI, MRI, B, Observer);
1374   case TargetOpcode::G_FMAD:
1375     return legalizeFMad(MI, MRI, B);
1376   case TargetOpcode::G_FDIV:
1377     return legalizeFDIV(MI, MRI, B);
1378   case TargetOpcode::G_UDIV:
1379   case TargetOpcode::G_UREM:
1380     return legalizeUDIV_UREM(MI, MRI, B);
1381   case TargetOpcode::G_SDIV:
1382   case TargetOpcode::G_SREM:
1383     return legalizeSDIV_SREM(MI, MRI, B);
1384   case TargetOpcode::G_ATOMIC_CMPXCHG:
1385     return legalizeAtomicCmpXChg(MI, MRI, B);
1386   case TargetOpcode::G_FLOG:
1387     return legalizeFlog(MI, B, numbers::ln2f);
1388   case TargetOpcode::G_FLOG10:
1389     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1390   case TargetOpcode::G_FEXP:
1391     return legalizeFExp(MI, B);
1392   case TargetOpcode::G_FPOW:
1393     return legalizeFPow(MI, B);
1394   case TargetOpcode::G_FFLOOR:
1395     return legalizeFFloor(MI, MRI, B);
1396   case TargetOpcode::G_BUILD_VECTOR:
1397     return legalizeBuildVector(MI, MRI, B);
1398   default:
1399     return false;
1400   }
1401 
1402   llvm_unreachable("expected switch to return");
1403 }
1404 
1405 Register AMDGPULegalizerInfo::getSegmentAperture(
1406   unsigned AS,
1407   MachineRegisterInfo &MRI,
1408   MachineIRBuilder &B) const {
1409   MachineFunction &MF = B.getMF();
1410   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1411   const LLT S32 = LLT::scalar(32);
1412 
1413   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1414 
1415   if (ST.hasApertureRegs()) {
1416     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1417     // getreg.
1418     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1419         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1420         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1421     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1422         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1423         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1424     unsigned Encoding =
1425         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1426         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1427         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1428 
1429     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1430 
1431     B.buildInstr(AMDGPU::S_GETREG_B32)
1432       .addDef(GetReg)
1433       .addImm(Encoding);
1434     MRI.setType(GetReg, S32);
1435 
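    // The aperture base is read as a bitfield of the MEM_BASES hardware
    // register; shifting the result left by the field width (WidthM1 + 1)
    // reconstructs the high 32 bits of the 64-bit aperture address.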
1436     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1437     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1438   }
1439 
1440   Register QueuePtr = MRI.createGenericVirtualRegister(
1441     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1442 
1443   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1444   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1445     return Register();
1446 
1447   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1448   // private_segment_aperture_base_hi.
1449   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1450 
1451   // TODO: can we be smarter about machine pointer info?
1452   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1453   MachineMemOperand *MMO = MF.getMachineMemOperand(
1454       PtrInfo,
1455       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1456           MachineMemOperand::MOInvariant,
1457       4, commonAlignment(Align(64), StructOffset));
1458 
1459   Register LoadAddr;
1460 
1461   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1462   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1463 }
1464 
1465 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1466   MachineInstr &MI, MachineRegisterInfo &MRI,
1467   MachineIRBuilder &B) const {
1468   MachineFunction &MF = B.getMF();
1469 
1470   B.setInstr(MI);
1471 
1472   const LLT S32 = LLT::scalar(32);
1473   Register Dst = MI.getOperand(0).getReg();
1474   Register Src = MI.getOperand(1).getReg();
1475 
1476   LLT DstTy = MRI.getType(Dst);
1477   LLT SrcTy = MRI.getType(Src);
1478   unsigned DestAS = DstTy.getAddressSpace();
1479   unsigned SrcAS = SrcTy.getAddressSpace();
1480 
1481   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1482   // vector element.
1483   assert(!DstTy.isVector());
1484 
1485   const AMDGPUTargetMachine &TM
1486     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1487 
1488   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1489   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1490     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1491     return true;
1492   }
1493 
1494   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1495     // Truncate.
1496     B.buildExtract(Dst, Src, 0);
1497     MI.eraseFromParent();
1498     return true;
1499   }
1500 
1501   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1502     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1503     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1504 
    // FIXME: This is a bit ugly: it merges two 32-bit pointer operands into a
    // pointer of a different (64-bit) type. Merge operands are required to be
    // the same type, but creating an extra ptrtoint here would be kind of
    // pointless.
1508     auto HighAddr = B.buildConstant(
1509       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1510     B.buildMerge(Dst, {Src, HighAddr});
1511     MI.eraseFromParent();
1512     return true;
1513   }
1514 
1515   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1516     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1517            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1518     unsigned NullVal = TM.getNullPointerValue(DestAS);
1519 
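    // A flat pointer equal to the flat null value must become the segment
    // null value; any other flat pointer keeps its low 32 bits as the segment
    // offset.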
1520     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1521     auto FlatNull = B.buildConstant(SrcTy, 0);
1522 
1523     // Extract low 32-bits of the pointer.
1524     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1525 
1526     auto CmpRes =
1527         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1528     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1529 
1530     MI.eraseFromParent();
1531     return true;
1532   }
1533 
1534   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1535     return false;
1536 
1537   if (!ST.hasFlatAddressSpace())
1538     return false;
1539 
1540   auto SegmentNull =
1541       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1542   auto FlatNull =
1543       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1544 
1545   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1546   if (!ApertureReg.isValid())
1547     return false;
1548 
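  // Compare against the segment null value so that a null segment pointer is
  // mapped to the flat null value rather than to the aperture base.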
1549   auto CmpRes =
1550       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1551 
1552   // Coerce the type of the low half of the result so we can use merge_values.
1553   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1554 
1555   // TODO: Should we allow mismatched types but matching sizes in merges to
1556   // avoid the ptrtoint?
1557   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1558   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1559 
1560   MI.eraseFromParent();
1561   return true;
1562 }
1563 
1564 bool AMDGPULegalizerInfo::legalizeFrint(
1565   MachineInstr &MI, MachineRegisterInfo &MRI,
1566   MachineIRBuilder &B) const {
1567   B.setInstr(MI);
1568 
1569   Register Src = MI.getOperand(1).getReg();
1570   LLT Ty = MRI.getType(Src);
1571   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1572 
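  // Round to the nearest integer by adding and then subtracting 2^52 with the
  // sign of the input; the addition pushes the fraction bits out of the
  // mantissa so they are rounded away. Inputs whose magnitude is already at
  // least ~2^52 are integers and are passed through by the final select.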
1573   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1574   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1575 
1576   auto C1 = B.buildFConstant(Ty, C1Val);
1577   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1578 
1579   // TODO: Should this propagate fast-math-flags?
1580   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1581   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1582 
1583   auto C2 = B.buildFConstant(Ty, C2Val);
1584   auto Fabs = B.buildFAbs(Ty, Src);
1585 
1586   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1589 }
1590 
1591 bool AMDGPULegalizerInfo::legalizeFceil(
1592   MachineInstr &MI, MachineRegisterInfo &MRI,
1593   MachineIRBuilder &B) const {
1594   B.setInstr(MI);
1595 
1596   const LLT S1 = LLT::scalar(1);
1597   const LLT S64 = LLT::scalar(64);
1598 
1599   Register Src = MI.getOperand(1).getReg();
1600   assert(MRI.getType(Src) == S64);
1601 
1602   // result = trunc(src)
1603   // if (src > 0.0 && src != result)
1604   //   result += 1.0
1605 
1606   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1607 
1608   const auto Zero = B.buildFConstant(S64, 0.0);
1609   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1613   auto Add = B.buildSelect(S64, And, One, Zero);
1614 
1615   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1618 }
1619 
1620 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1621                                               MachineIRBuilder &B) {
1622   const unsigned FractBits = 52;
1623   const unsigned ExpBits = 11;
1624   LLT S32 = LLT::scalar(32);
1625 
1626   auto Const0 = B.buildConstant(S32, FractBits - 32);
1627   auto Const1 = B.buildConstant(S32, ExpBits);
1628 
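  // Use ubfe to extract the 11-bit biased exponent, which starts at bit 20 of
  // the high word of the double, then subtract the exponent bias (1023).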
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1632 
1633   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1634 }
1635 
1636 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1637   MachineInstr &MI, MachineRegisterInfo &MRI,
1638   MachineIRBuilder &B) const {
1639   B.setInstr(MI);
1640 
1641   const LLT S1 = LLT::scalar(1);
1642   const LLT S32 = LLT::scalar(32);
1643   const LLT S64 = LLT::scalar(64);
1644 
1645   Register Src = MI.getOperand(1).getReg();
1646   assert(MRI.getType(Src) == S64);
1647 
1648   // TODO: Should this use extract since the low half is unused?
1649   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1650   Register Hi = Unmerge.getReg(1);
1651 
1652   // Extract the upper half, since this is where we will find the sign and
1653   // exponent.
1654   auto Exp = extractF64Exponent(Hi, B);
1655 
1656   const unsigned FractBits = 52;
1657 
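  // Truncation clears the fraction bits below the binary point. If the
  // exponent is negative the magnitude is less than 1, so the result is just
  // the sign bit (i.e. +/-0); if the exponent is greater than 51 there are no
  // fraction bits left and the input is returned unchanged.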
1658   // Extract the sign bit.
1659   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1660   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1661 
1662   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1663 
1664   const auto Zero32 = B.buildConstant(S32, 0);
1665 
1666   // Extend back to 64-bits.
1667   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1668 
1669   auto Shr = B.buildAShr(S64, FractMask, Exp);
1670   auto Not = B.buildNot(S64, Shr);
1671   auto Tmp0 = B.buildAnd(S64, Src, Not);
1672   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1673 
1674   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1675   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1676 
1677   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1680 }
1681 
1682 bool AMDGPULegalizerInfo::legalizeITOFP(
1683   MachineInstr &MI, MachineRegisterInfo &MRI,
1684   MachineIRBuilder &B, bool Signed) const {
1685   B.setInstr(MI);
1686 
1687   Register Dst = MI.getOperand(0).getReg();
1688   Register Src = MI.getOperand(1).getReg();
1689 
1690   const LLT S64 = LLT::scalar(64);
1691   const LLT S32 = LLT::scalar(32);
1692 
1693   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1694 
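  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 with ldexp and then added to the
  // unsigned conversion of the low half.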
1695   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1696 
1697   auto CvtHi = Signed ?
1698     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1699     B.buildUITOFP(S64, Unmerge.getReg(1));
1700 
1701   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1702 
1703   auto ThirtyTwo = B.buildConstant(S32, 32);
1704   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1705     .addUse(CvtHi.getReg(0))
1706     .addUse(ThirtyTwo.getReg(0));
1707 
1708   // TODO: Should this propagate fast-math-flags?
1709   B.buildFAdd(Dst, LdExp, CvtLo);
1710   MI.eraseFromParent();
1711   return true;
1712 }
1713 
1714 // TODO: Copied from DAG implementation. Verify logic and document how this
1715 // actually works.
1716 bool AMDGPULegalizerInfo::legalizeFPTOI(
1717   MachineInstr &MI, MachineRegisterInfo &MRI,
1718   MachineIRBuilder &B, bool Signed) const {
1719   B.setInstr(MI);
1720 
1721   Register Dst = MI.getOperand(0).getReg();
1722   Register Src = MI.getOperand(1).getReg();
1723 
1724   const LLT S64 = LLT::scalar(64);
1725   const LLT S32 = LLT::scalar(32);
1726 
1727   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1728 
1729   unsigned Flags = MI.getFlags();
1730 
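  // K0 = 2^-32 and K1 = -2^32. The high half of the result is
  // floor(trunc(x) * 2^-32); the fma then reconstructs the remaining low part
  // as trunc(x) - hi * 2^32 before it is converted to an unsigned integer.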
1731   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1732   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1733   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1734 
1735   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1736   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1737   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1738 
1739   auto Hi = Signed ?
1740     B.buildFPTOSI(S32, FloorMul) :
1741     B.buildFPTOUI(S32, FloorMul);
1742   auto Lo = B.buildFPTOUI(S32, Fma);
1743 
1744   B.buildMerge(Dst, { Lo, Hi });
1745   MI.eraseFromParent();
1746 
1747   return true;
1748 }
1749 
1750 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1751   MachineInstr &MI, MachineRegisterInfo &MRI,
1752   MachineIRBuilder &B) const {
1753   MachineFunction &MF = B.getMF();
1754   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1755 
1756   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1757                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1758 
  // With ieee_mode disabled, the hardware instructions already have the
  // correct behavior for G_FMINNUM/G_FMAXNUM.
1761   if (!MFI->getMode().IEEE)
1762     return !IsIEEEOp;
1763 
1764   if (IsIEEEOp)
1765     return true;
1766 
1767   MachineIRBuilder HelperBuilder(MI);
1768   GISelObserverWrapper DummyObserver;
1769   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1770   HelperBuilder.setInstr(MI);
1771   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1772 }
1773 
1774 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1775   MachineInstr &MI, MachineRegisterInfo &MRI,
1776   MachineIRBuilder &B) const {
1777   // TODO: Should move some of this into LegalizerHelper.
1778 
1779   // TODO: Promote dynamic indexing of s16 to s32
1780 
1781   // FIXME: Artifact combiner probably should have replaced the truncated
1782   // constant before this, so we shouldn't need
1783   // getConstantVRegValWithLookThrough.
1784   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1785     MI.getOperand(2).getReg(), MRI);
1786   if (!IdxVal) // Dynamic case will be selected to register indexing.
1787     return true;
1788 
1789   Register Dst = MI.getOperand(0).getReg();
1790   Register Vec = MI.getOperand(1).getReg();
1791 
1792   LLT VecTy = MRI.getType(Vec);
1793   LLT EltTy = VecTy.getElementType();
1794   assert(EltTy == MRI.getType(Dst));
1795 
1796   B.setInstr(MI);
1797 
1798   if (IdxVal->Value < VecTy.getNumElements())
1799     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1800   else
1801     B.buildUndef(Dst);
1802 
1803   MI.eraseFromParent();
1804   return true;
1805 }
1806 
1807 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1808   MachineInstr &MI, MachineRegisterInfo &MRI,
1809   MachineIRBuilder &B) const {
1810   // TODO: Should move some of this into LegalizerHelper.
1811 
1812   // TODO: Promote dynamic indexing of s16 to s32
1813 
1814   // FIXME: Artifact combiner probably should have replaced the truncated
1815   // constant before this, so we shouldn't need
1816   // getConstantVRegValWithLookThrough.
1817   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1818     MI.getOperand(3).getReg(), MRI);
1819   if (!IdxVal) // Dynamic case will be selected to register indexing.
1820     return true;
1821 
1822   Register Dst = MI.getOperand(0).getReg();
1823   Register Vec = MI.getOperand(1).getReg();
1824   Register Ins = MI.getOperand(2).getReg();
1825 
1826   LLT VecTy = MRI.getType(Vec);
1827   LLT EltTy = VecTy.getElementType();
1828   assert(EltTy == MRI.getType(Ins));
1829 
1830   B.setInstr(MI);
1831 
1832   if (IdxVal->Value < VecTy.getNumElements())
1833     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1834   else
1835     B.buildUndef(Dst);
1836 
1837   MI.eraseFromParent();
1838   return true;
1839 }
1840 
1841 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1842   MachineInstr &MI, MachineRegisterInfo &MRI,
1843   MachineIRBuilder &B) const {
1844   const LLT V2S16 = LLT::vector(2, 16);
1845 
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src0 = MI.getOperand(1).getReg();
1848   LLT DstTy = MRI.getType(Dst);
1849   LLT SrcTy = MRI.getType(Src0);
1850 
1851   if (SrcTy == V2S16 && DstTy == V2S16 &&
1852       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1853     return true;
1854 
1855   MachineIRBuilder HelperBuilder(MI);
1856   GISelObserverWrapper DummyObserver;
1857   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1858   HelperBuilder.setInstr(MI);
1859   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1860 }
1861 
1862 bool AMDGPULegalizerInfo::legalizeSinCos(
1863   MachineInstr &MI, MachineRegisterInfo &MRI,
1864   MachineIRBuilder &B) const {
1865   B.setInstr(MI);
1866 
1867   Register DstReg = MI.getOperand(0).getReg();
1868   Register SrcReg = MI.getOperand(1).getReg();
1869   LLT Ty = MRI.getType(DstReg);
1870   unsigned Flags = MI.getFlags();
1871 
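  // The hardware sin/cos expect the input pre-scaled by 1/(2*pi). Subtargets
  // with a reduced valid input range additionally wrap the scaled value into
  // [0, 1) with fract before the trig intrinsic.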
1872   Register TrigVal;
1873   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1874   if (ST.hasTrigReducedRange()) {
1875     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1876     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1877       .addUse(MulVal.getReg(0))
1878       .setMIFlags(Flags).getReg(0);
1879   } else
1880     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1881 
1882   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1883     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1884   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1885     .addUse(TrigVal)
1886     .setMIFlags(Flags);
1887   MI.eraseFromParent();
1888   return true;
1889 }
1890 
1891 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1892   Register DstReg, LLT PtrTy,
1893   MachineIRBuilder &B, const GlobalValue *GV,
1894   unsigned Offset, unsigned GAFlags) const {
1895   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1896   // to the following code sequence:
1897   //
1898   // For constant address space:
1899   //   s_getpc_b64 s[0:1]
1900   //   s_add_u32 s0, s0, $symbol
1901   //   s_addc_u32 s1, s1, 0
1902   //
1903   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1904   //   a fixup or relocation is emitted to replace $symbol with a literal
1905   //   constant, which is a pc-relative offset from the encoding of the $symbol
1906   //   operand to the global variable.
1907   //
1908   // For global address space:
1909   //   s_getpc_b64 s[0:1]
1910   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1911   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1912   //
1913   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1914   //   fixups or relocations are emitted to replace $symbol@*@lo and
1915   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1916   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1917   //   operand to the global variable.
1918   //
1919   // What we want here is an offset from the value returned by s_getpc
1920   // (which is the address of the s_add_u32 instruction) to the global
1921   // variable, but since the encoding of $symbol starts 4 bytes after the start
1922   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1923   // small. This requires us to add 4 to the global variable offset in order to
1924   // compute the correct address.
1925 
1926   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1927 
1928   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1929     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1930 
1931   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1932     .addDef(PCReg);
1933 
1934   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1935   if (GAFlags == SIInstrInfo::MO_NONE)
1936     MIB.addImm(0);
1937   else
1938     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1939 
1940   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1941 
1942   if (PtrTy.getSizeInBits() == 32)
1943     B.buildExtract(DstReg, PCReg, 0);
1944   return true;
}
1946 
1947 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1948   MachineInstr &MI, MachineRegisterInfo &MRI,
1949   MachineIRBuilder &B) const {
1950   Register DstReg = MI.getOperand(0).getReg();
1951   LLT Ty = MRI.getType(DstReg);
1952   unsigned AS = Ty.getAddressSpace();
1953 
1954   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1955   MachineFunction &MF = B.getMF();
1956   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1957   B.setInstr(MI);
1958 
1959   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1960     if (!MFI->isEntryFunction()) {
1961       const Function &Fn = MF.getFunction();
1962       DiagnosticInfoUnsupported BadLDSDecl(
1963         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1964         DS_Warning);
1965       Fn.getContext().diagnose(BadLDSDecl);
1966 
1967       // We currently don't have a way to correctly allocate LDS objects that
1968       // aren't directly associated with a kernel. We do force inlining of
1969       // functions that use local objects. However, if these dead functions are
1970       // not eliminated, we don't want a compile time error. Just emit a warning
1971       // and a trap, since there should be no callable path here.
1972       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1973       B.buildUndef(DstReg);
1974       MI.eraseFromParent();
1975       return true;
1976     }
1977 
1978     // TODO: We could emit code to handle the initialization somewhere.
1979     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1980       const SITargetLowering *TLI = ST.getTargetLowering();
1981       if (!TLI->shouldUseLDSConstAddress(GV)) {
1982         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1983         return true; // Leave in place;
1984       }
1985 
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                     *cast<GlobalVariable>(GV)));
1987       MI.eraseFromParent();
1988       return true;
1989     }
1990 
1991     const Function &Fn = MF.getFunction();
1992     DiagnosticInfoUnsupported BadInit(
1993       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1994     Fn.getContext().diagnose(BadInit);
1995     return true;
1996   }
1997 
1998   const SITargetLowering *TLI = ST.getTargetLowering();
1999 
2000   if (TLI->shouldEmitFixup(GV)) {
2001     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2002     MI.eraseFromParent();
2003     return true;
2004   }
2005 
2006   if (TLI->shouldEmitPCReloc(GV)) {
2007     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2008     MI.eraseFromParent();
2009     return true;
2010   }
2011 
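  // All remaining cases go through the GOT: build a pc-relative address of
  // the GOT entry and load the actual pointer from it.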
2012   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2013   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2014 
2015   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2016       MachinePointerInfo::getGOT(MF),
2017       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2018           MachineMemOperand::MOInvariant,
2019       8 /*Size*/, Align(8));
2020 
2021   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2022 
2023   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2025     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2026     B.buildExtract(DstReg, Load, 0);
2027   } else
2028     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2029 
2030   MI.eraseFromParent();
2031   return true;
2032 }
2033 
2034 bool AMDGPULegalizerInfo::legalizeLoad(
2035   MachineInstr &MI, MachineRegisterInfo &MRI,
2036   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2037   B.setInstr(MI);
2038   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2039   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2040   Observer.changingInstr(MI);
2041   MI.getOperand(1).setReg(Cast.getReg(0));
2042   Observer.changedInstr(MI);
2043   return true;
2044 }
2045 
2046 bool AMDGPULegalizerInfo::legalizeFMad(
2047   MachineInstr &MI, MachineRegisterInfo &MRI,
2048   MachineIRBuilder &B) const {
2049   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2050   assert(Ty.isScalar());
2051 
2052   MachineFunction &MF = B.getMF();
2053   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2054 
2055   // TODO: Always legal with future ftz flag.
2056   // FIXME: Do we need just output?
2057   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2058     return true;
2059   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2060     return true;
2061 
2062   MachineIRBuilder HelperBuilder(MI);
2063   GISelObserverWrapper DummyObserver;
2064   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2065   HelperBuilder.setInstr(MI);
2066   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2067 }
2068 
2069 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2070   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2071   Register DstReg = MI.getOperand(0).getReg();
2072   Register PtrReg = MI.getOperand(1).getReg();
2073   Register CmpVal = MI.getOperand(2).getReg();
2074   Register NewVal = MI.getOperand(3).getReg();
2075 
2076   assert(SITargetLowering::isFlatGlobalAddrSpace(
2077            MRI.getType(PtrReg).getAddressSpace()) &&
2078          "this should not have been custom lowered");
2079 
2080   LLT ValTy = MRI.getType(CmpVal);
2081   LLT VecTy = LLT::vector(2, ValTy);
2082 
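  // The target cmpxchg pseudo expects the new value and the compare value
  // packed together into a single two-element vector operand.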
2083   B.setInstr(MI);
2084   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2085 
2086   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2087     .addDef(DstReg)
2088     .addUse(PtrReg)
2089     .addUse(PackedVal)
2090     .setMemRefs(MI.memoperands());
2091 
2092   MI.eraseFromParent();
2093   return true;
2094 }
2095 
2096 bool AMDGPULegalizerInfo::legalizeFlog(
2097   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2098   Register Dst = MI.getOperand(0).getReg();
2099   Register Src = MI.getOperand(1).getReg();
2100   LLT Ty = B.getMRI()->getType(Dst);
2101   unsigned Flags = MI.getFlags();
2102   B.setInstr(MI);
2103 
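  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes 1 / log2(b) as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).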
2104   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2105   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2106 
2107   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2108   MI.eraseFromParent();
2109   return true;
2110 }
2111 
2112 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2113                                        MachineIRBuilder &B) const {
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Src = MI.getOperand(1).getReg();
2116   unsigned Flags = MI.getFlags();
2117   LLT Ty = B.getMRI()->getType(Dst);
2118   B.setInstr(MI);
2119 
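  // exp(x) = exp2(x * log2(e)).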
2120   auto K = B.buildFConstant(Ty, numbers::log2e);
2121   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2122   B.buildFExp2(Dst, Mul, Flags);
2123   MI.eraseFromParent();
2124   return true;
2125 }
2126 
2127 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2128                                        MachineIRBuilder &B) const {
2129   Register Dst = MI.getOperand(0).getReg();
2130   Register Src0 = MI.getOperand(1).getReg();
2131   Register Src1 = MI.getOperand(2).getReg();
2132   unsigned Flags = MI.getFlags();
2133   LLT Ty = B.getMRI()->getType(Dst);
2134   B.setInstr(MI);
2135   const LLT S16 = LLT::scalar(16);
2136   const LLT S32 = LLT::scalar(32);
2137 
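  // x^y is expanded as exp2(y * log2(x)). The multiply uses the legacy mul
  // intrinsic, which follows the old DX9 rule that 0 * anything is 0.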
2138   if (Ty == S32) {
2139     auto Log = B.buildFLog2(S32, Src0, Flags);
2140     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2141       .addUse(Log.getReg(0))
2142       .addUse(Src1)
2143       .setMIFlags(Flags);
2144     B.buildFExp2(Dst, Mul, Flags);
2145   } else if (Ty == S16) {
2146     // There's no f16 fmul_legacy, so we need to convert for it.
2147     auto Log = B.buildFLog2(S16, Src0, Flags);
2148     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2149     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2150     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2151       .addUse(Ext0.getReg(0))
2152       .addUse(Ext1.getReg(0))
2153       .setMIFlags(Flags);
2154 
2155     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2156   } else
2157     return false;
2158 
2159   MI.eraseFromParent();
2160   return true;
2161 }
2162 
2163 // Find a source register, ignoring any possible source modifiers.
2164 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2165   Register ModSrc = OrigSrc;
2166   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2167     ModSrc = SrcFNeg->getOperand(1).getReg();
2168     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2169       ModSrc = SrcFAbs->getOperand(1).getReg();
2170   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2171     ModSrc = SrcFAbs->getOperand(1).getReg();
2172   return ModSrc;
2173 }
2174 
2175 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2176                                          MachineRegisterInfo &MRI,
2177                                          MachineIRBuilder &B) const {
2178   B.setInstr(MI);
2179 
2180   const LLT S1 = LLT::scalar(1);
2181   const LLT S64 = LLT::scalar(64);
2182   Register Dst = MI.getOperand(0).getReg();
2183   Register OrigSrc = MI.getOperand(1).getReg();
2184   unsigned Flags = MI.getFlags();
2185   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2186          "this should not have been custom lowered");
2187 
2188   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2189   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2190   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2191   // V_FRACT bug is:
2192   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2193   //
2194   // Convert floor(x) to (x - fract(x))
2195 
2196   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2197     .addUse(OrigSrc)
2198     .setMIFlags(Flags);
2199 
2200   // Give source modifier matching some assistance before obscuring a foldable
2201   // pattern.
2202 
2203   // TODO: We can avoid the neg on the fract? The input sign to fract
2204   // shouldn't matter?
2205   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2206 
2207   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2208 
2209   Register Min = MRI.createGenericVirtualRegister(S64);
2210 
2211   // We don't need to concern ourselves with the snan handling difference, so
2212   // use the one which will directly select.
2213   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2214   if (MFI->getMode().IEEE)
2215     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2216   else
2217     B.buildFMinNum(Min, Fract, Const, Flags);
2218 
2219   Register CorrectedFract = Min;
2220   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2221     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2222     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2223   }
2224 
2225   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2226   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2227 
2228   MI.eraseFromParent();
2229   return true;
2230 }
2231 
2232 // Turn an illegal packed v2s16 build vector into bit operations.
2233 // TODO: This should probably be a bitcast action in LegalizerHelper.
2234 bool AMDGPULegalizerInfo::legalizeBuildVector(
2235   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2236   Register Dst = MI.getOperand(0).getReg();
2237   const LLT S32 = LLT::scalar(32);
2238   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2239 
2240   Register Src0 = MI.getOperand(1).getReg();
2241   Register Src1 = MI.getOperand(2).getReg();
2242   assert(MRI.getType(Src0) == LLT::scalar(16));
2243 
2244   B.setInstr(MI);
2245   auto Merge = B.buildMerge(S32, {Src0, Src1});
2246   B.buildBitcast(Dst, Merge);
2247 
2248   MI.eraseFromParent();
2249   return true;
2250 }
2251 
2252 // Return the use branch instruction, otherwise null if the usage is invalid.
2253 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2254                                        MachineRegisterInfo &MRI,
2255                                        MachineInstr *&Br,
2256                                        MachineBasicBlock *&UncondBrTarget) {
2257   Register CondDef = MI.getOperand(0).getReg();
2258   if (!MRI.hasOneNonDBGUse(CondDef))
2259     return nullptr;
2260 
2261   MachineBasicBlock *Parent = MI.getParent();
2262   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2263   if (UseMI.getParent() != Parent ||
2264       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2265     return nullptr;
2266 
2267   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2268   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2269   if (Next == Parent->end()) {
2270     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2271     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2272       return nullptr;
2273     UncondBrTarget = &*NextMBB;
2274   } else {
2275     if (Next->getOpcode() != AMDGPU::G_BR)
2276       return nullptr;
2277     Br = &*Next;
2278     UncondBrTarget = Br->getOperand(0).getMBB();
2279   }
2280 
2281   return &UseMI;
2282 }
2283 
2284 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2285                                                MachineRegisterInfo &MRI,
2286                                                Register LiveIn,
2287                                                Register PhyReg) const {
2288   assert(PhyReg.isPhysical() && "Physical register expected");
2289 
2290   // Insert the live-in copy, if required, by defining destination virtual
2291   // register.
2292   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2293   if (!MRI.getVRegDef(LiveIn)) {
2294     // FIXME: Should have scoped insert pt
2295     MachineBasicBlock &OrigInsBB = B.getMBB();
2296     auto OrigInsPt = B.getInsertPt();
2297 
2298     MachineBasicBlock &EntryMBB = B.getMF().front();
2299     EntryMBB.addLiveIn(PhyReg);
2300     B.setInsertPt(EntryMBB, EntryMBB.begin());
2301     B.buildCopy(LiveIn, PhyReg);
2302 
2303     B.setInsertPt(OrigInsBB, OrigInsPt);
2304   }
2305 
2306   return LiveIn;
2307 }
2308 
2309 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2310                                                 MachineRegisterInfo &MRI,
2311                                                 Register PhyReg, LLT Ty,
2312                                                 bool InsertLiveInCopy) const {
2313   assert(PhyReg.isPhysical() && "Physical register expected");
2314 
  // Get or create the virtual live-in register.
2316   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2317   if (!LiveIn) {
2318     LiveIn = MRI.createGenericVirtualRegister(Ty);
2319     MRI.addLiveIn(PhyReg, LiveIn);
2320   }
2321 
  // When the copy that is actually required goes from a virtual register to a
  // physical register (and will be inserted later), there is no need to insert
  // a live-in copy from the physical register to the virtual register here.
2325   if (!InsertLiveInCopy)
2326     return LiveIn;
2327 
2328   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2329 }
2330 
2331 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2332     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2333   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2334   const ArgDescriptor *Arg;
2335   const TargetRegisterClass *RC;
2336   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2337   if (!Arg) {
2338     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2339     return nullptr;
2340   }
2341   return Arg;
2342 }
2343 
2344 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2345                                          const ArgDescriptor *Arg) const {
2346   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2347     return false; // TODO: Handle these
2348 
2349   Register SrcReg = Arg->getRegister();
2350   assert(SrcReg.isPhysical() && "Physical register expected");
2351   assert(DstReg.isVirtual() && "Virtual register expected");
2352 
2353   MachineRegisterInfo &MRI = *B.getMRI();
2354 
2355   LLT Ty = MRI.getType(DstReg);
2356   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2357 
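  // Masked arguments are packed into a sub-field of the register: shift the
  // field down to bit 0 and mask off the neighboring fields.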
2358   if (Arg->isMasked()) {
2359     // TODO: Should we try to emit this once in the entry block?
2360     const LLT S32 = LLT::scalar(32);
2361     const unsigned Mask = Arg->getMask();
2362     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2363 
2364     Register AndMaskSrc = LiveIn;
2365 
2366     if (Shift != 0) {
2367       auto ShiftAmt = B.buildConstant(S32, Shift);
2368       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2369     }
2370 
2371     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2372   } else {
2373     B.buildCopy(DstReg, LiveIn);
2374   }
2375 
2376   return true;
2377 }
2378 
2379 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2380     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2381     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2382   B.setInstr(MI);
2383 
2384   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2385   if (!Arg)
2386     return false;
2387 
2388   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2389     return false;
2390 
2391   MI.eraseFromParent();
2392   return true;
2393 }
2394 
2395 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2396                                        MachineRegisterInfo &MRI,
2397                                        MachineIRBuilder &B) const {
2398   B.setInstr(MI);
2399   Register Dst = MI.getOperand(0).getReg();
2400   LLT DstTy = MRI.getType(Dst);
2401   LLT S16 = LLT::scalar(16);
2402   LLT S32 = LLT::scalar(32);
2403   LLT S64 = LLT::scalar(64);
2404 
2405   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2406     return true;
2407 
2408   if (DstTy == S16)
2409     return legalizeFDIV16(MI, MRI, B);
2410   if (DstTy == S32)
2411     return legalizeFDIV32(MI, MRI, B);
2412   if (DstTy == S64)
2413     return legalizeFDIV64(MI, MRI, B);
2414 
2415   return false;
2416 }
2417 
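// Approximate 2^32 / Src: convert to float, take the reciprocal, scale the
// result by 2^32 (0x4f800000) and convert back to an unsigned integer.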
2418 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2419   const LLT S32 = LLT::scalar(32);
2420 
2421   auto Cvt0 = B.buildUITOFP(S32, Src);
2422   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2423   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2424   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2425   return B.buildFPTOUI(S32, Mul).getReg(0);
2426 }
2427 
2428 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2429                                                   Register DstReg,
2430                                                   Register Num,
2431                                                   Register Den,
2432                                                   bool IsRem) const {
2433   const LLT S1 = LLT::scalar(1);
2434   const LLT S32 = LLT::scalar(32);
2435 
2436   // RCP =  URECIP(Den) = 2^32 / Den + e
2437   // e is rounding error.
2438   auto RCP = buildDivRCP(B, Den);
2439 
2440   // RCP_LO = mul(RCP, Den)
2441   auto RCP_LO = B.buildMul(S32, RCP, Den);
2442 
  // RCP_HI = mulhu(RCP, Den)
2444   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2445 
2446   // NEG_RCP_LO = -RCP_LO
2447   auto Zero = B.buildConstant(S32, 0);
2448   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2449 
2450   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2451   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2452   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2453 
2454   // Calculate the rounding error from the URECIP instruction
2455   // E = mulhu(ABS_RCP_LO, RCP)
2456   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2457 
2458   // RCP_A_E = RCP + E
2459   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2460 
2461   // RCP_S_E = RCP - E
2462   auto RCP_S_E = B.buildSub(S32, RCP, E);
2463 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2465   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2466 
  // Quotient = mulhu(Tmp0, Num)
2468   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2469 
2470   // Num_S_Remainder = Quotient * Den
2471   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2472 
2473   // Remainder = Num - Num_S_Remainder
2474   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2475 
2476   // Remainder_GE_Den = Remainder >= Den
2477   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2478 
2479   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2480   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2481                                        Num, Num_S_Remainder);
2482 
2483   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2484   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2485 
2486   // Calculate Division result:
2487 
2488   // Quotient_A_One = Quotient + 1
2489   auto One = B.buildConstant(S32, 1);
2490   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2491 
2492   // Quotient_S_One = Quotient - 1
2493   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2494 
2495   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2496   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2497 
2498   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2499   if (IsRem) {
2500     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2501 
2502     // Calculate Rem result:
2503     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2504 
2505     // Remainder_A_Den = Remainder + Den
2506     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2507 
2508     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2509     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2510 
2511     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2512     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2513   } else {
2514     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2515   }
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2519                                               MachineRegisterInfo &MRI,
2520                                               MachineIRBuilder &B) const {
2521   B.setInstr(MI);
2522   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2523   Register DstReg = MI.getOperand(0).getReg();
2524   Register Num = MI.getOperand(1).getReg();
2525   Register Den = MI.getOperand(2).getReg();
2526   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2527   MI.eraseFromParent();
2528   return true;
2529 }
2530 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2532 //
2533 // Return lo, hi of result
2534 //
2535 // %cvt.lo = G_UITOFP Val.lo
2536 // %cvt.hi = G_UITOFP Val.hi
2537 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2538 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2539 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2540 // %mul2 = G_FMUL %mul1, 2**(-32)
2541 // %trunc = G_INTRINSIC_TRUNC %mul2
2542 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2543 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2544 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2545                                                        Register Val) {
2546   const LLT S32 = LLT::scalar(32);
2547   auto Unmerge = B.buildUnmerge(S32, Val);
2548 
2549   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2550   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2551 
2552   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2553                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2554 
2555   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2556   auto Mul1 =
2557       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2558 
2559   // 2**(-32)
2560   auto Mul2 =
2561       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2562   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2563 
2564   // -(2**32)
2565   auto Mad2 = B.buildFMAD(S32, Trunc,
2566                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2567 
2568   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2569   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2570 
2571   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2572 }
2573 
2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2575                                               MachineRegisterInfo &MRI,
2576                                               MachineIRBuilder &B) const {
2577   B.setInstr(MI);
2578 
2579   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2580   const LLT S32 = LLT::scalar(32);
2581   const LLT S64 = LLT::scalar(64);
2582   const LLT S1 = LLT::scalar(1);
2583   Register Numer = MI.getOperand(1).getReg();
2584   Register Denom = MI.getOperand(2).getReg();
2585   Register RcpLo, RcpHi;
2586 
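  // Compute a fixed-point estimate of 2^64 / Denom, refine it with two rounds
  // of multiply and carry-propagating corrections, form the quotient as
  // mulhi(Numer, Rcp), and finally apply up to two conditional correction
  // steps to the quotient (or remainder).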
2587   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2588 
2589   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2590 
2591   auto Zero64 = B.buildConstant(S64, 0);
2592   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2593 
2594   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2595   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2596 
2597   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2598   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2599   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2600 
2601   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2602   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2603   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2604   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2605 
2606   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2607   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2608   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2609   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2610   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2611 
2612   auto Zero32 = B.buildConstant(S32, 0);
2613   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2614   auto Add2_HiC =
2615       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2616   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2617   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2618 
2619   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2620   Register NumerLo = UnmergeNumer.getReg(0);
2621   Register NumerHi = UnmergeNumer.getReg(1);
2622 
2623   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2624   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2625   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2626   Register Mul3_Lo = UnmergeMul3.getReg(0);
2627   Register Mul3_Hi = UnmergeMul3.getReg(1);
2628   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2629   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2630   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2631   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2632 
2633   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2634   Register DenomLo = UnmergeDenom.getReg(0);
2635   Register DenomHi = UnmergeDenom.getReg(1);
2636 
2637   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2638   auto C1 = B.buildSExt(S32, CmpHi);
2639 
2640   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2641   auto C2 = B.buildSExt(S32, CmpLo);
2642 
2643   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2644   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2645 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we use four
  // selects after the potential endif to substitute for PHIs.
2649 
2650   // if C3 != 0 ...
2651   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2652   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2653   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2654   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2655 
2656   auto One64 = B.buildConstant(S64, 1);
2657   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2658 
2659   auto C4 =
2660       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2661   auto C5 =
2662       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2663   auto C6 = B.buildSelect(
2664       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2665 
2666   // if (C6 != 0)
2667   auto Add4 = B.buildAdd(S64, Add3, One64);
2668   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2669 
2670   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2671   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2672   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2673 
2674   // endif C6
2675   // endif C3
2676 
2677   if (IsDiv) {
2678     auto Sel1 = B.buildSelect(
2679         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2680     B.buildSelect(MI.getOperand(0),
2681                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2682   } else {
2683     auto Sel2 = B.buildSelect(
2684         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2685     B.buildSelect(MI.getOperand(0),
2686                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2687   }
2688 
2689   MI.eraseFromParent();
2690   return true;
2691 }
2692 
2693 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2694                                             MachineRegisterInfo &MRI,
2695                                             MachineIRBuilder &B) const {
2696   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2697   if (Ty == LLT::scalar(32))
2698     return legalizeUDIV_UREM32(MI, MRI, B);
2699   if (Ty == LLT::scalar(64))
2700     return legalizeUDIV_UREM64(MI, MRI, B);
2701   return false;
2702 }
2703 
2704 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2705                                               MachineRegisterInfo &MRI,
2706                                               MachineIRBuilder &B) const {
2707   B.setInstr(MI);
2708   const LLT S32 = LLT::scalar(32);
2709 
2710   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2711   Register DstReg = MI.getOperand(0).getReg();
2712   Register LHS = MI.getOperand(1).getReg();
2713   Register RHS = MI.getOperand(2).getReg();
2714 
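  // Reduce to the unsigned case: |x| = (x + sign(x)) ^ sign(x), where sign(x)
  // is x >> 31. The remainder takes the sign of the dividend, while the
  // quotient is negated iff the operand signs differ.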
2715   auto ThirtyOne = B.buildConstant(S32, 31);
2716   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2718 
2719   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2720   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2721 
2722   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2723   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2724 
2725   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2726   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2727 
2728   if (IsRem) {
2729     auto RSign = LHSign; // Remainder sign is the same as LHS
2730     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2731     B.buildSub(DstReg, UDivRem, RSign);
2732   } else {
2733     auto DSign = B.buildXor(S32, LHSign, RHSign);
2734     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2735     B.buildSub(DstReg, UDivRem, DSign);
2736   }
2737 
2738   MI.eraseFromParent();
2739   return true;
2740 }
2741 
2742 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2743                                             MachineRegisterInfo &MRI,
2744                                             MachineIRBuilder &B) const {
2745   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2746     return legalizeSDIV_SREM32(MI, MRI, B);
2747   return false;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2751                                                  MachineRegisterInfo &MRI,
2752                                                  MachineIRBuilder &B) const {
2753   Register Res = MI.getOperand(0).getReg();
2754   Register LHS = MI.getOperand(1).getReg();
2755   Register RHS = MI.getOperand(2).getReg();
2756 
2757   uint16_t Flags = MI.getFlags();
2758 
2759   LLT ResTy = MRI.getType(Res);
2760   LLT S32 = LLT::scalar(32);
2761   LLT S64 = LLT::scalar(64);
2762 
2763   const MachineFunction &MF = B.getMF();
2764   bool Unsafe =
2765     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2766 
2767   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2768     return false;
2769 
2770   if (!Unsafe && ResTy == S32 &&
2771       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2772     return false;
2773 
2774   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2775     // 1 / x -> RCP(x)
2776     if (CLHS->isExactlyValue(1.0)) {
2777       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2778         .addUse(RHS)
2779         .setMIFlags(Flags);
2780 
2781       MI.eraseFromParent();
2782       return true;
2783     }
2784 
2785     // -1 / x -> RCP( FNEG(x) )
2786     if (CLHS->isExactlyValue(-1.0)) {
2787       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2788       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2789         .addUse(FNeg.getReg(0))
2790         .setMIFlags(Flags);
2791 
2792       MI.eraseFromParent();
2793       return true;
2794     }
2795   }
2796 
2797   // x / y -> x * (1.0 / y)
2798   if (Unsafe) {
2799     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2800       .addUse(RHS)
2801       .setMIFlags(Flags);
2802     B.buildFMul(Res, LHS, RCP, Flags);
2803 
2804     MI.eraseFromParent();
2805     return true;
2806   }
2807 
2808   return false;
2809 }
2810 
2811 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2812                                          MachineRegisterInfo &MRI,
2813                                          MachineIRBuilder &B) const {
2814   B.setInstr(MI);
2815   Register Res = MI.getOperand(0).getReg();
2816   Register LHS = MI.getOperand(1).getReg();
2817   Register RHS = MI.getOperand(2).getReg();
2818 
2819   uint16_t Flags = MI.getFlags();
2820 
2821   LLT S16 = LLT::scalar(16);
2822   LLT S32 = LLT::scalar(32);
2823 
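  // There is no f16 reciprocal refinement sequence: extend to f32, compute
  // lhs * (1 / rhs) there, truncate back to f16 and let div_fixup handle the
  // edge cases.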
2824   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2825   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2826 
2827   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2828     .addUse(RHSExt.getReg(0))
2829     .setMIFlags(Flags);
2830 
2831   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2832   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2833 
2834   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2835     .addUse(RDst.getReg(0))
2836     .addUse(RHS)
2837     .addUse(LHS)
2838     .setMIFlags(Flags);
2839 
2840   MI.eraseFromParent();
2841   return true;
2842 }
2843 
2844 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2845 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2846 static void toggleSPDenormMode(bool Enable,
2847                                MachineIRBuilder &B,
2848                                const GCNSubtarget &ST,
2849                                AMDGPU::SIModeRegisterDefaults Mode) {
2850   // Set SP denorm mode to this value.
2851   unsigned SPDenormMode =
2852     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2853 
2854   if (ST.hasDenormModeInst()) {
2855     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2856     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2857 
2858     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2859     B.buildInstr(AMDGPU::S_DENORM_MODE)
2860       .addImm(NewDenormModeValue);
2861 
2862   } else {
2863     // Select FP32 bit field in mode register.
2864     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2865                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2866                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2867 
2868     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2869       .addImm(SPDenormMode)
2870       .addImm(SPDenormModeBitField);
2871   }
2872 }
2873 
2874 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2875                                          MachineRegisterInfo &MRI,
2876                                          MachineIRBuilder &B) const {
2877   B.setInstr(MI);
2878   Register Res = MI.getOperand(0).getReg();
2879   Register LHS = MI.getOperand(1).getReg();
2880   Register RHS = MI.getOperand(2).getReg();
2881   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2882   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2883 
2884   uint16_t Flags = MI.getFlags();
2885 
2886   LLT S32 = LLT::scalar(32);
2887   LLT S1 = LLT::scalar(1);
2888 
2889   auto One = B.buildFConstant(S32, 1.0f);
2890 
2891   auto DenominatorScaled =
2892     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2893       .addUse(LHS)
2894       .addUse(RHS)
2895       .addImm(0)
2896       .setMIFlags(Flags);
2897   auto NumeratorScaled =
2898     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2899       .addUse(LHS)
2900       .addUse(RHS)
2901       .addImm(1)
2902       .setMIFlags(Flags);
2903 
2904   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2905     .addUse(DenominatorScaled.getReg(0))
2906     .setMIFlags(Flags);
2907   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2908 
2909   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2910   // aren't modeled as reading it.
2911   if (!Mode.allFP32Denormals())
2912     toggleSPDenormMode(true, B, ST, Mode);
2913 
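  // Newton-Raphson refinement of the reciprocal, followed by the quotient
  // estimate and residual terms that feed div_fmas/div_fixup below.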
2914   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2915   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2916   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2917   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2918   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2919   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2920 
2921   if (!Mode.allFP32Denormals())
2922     toggleSPDenormMode(false, B, ST, Mode);
2923 
2924   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2925     .addUse(Fma4.getReg(0))
2926     .addUse(Fma1.getReg(0))
2927     .addUse(Fma3.getReg(0))
2928     .addUse(NumeratorScaled.getReg(1))
2929     .setMIFlags(Flags);
2930 
2931   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2932     .addUse(Fmas.getReg(0))
2933     .addUse(RHS)
2934     .addUse(LHS)
2935     .setMIFlags(Flags);
2936 
2937   MI.eraseFromParent();
2938   return true;
2939 }
2940 
2941 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2942                                          MachineRegisterInfo &MRI,
2943                                          MachineIRBuilder &B) const {
2944   B.setInstr(MI);
2945   Register Res = MI.getOperand(0).getReg();
2946   Register LHS = MI.getOperand(1).getReg();
2947   Register RHS = MI.getOperand(2).getReg();
2948 
2949   uint16_t Flags = MI.getFlags();
2950 
2951   LLT S64 = LLT::scalar(64);
2952   LLT S1 = LLT::scalar(1);
2953 
2954   auto One = B.buildFConstant(S64, 1.0);
2955 
2956   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2957     .addUse(LHS)
2958     .addUse(RHS)
2959     .addImm(0)
2960     .setMIFlags(Flags);
2961 
2962   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2963 
2964   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2965     .addUse(DivScale0.getReg(0))
2966     .setMIFlags(Flags);
2967 
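  // Refine the reciprocal of the scaled denominator with fma-based
  // Newton-Raphson steps before forming the quotient estimate and residual.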
2968   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2969   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2970   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2971 
2972   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2973     .addUse(LHS)
2974     .addUse(RHS)
2975     .addImm(1)
2976     .setMIFlags(Flags);
2977 
2978   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2979   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2980   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2981 
2982   Register Scale;
2983   if (!ST.hasUsableDivScaleConditionOutput()) {
2984     // Workaround a hardware bug on SI where the condition output from div_scale
2985     // is not usable.
2986 
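    // Recover the scale flag manually by comparing the high halves of the
    // operands with the corresponding div_scale results and XORing the two
    // comparisons.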
2987     LLT S32 = LLT::scalar(32);
2988 
2989     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2990     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2991     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2992     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2993 
2994     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2995                               Scale1Unmerge.getReg(1));
2996     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2997                               Scale0Unmerge.getReg(1));
2998     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2999   } else {
3000     Scale = DivScale1.getReg(1);
3001   }
3002 
3003   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3004     .addUse(Fma4.getReg(0))
3005     .addUse(Fma3.getReg(0))
3006     .addUse(Mul.getReg(0))
3007     .addUse(Scale)
3008     .setMIFlags(Flags);
3009 
3010   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3011     .addUse(Fmas.getReg(0))
3012     .addUse(RHS)
3013     .addUse(LHS)
3014     .setMIFlags(Flags);
3015 
3016   MI.eraseFromParent();
3017   return true;
3018 }
3019 
3020 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3021                                                  MachineRegisterInfo &MRI,
3022                                                  MachineIRBuilder &B) const {
3023   B.setInstr(MI);
3024   Register Res = MI.getOperand(0).getReg();
3025   Register LHS = MI.getOperand(2).getReg();
3026   Register RHS = MI.getOperand(3).getReg();
3027   uint16_t Flags = MI.getFlags();
3028 
3029   LLT S32 = LLT::scalar(32);
3030   LLT S1 = LLT::scalar(1);
3031 
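  // If |RHS| exceeds 2^96 (0x6f800000), pre-scale it by 2^-32 (0x2f800000)
  // before taking the reciprocal; the same scale factor is applied to the
  // final product, so it cancels out. Otherwise the scale is 1.0.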
3032   auto Abs = B.buildFAbs(S32, RHS, Flags);
3033   const APFloat C0Val(1.0f);
3034 
3035   auto C0 = B.buildConstant(S32, 0x6f800000);
3036   auto C1 = B.buildConstant(S32, 0x2f800000);
3037   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3038 
3039   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3040   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3041 
3042   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3043 
3044   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3045     .addUse(Mul0.getReg(0))
3046     .setMIFlags(Flags);
3047 
3048   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3049 
3050   B.buildFMul(Res, Sel, Mul1, Flags);
3051 
3052   MI.eraseFromParent();
3053   return true;
3054 }
3055 
3056 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3057                                                  MachineRegisterInfo &MRI,
3058                                                  MachineIRBuilder &B) const {
3059   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3060   if (!MFI->isEntryFunction()) {
3061     return legalizePreloadedArgIntrin(MI, MRI, B,
3062                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3063   }
3064 
3065   B.setInstr(MI);
3066 
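  // In an entry function the implicit arguments follow the explicit kernel
  // arguments, so offset the kernarg segment pointer by the implicit
  // parameter offset.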
3067   uint64_t Offset =
3068     ST.getTargetLowering()->getImplicitParameterOffset(
3069       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3070   Register DstReg = MI.getOperand(0).getReg();
3071   LLT DstTy = MRI.getType(DstReg);
3072   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3073 
3074   const ArgDescriptor *Arg;
3075   const TargetRegisterClass *RC;
3076   std::tie(Arg, RC)
3077     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3078   if (!Arg)
3079     return false;
3080 
3081   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3082   if (!loadInputValue(KernargPtrReg, B, Arg))
3083     return false;
3084 
3085   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3086   MI.eraseFromParent();
3087   return true;
3088 }
3089 
3090 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3091                                               MachineRegisterInfo &MRI,
3092                                               MachineIRBuilder &B,
3093                                               unsigned AddrSpace) const {
3094   B.setInstr(MI);
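  // The pointer is in the queried address space iff the high 32 bits of the
  // flat pointer match the segment's aperture base.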
3095   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3096   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3097   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3098   MI.eraseFromParent();
3099   return true;
3100 }
3101 
3102 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3103 // offset (the offset that is included in bounds checking and swizzling, to be
3104 // split between the instruction's voffset and immoffset fields) and soffset
3105 // (the offset that is excluded from bounds checking and swizzling, to go in
3106 // the instruction's soffset field).  This function takes the first kind of
3107 // offset and figures out how to split it between voffset and immoffset.
3108 std::tuple<Register, unsigned, unsigned>
3109 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3110                                         Register OrigOffset) const {
3111   const unsigned MaxImm = 4095;
3112   Register BaseReg;
3113   unsigned TotalConstOffset;
3114   MachineInstr *OffsetDef;
3115   const LLT S32 = LLT::scalar(32);
3116 
3117   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3118     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3119 
3120   unsigned ImmOffset = TotalConstOffset;
3121 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field and move the remainder (a multiple of
  // 4096) into the value that is copied/added for the voffset field, so that
  // it stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
3129   unsigned Overflow = ImmOffset & ~MaxImm;
3130   ImmOffset -= Overflow;
3131   if ((int32_t)Overflow < 0) {
3132     Overflow += ImmOffset;
3133     ImmOffset = 0;
3134   }
3135 
3136   if (Overflow != 0) {
3137     if (!BaseReg) {
3138       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3139     } else {
3140       auto OverflowVal = B.buildConstant(S32, Overflow);
3141       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3142     }
3143   }
3144 
3145   if (!BaseReg)
3146     BaseReg = B.buildConstant(S32, 0).getReg(0);
3147 
3148   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3149 }
3150 
3151 /// Handle register layout difference for f16 images for some subtargets.
3152 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3153                                              MachineRegisterInfo &MRI,
3154                                              Register Reg) const {
3155   if (!ST.hasUnpackedD16VMem())
3156     return Reg;
3157 
3158   const LLT S16 = LLT::scalar(16);
3159   const LLT S32 = LLT::scalar(32);
3160   LLT StoreVT = MRI.getType(Reg);
3161   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3162 
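  // Unpacked subtargets expect one dword per 16-bit element, so unmerge the
  // s16 elements, any-extend each to s32, and rebuild as a wider vector.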
3163   auto Unmerge = B.buildUnmerge(S16, Reg);
3164 
3165   SmallVector<Register, 4> WideRegs;
3166   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3167     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3168 
3169   int NumElts = StoreVT.getNumElements();
3170 
3171   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3172 }
3173 
3174 Register AMDGPULegalizerInfo::fixStoreSourceType(
3175   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3176   MachineRegisterInfo *MRI = B.getMRI();
3177   LLT Ty = MRI->getType(VData);
3178 
3179   const LLT S16 = LLT::scalar(16);
3180 
  // Fixup illegal register types for i8 and i16 stores.
3182   if (Ty == LLT::scalar(8) || Ty == S16) {
3183     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3184     return AnyExt;
3185   }
3186 
3187   if (Ty.isVector()) {
3188     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3189       if (IsFormat)
3190         return handleD16VData(B, *MRI, VData);
3191     }
3192   }
3193 
3194   return VData;
3195 }
3196 
3197 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3198                                               MachineRegisterInfo &MRI,
3199                                               MachineIRBuilder &B,
3200                                               bool IsTyped,
3201                                               bool IsFormat) const {
3202   B.setInstr(MI);
3203 
3204   Register VData = MI.getOperand(1).getReg();
3205   LLT Ty = MRI.getType(VData);
3206   LLT EltTy = Ty.getScalarType();
3207   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3208   const LLT S32 = LLT::scalar(32);
3209 
3210   VData = fixStoreSourceType(B, VData, IsFormat);
3211   Register RSrc = MI.getOperand(2).getReg();
3212 
3213   MachineMemOperand *MMO = *MI.memoperands_begin();
3214   const int MemSize = MMO->getSize();
3215 
3216   unsigned ImmOffset;
3217   unsigned TotalOffset;
3218 
3219   // The typed intrinsics add an immediate after the registers.
3220   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3221 
3222   // The struct intrinsic variants add one additional operand over raw.
3223   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3224   Register VIndex;
3225   int OpOffset = 0;
3226   if (HasVIndex) {
3227     VIndex = MI.getOperand(3).getReg();
3228     OpOffset = 1;
3229   }
3230 
3231   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3232   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3233 
3234   unsigned Format = 0;
3235   if (IsTyped) {
3236     Format = MI.getOperand(5 + OpOffset).getImm();
3237     ++OpOffset;
3238   }
3239 
3240   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3241 
3242   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3243   if (TotalOffset != 0)
3244     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3245 
3246   unsigned Opc;
3247   if (IsTyped) {
3248     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3249                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3250   } else if (IsFormat) {
3251     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3252                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3253   } else {
3254     switch (MemSize) {
3255     case 1:
3256       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3257       break;
3258     case 2:
3259       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3260       break;
3261     default:
3262       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3263       break;
3264     }
3265   }
3266 
3267   if (!VIndex)
3268     VIndex = B.buildConstant(S32, 0).getReg(0);
3269 
3270   auto MIB = B.buildInstr(Opc)
3271     .addUse(VData)              // vdata
3272     .addUse(RSrc)               // rsrc
3273     .addUse(VIndex)             // vindex
3274     .addUse(VOffset)            // voffset
3275     .addUse(SOffset)            // soffset
3276     .addImm(ImmOffset);         // offset(imm)
3277 
3278   if (IsTyped)
3279     MIB.addImm(Format);
3280 
3281   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3282      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3283      .addMemOperand(MMO);
3284 
3285   MI.eraseFromParent();
3286   return true;
3287 }
3288 
3289 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3290                                              MachineRegisterInfo &MRI,
3291                                              MachineIRBuilder &B,
3292                                              bool IsFormat,
3293                                              bool IsTyped) const {
3294   B.setInstr(MI);
3295 
3296   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3297   MachineMemOperand *MMO = *MI.memoperands_begin();
3298   const int MemSize = MMO->getSize();
3299   const LLT S32 = LLT::scalar(32);
3300 
3301   Register Dst = MI.getOperand(0).getReg();
3302   Register RSrc = MI.getOperand(2).getReg();
3303 
3304   // The typed intrinsics add an immediate after the registers.
3305   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3306 
3307   // The struct intrinsic variants add one additional operand over raw.
3308   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3309   Register VIndex;
3310   int OpOffset = 0;
3311   if (HasVIndex) {
3312     VIndex = MI.getOperand(3).getReg();
3313     OpOffset = 1;
3314   }
3315 
3316   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3317   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3318 
3319   unsigned Format = 0;
3320   if (IsTyped) {
3321     Format = MI.getOperand(5 + OpOffset).getImm();
3322     ++OpOffset;
3323   }
3324 
3325   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3326   unsigned ImmOffset;
3327   unsigned TotalOffset;
3328 
3329   LLT Ty = MRI.getType(Dst);
3330   LLT EltTy = Ty.getScalarType();
3331   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3332   const bool Unpacked = ST.hasUnpackedD16VMem();
3333 
3334   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3335   if (TotalOffset != 0)
3336     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3337 
3338   unsigned Opc;
3339 
3340   if (IsTyped) {
3341     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3342                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3343   } else if (IsFormat) {
3344     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3345                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3346   } else {
3347     switch (MemSize) {
3348     case 1:
3349       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3350       break;
3351     case 2:
3352       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3353       break;
3354     default:
3355       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3356       break;
3357     }
3358   }
3359 
3360   Register LoadDstReg;
3361 
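  // Sub-dword scalar loads (and d16 scalar results) are widened to s32 and
  // truncated after the buffer load; unpacked d16 vectors load one s32 per
  // element and are repacked below.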
3362   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3363   LLT UnpackedTy = Ty.changeElementSize(32);
3364 
3365   if (IsExtLoad)
3366     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3367   else if (Unpacked && IsD16 && Ty.isVector())
3368     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3369   else
3370     LoadDstReg = Dst;
3371 
3372   if (!VIndex)
3373     VIndex = B.buildConstant(S32, 0).getReg(0);
3374 
3375   auto MIB = B.buildInstr(Opc)
3376     .addDef(LoadDstReg)         // vdata
3377     .addUse(RSrc)               // rsrc
3378     .addUse(VIndex)             // vindex
3379     .addUse(VOffset)            // voffset
3380     .addUse(SOffset)            // soffset
3381     .addImm(ImmOffset);         // offset(imm)
3382 
3383   if (IsTyped)
3384     MIB.addImm(Format);
3385 
3386   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3387      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3388      .addMemOperand(MMO);
3389 
3390   if (LoadDstReg != Dst) {
3391     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3392 
    // The result for an extending load was widened; truncate it back down.
3394     if (IsExtLoad)
3395       B.buildTrunc(Dst, LoadDstReg);
3396     else {
3397       // Repack to original 16-bit vector result
3398       // FIXME: G_TRUNC should work, but legalization currently fails
3399       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3400       SmallVector<Register, 4> Repack;
3401       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3402         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3403       B.buildMerge(Dst, Repack);
3404     }
3405   }
3406 
3407   MI.eraseFromParent();
3408   return true;
3409 }
3410 
3411 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3412                                                MachineIRBuilder &B,
3413                                                bool IsInc) const {
3414   B.setInstr(MI);
3415   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3416                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3417   B.buildInstr(Opc)
3418     .addDef(MI.getOperand(0).getReg())
3419     .addUse(MI.getOperand(2).getReg())
3420     .addUse(MI.getOperand(3).getReg())
3421     .cloneMemRefs(MI);
3422   MI.eraseFromParent();
3423   return true;
3424 }
3425 
3426 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3427   switch (IntrID) {
3428   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3429   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3430     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3431   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3432   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3433     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3434   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3435   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3436     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3437   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3438   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3439     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3440   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3441   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3442     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3443   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3444   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3445     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3446   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3447   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3448     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3449   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3450   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3451     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3452   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3453   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3454     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3455   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3456   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3457     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3458   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3459   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3460     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3461   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3462   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3463     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3464   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3466     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3467   default:
3468     llvm_unreachable("unhandled atomic opcode");
3469   }
3470 }
3471 
3472 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3473                                                MachineIRBuilder &B,
3474                                                Intrinsic::ID IID) const {
3475   B.setInstr(MI);
3476 
3477   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3478                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3479 
3480   Register Dst = MI.getOperand(0).getReg();
3481   Register VData = MI.getOperand(2).getReg();
3482 
3483   Register CmpVal;
3484   int OpOffset = 0;
3485 
3486   if (IsCmpSwap) {
3487     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3488     ++OpOffset;
3489   }
3490 
3491   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3492   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3493 
3494   // The struct intrinsic variants add one additional operand over raw.
3495   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3496   Register VIndex;
3497   if (HasVIndex) {
3498     VIndex = MI.getOperand(4 + OpOffset).getReg();
3499     ++OpOffset;
3500   }
3501 
3502   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3503   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3504   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3505 
3506   MachineMemOperand *MMO = *MI.memoperands_begin();
3507 
3508   unsigned ImmOffset;
3509   unsigned TotalOffset;
3510   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3511   if (TotalOffset != 0)
3512     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3513 
3514   if (!VIndex)
3515     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3516 
3517   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3518     .addDef(Dst)
3519     .addUse(VData); // vdata
3520 
3521   if (IsCmpSwap)
3522     MIB.addReg(CmpVal);
3523 
3524   MIB.addUse(RSrc)               // rsrc
3525      .addUse(VIndex)             // vindex
3526      .addUse(VOffset)            // voffset
3527      .addUse(SOffset)            // soffset
3528      .addImm(ImmOffset)          // offset(imm)
3529      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3530      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3531      .addMemOperand(MMO);
3532 
3533   MI.eraseFromParent();
3534   return true;
3535 }
3536 
/// Pack the s16 typed address registers of \p MI into dword-sized vectors with
/// two s16 elements each, appending the packed registers to \p PackedAddrs.
3539 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3540                                         SmallVectorImpl<Register> &PackedAddrs,
3541                                         int AddrIdx, int DimIdx, int NumVAddrs,
3542                                         int NumGradients) {
3543   const LLT S16 = LLT::scalar(16);
3544   const LLT V2S16 = LLT::vector(2, 16);
3545 
3546   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3547     MachineOperand &SrcOp = MI.getOperand(I);
3548     if (!SrcOp.isReg())
3549       continue; // _L to _LZ may have eliminated this.
3550 
3551     Register AddrReg = SrcOp.getReg();
3552 
3553     if (I < DimIdx) {
3554       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3555       PackedAddrs.push_back(AddrReg);
3556     } else {
3557       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3558       // derivatives dx/dh and dx/dv are packed with undef.
3559       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3560           ((NumGradients / 2) % 2 == 1 &&
3561            (I == DimIdx + (NumGradients / 2) - 1 ||
3562             I == DimIdx + NumGradients - 1)) ||
3563           // Check for _L to _LZ optimization
3564           !MI.getOperand(I + 1).isReg()) {
3565         PackedAddrs.push_back(
3566             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3567                 .getReg(0));
3568       } else {
3569         PackedAddrs.push_back(
3570             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3571                 .getReg(0));
3572         ++I;
3573       }
3574     }
3575   }
3576 }
3577 
3578 /// Convert from separate vaddr components to a single vector address register,
3579 /// and replace the remaining operands with $noreg.
3580 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3581                                      int DimIdx, int NumVAddrs) {
3582   const LLT S32 = LLT::scalar(32);
3583 
3584   SmallVector<Register, 8> AddrRegs;
3585   for (int I = 0; I != NumVAddrs; ++I) {
3586     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3587     if (SrcOp.isReg()) {
3588       AddrRegs.push_back(SrcOp.getReg());
3589       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3590     }
3591   }
3592 
3593   int NumAddrRegs = AddrRegs.size();
3594   if (NumAddrRegs != 1) {
3595     // Round up to 8 elements for v5-v7
3596     // FIXME: Missing intermediate sized register classes and instructions.
3597     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3598       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3599       auto Undef = B.buildUndef(S32);
3600       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3601       NumAddrRegs = RoundedNumRegs;
3602     }
3603 
3604     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3605     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3606   }
3607 
3608   for (int I = 1; I != NumVAddrs; ++I) {
3609     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3610     if (SrcOp.isReg())
3611       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3612   }
3613 }
3614 
3615 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3616 ///
3617 /// Depending on the subtarget, load/store with 16-bit element data need to be
3618 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3619 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3620 /// registers.
3621 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
3628 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3629     MachineInstr &MI, MachineIRBuilder &B,
3630     GISelChangeObserver &Observer,
3631     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3632   B.setInstr(MI);
3633 
3634   const int NumDefs = MI.getNumExplicitDefs();
3635   bool IsTFE = NumDefs == 2;
3636   // We are only processing the operands of d16 image operations on subtargets
3637   // that use the unpacked register layout, or need to repack the TFE result.
3638 
3639   // TODO: Do we need to guard against already legalized intrinsics?
3640   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3641     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3642 
3643   MachineRegisterInfo *MRI = B.getMRI();
3644   const LLT S32 = LLT::scalar(32);
3645   const LLT S16 = LLT::scalar(16);
3646   const LLT V2S16 = LLT::vector(2, 16);
3647 
3648   // Index of first address argument
3649   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3650 
  // Check for 16-bit addresses and pack them if so.
3652   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3653   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3654   const bool IsA16 = AddrTy == S16;
3655 
3656   int NumVAddrs, NumGradients;
3657   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3658   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3659     getDMaskIdx(BaseOpcode, NumDefs);
3660   unsigned DMask = 0;
3661 
3662   int DMaskLanes = 0;
3663   if (!BaseOpcode->Atomic) {
3664     DMask = MI.getOperand(DMaskIdx).getImm();
3665     if (BaseOpcode->Gather4) {
3666       DMaskLanes = 4;
3667     } else if (DMask != 0) {
3668       DMaskLanes = countPopulation(DMask);
3669     } else if (!IsTFE && !BaseOpcode->Store) {
3670       // If dmask is 0, this is a no-op load. This can be eliminated.
3671       B.buildUndef(MI.getOperand(0));
3672       MI.eraseFromParent();
3673       return true;
3674     }
3675   }
3676 
3677   Observer.changingInstr(MI);
3678   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3679 
3680   unsigned NewOpcode = NumDefs == 0 ?
3681     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3682 
3683   // Track that we legalized this
3684   MI.setDesc(B.getTII().get(NewOpcode));
3685 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3688   if (IsTFE && DMask == 0) {
3689     DMask = 0x1;
3690     DMaskLanes = 1;
3691     MI.getOperand(DMaskIdx).setImm(DMask);
3692   }
3693 
3694   if (BaseOpcode->Atomic) {
3695     Register VData0 = MI.getOperand(2).getReg();
3696     LLT Ty = MRI->getType(VData0);
3697 
3698     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3699     if (Ty.isVector())
3700       return false;
3701 
3702     if (BaseOpcode->AtomicX2) {
3703       Register VData1 = MI.getOperand(3).getReg();
3704       // The two values are packed in one register.
3705       LLT PackedTy = LLT::vector(2, Ty);
3706       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3707       MI.getOperand(2).setReg(Concat.getReg(0));
3708       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3709     }
3710   }
3711 
3712   int CorrectedNumVAddrs = NumVAddrs;
3713 
  // Optimize _L to _LZ when the LOD is zero.
3715   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3716         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3717     const ConstantFP *ConstantLod;
3718     const int LodIdx = AddrIdx + NumVAddrs - 1;
3719 
3720     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3721       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3722         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3723         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3724           LZMappingInfo->LZ, ImageDimIntr->Dim);
3725 
3726         // The starting indexes should remain in the same place.
3727         --NumVAddrs;
3728         --CorrectedNumVAddrs;
3729 
3730         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3731           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3732         MI.RemoveOperand(LodIdx);
3733       }
3734     }
3735   }
3736 
3737   // Optimize _mip away, when 'lod' is zero
3738   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3739     int64_t ConstantLod;
3740     const int LodIdx = AddrIdx + NumVAddrs - 1;
3741 
3742     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3743       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3746         MI.getOperand(LodIdx).ChangeToImmediate(0);
3747         --CorrectedNumVAddrs;
3748       }
3749     }
3750   }
3751 
3752   // If the register allocator cannot place the address registers contiguously
3753   // without introducing moves, then using the non-sequential address encoding
3754   // is always preferable, since it saves VALU instructions and is usually a
3755   // wash in terms of code size or even better.
3756   //
3757   // However, we currently have no way of hinting to the register allocator
3758   // that MIMG addresses should be placed contiguously when it is possible to
3759   // do so, so force non-NSA for the common 2-address case as a heuristic.
3760   //
3761   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3762   // allocation when possible.
3763   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3764 
3765   // Rewrite the addressing register layout before doing anything else.
3766   if (IsA16) {
3767     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3768     // should be introduced.
3769     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3770       return false;
3771 
3772     if (NumVAddrs > 1) {
3773       SmallVector<Register, 4> PackedRegs;
3774       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3775                                   NumGradients);
3776 
3777       if (!UseNSA && PackedRegs.size() > 1) {
3778         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3779         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3780         PackedRegs[0] = Concat.getReg(0);
3781         PackedRegs.resize(1);
3782       }
3783 
3784       const int NumPacked = PackedRegs.size();
3785       for (int I = 0; I != NumVAddrs; ++I) {
3786         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3787         if (!SrcOp.isReg()) {
3788           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3789           continue;
3790         }
3791 
3792         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3793 
3794         if (I < NumPacked)
3795           SrcOp.setReg(PackedRegs[I]);
3796         else
3797           SrcOp.setReg(AMDGPU::NoRegister);
3798       }
3799     }
3800   } else if (!UseNSA && NumVAddrs > 1) {
3801     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3802   }
3803 
3804 
3805   if (BaseOpcode->Store) { // No TFE for stores?
3806     // TODO: Handle dmask trim
3807     Register VData = MI.getOperand(1).getReg();
3808     LLT Ty = MRI->getType(VData);
3809     if (!Ty.isVector() || Ty.getElementType() != S16)
3810       return true;
3811 
3812     B.setInstr(MI);
3813 
3814     Register RepackedReg = handleD16VData(B, *MRI, VData);
3815     if (RepackedReg != VData) {
3816       MI.getOperand(1).setReg(RepackedReg);
3817     }
3818 
3819     return true;
3820   }
3821 
3822   Register DstReg = MI.getOperand(0).getReg();
3823   LLT Ty = MRI->getType(DstReg);
3824   const LLT EltTy = Ty.getScalarType();
3825   const bool IsD16 = Ty.getScalarType() == S16;
3826   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3827 
3828   // Confirm that the return type is large enough for the dmask specified
3829   if (NumElts < DMaskLanes)
3830     return false;
3831 
3832   if (NumElts > 4 || DMaskLanes > 4)
3833     return false;
3834 
3835   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3836   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3837 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3841   LLT RoundedTy;
3842 
  // S32 vector to cover all data, plus the TFE result element.
3844   LLT TFETy;
3845 
3846   // Register type to use for each loaded component. Will be S32 or V2S16.
3847   LLT RegTy;
3848 
3849   if (IsD16 && ST.hasUnpackedD16VMem()) {
3850     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3851     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3852     RegTy = S32;
3853   } else {
3854     unsigned EltSize = EltTy.getSizeInBits();
3855     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3856     unsigned RoundedSize = 32 * RoundedElts;
3857     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3858     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3859     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3860   }
3861 
3862   // The return type does not need adjustment.
3863   // TODO: Should we change s16 case to s32 or <2 x s16>?
3864   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3865     return true;
3866 
3867   Register Dst1Reg;
3868 
3869   // Insert after the instruction.
3870   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3871 
3872   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3873   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3874   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3875   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3876 
3877   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3878 
3879   MI.getOperand(0).setReg(NewResultReg);
3880 
  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
3885 
3886   if (IsTFE) {
3887     Dst1Reg = MI.getOperand(1).getReg();
3888     if (MRI->getType(Dst1Reg) != S32)
3889       return false;
3890 
3891     // TODO: Make sure the TFE operand bit is set.
3892     MI.RemoveOperand(1);
3893 
3894     // Handle the easy case that requires no repack instructions.
3895     if (Ty == S32) {
3896       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3897       return true;
3898     }
3899   }
3900 
3901   // Now figure out how to copy the new result register back into the old
3902   // result.
3903   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3904 
3905   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3906 
3907   if (ResultNumRegs == 1) {
3908     assert(!IsTFE);
3909     ResultRegs[0] = NewResultReg;
3910   } else {
3911     // We have to repack into a new vector of some kind.
3912     for (int I = 0; I != NumDataRegs; ++I)
3913       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3914     B.buildUnmerge(ResultRegs, NewResultReg);
3915 
3916     // Drop the final TFE element to get the data part. The TFE result is
3917     // directly written to the right place already.
3918     if (IsTFE)
3919       ResultRegs.resize(NumDataRegs);
3920   }
3921 
  // For an s16 scalar result, we form an s32 result and truncate, regardless
  // of packed vs. unpacked.
3924   if (IsD16 && !Ty.isVector()) {
3925     B.buildTrunc(DstReg, ResultRegs[0]);
3926     return true;
3927   }
3928 
3929   // Avoid a build/concat_vector of 1 entry.
3930   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3931     B.buildBitcast(DstReg, ResultRegs[0]);
3932     return true;
3933   }
3934 
3935   assert(Ty.isVector());
3936 
3937   if (IsD16) {
3938     // For packed D16 results with TFE enabled, all the data components are
3939     // S32. Cast back to the expected type.
3940     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
3943     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3944       for (Register &Reg : ResultRegs)
3945         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3946     } else if (ST.hasUnpackedD16VMem()) {
3947       for (Register &Reg : ResultRegs)
3948         Reg = B.buildTrunc(S16, Reg).getReg(0);
3949     }
3950   }
3951 
3952   auto padWithUndef = [&](LLT Ty, int NumElts) {
3953     if (NumElts == 0)
3954       return;
3955     Register Undef = B.buildUndef(Ty).getReg(0);
3956     for (int I = 0; I != NumElts; ++I)
3957       ResultRegs.push_back(Undef);
3958   };
3959 
3960   // Pad out any elements eliminated due to the dmask.
3961   LLT ResTy = MRI->getType(ResultRegs[0]);
3962   if (!ResTy.isVector()) {
3963     padWithUndef(ResTy, NumElts - ResultRegs.size());
3964     B.buildBuildVector(DstReg, ResultRegs);
3965     return true;
3966   }
3967 
3968   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3969   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3970 
3971   // Deal with the one annoying legal case.
3972   const LLT V3S16 = LLT::vector(3, 16);
3973   if (Ty == V3S16) {
3974     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3975     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3976     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3977     return true;
3978   }
3979 
3980   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3981   B.buildConcatVectors(DstReg, ResultRegs);
3982   return true;
3983 }
3984 
3985 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3986   MachineInstr &MI, MachineIRBuilder &B,
3987   GISelChangeObserver &Observer) const {
3988   Register Dst = MI.getOperand(0).getReg();
3989   LLT Ty = B.getMRI()->getType(Dst);
3990   unsigned Size = Ty.getSizeInBits();
3991   MachineFunction &MF = B.getMF();
3992 
3993   Observer.changingInstr(MI);
3994 
3995   // FIXME: We don't really need this intermediate instruction. The intrinsic
3996   // should be fixed to have a memory operand. Since it's readnone, we're not
3997   // allowed to add one.
3998   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3999   MI.RemoveOperand(1); // Remove intrinsic ID
4000 
4001   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4002   // TODO: Should this use datalayout alignment?
4003   const unsigned MemSize = (Size + 7) / 8;
4004   const Align MemAlign(4);
4005   MachineMemOperand *MMO = MF.getMachineMemOperand(
4006       MachinePointerInfo(),
4007       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4008           MachineMemOperand::MOInvariant,
4009       MemSize, MemAlign);
4010   MI.addMemOperand(MF, MMO);
4011 
4012   // There are no 96-bit result scalar loads, but widening to 128-bit should
4013   // always be legal. We may need to restore this to a 96-bit result if it turns
4014   // out this needs to be converted to a vector load during RegBankSelect.
4015   if (!isPowerOf2_32(Size)) {
4016     LegalizerHelper Helper(MF, *this, Observer, B);
4017     B.setInstr(MI);
4018 
4019     if (Ty.isVector())
4020       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4021     else
4022       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4023   }
4024 
4025   Observer.changedInstr(MI);
4026   return true;
4027 }
4028 
4029 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4030                                                 MachineRegisterInfo &MRI,
4031                                                 MachineIRBuilder &B) const {
4032   B.setInstr(MI);
4033 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4035   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4036       !ST.isTrapHandlerEnabled()) {
4037     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4038   } else {
4039     // Pass queue pointer to trap handler as input, and insert trap instruction
4040     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4041     const ArgDescriptor *Arg =
4042         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4043     if (!Arg)
4044       return false;
4045     MachineRegisterInfo &MRI = *B.getMRI();
4046     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4047     Register LiveIn = getLiveInRegister(
4048         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4049         /*InsertLiveInCopy=*/false);
4050     if (!loadInputValue(LiveIn, B, Arg))
4051       return false;
4052     B.buildCopy(SGPR01, LiveIn);
4053     B.buildInstr(AMDGPU::S_TRAP)
4054         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4055         .addReg(SGPR01, RegState::Implicit);
4056   }
4057 
4058   MI.eraseFromParent();
4059   return true;
4060 }
4061 
4062 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4063     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4064   B.setInstr(MI);
4065 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4068   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4069       !ST.isTrapHandlerEnabled()) {
4070     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4071                                      "debugtrap handler not supported",
4072                                      MI.getDebugLoc(), DS_Warning);
4073     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4074     Ctx.diagnose(NoTrap);
4075   } else {
4076     // Insert debug-trap instruction
4077     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4078   }
4079 
4080   MI.eraseFromParent();
4081   return true;
4082 }
4083 
4084 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4085                                             MachineIRBuilder &B,
4086                                             GISelChangeObserver &Observer) const {
4087   MachineRegisterInfo &MRI = *B.getMRI();
4088 
  // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
4090   auto IntrID = MI.getIntrinsicID();
4091   switch (IntrID) {
4092   case Intrinsic::amdgcn_if:
4093   case Intrinsic::amdgcn_else: {
4094     MachineInstr *Br = nullptr;
4095     MachineBasicBlock *UncondBrTarget = nullptr;
4096     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4097       const SIRegisterInfo *TRI
4098         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4099 
4100       B.setInstr(*BrCond);
4101       Register Def = MI.getOperand(1).getReg();
4102       Register Use = MI.getOperand(3).getReg();
4103 
4104       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
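      // Rewrite the branch: the SI_IF/SI_ELSE pseudo jumps to the former
      // fallthrough/unconditional successor, while the explicit G_BR (rebuilt
      // if it was a fallthrough) now targets the former conditional successor.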
4105       if (IntrID == Intrinsic::amdgcn_if) {
4106         B.buildInstr(AMDGPU::SI_IF)
4107           .addDef(Def)
4108           .addUse(Use)
4109           .addMBB(UncondBrTarget);
4110       } else {
4111         B.buildInstr(AMDGPU::SI_ELSE)
4112           .addDef(Def)
4113           .addUse(Use)
4114           .addMBB(UncondBrTarget)
4115           .addImm(0);
4116       }
4117 
4118       if (Br) {
4119         Br->getOperand(0).setMBB(CondBrTarget);
4120       } else {
4121         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4122         // since we're swapping branch targets it needs to be reinserted.
4123         // FIXME: IRTranslator should probably not do this
4124         B.buildBr(*CondBrTarget);
4125       }
4126 
4127       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4128       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4129       MI.eraseFromParent();
4130       BrCond->eraseFromParent();
4131       return true;
4132     }
4133 
4134     return false;
4135   }
4136   case Intrinsic::amdgcn_loop: {
4137     MachineInstr *Br = nullptr;
4138     MachineBasicBlock *UncondBrTarget = nullptr;
4139     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4140       const SIRegisterInfo *TRI
4141         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4142 
4143       B.setInstr(*BrCond);
4144 
4145       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4146       Register Reg = MI.getOperand(2).getReg();
4147       B.buildInstr(AMDGPU::SI_LOOP)
4148         .addUse(Reg)
4149         .addMBB(UncondBrTarget);
4150 
4151       if (Br)
4152         Br->getOperand(0).setMBB(CondBrTarget);
4153       else
4154         B.buildBr(*CondBrTarget);
4155 
4156       MI.eraseFromParent();
4157       BrCond->eraseFromParent();
4158       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4159       return true;
4160     }
4161 
4162     return false;
4163   }
4164   case Intrinsic::amdgcn_kernarg_segment_ptr:
4165     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4166       B.setInstr(MI);
4167       // This only makes sense to call in a kernel, so just lower to null.
4168       B.buildConstant(MI.getOperand(0).getReg(), 0);
4169       MI.eraseFromParent();
4170       return true;
4171     }
4172 
4173     return legalizePreloadedArgIntrin(
4174       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4175   case Intrinsic::amdgcn_implicitarg_ptr:
4176     return legalizeImplicitArgPtr(MI, MRI, B);
4177   case Intrinsic::amdgcn_workitem_id_x:
4178     return legalizePreloadedArgIntrin(MI, MRI, B,
4179                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4180   case Intrinsic::amdgcn_workitem_id_y:
4181     return legalizePreloadedArgIntrin(MI, MRI, B,
4182                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4183   case Intrinsic::amdgcn_workitem_id_z:
4184     return legalizePreloadedArgIntrin(MI, MRI, B,
4185                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4186   case Intrinsic::amdgcn_workgroup_id_x:
4187     return legalizePreloadedArgIntrin(MI, MRI, B,
4188                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4189   case Intrinsic::amdgcn_workgroup_id_y:
4190     return legalizePreloadedArgIntrin(MI, MRI, B,
4191                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4192   case Intrinsic::amdgcn_workgroup_id_z:
4193     return legalizePreloadedArgIntrin(MI, MRI, B,
4194                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4195   case Intrinsic::amdgcn_dispatch_ptr:
4196     return legalizePreloadedArgIntrin(MI, MRI, B,
4197                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4198   case Intrinsic::amdgcn_queue_ptr:
4199     return legalizePreloadedArgIntrin(MI, MRI, B,
4200                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4201   case Intrinsic::amdgcn_implicit_buffer_ptr:
4202     return legalizePreloadedArgIntrin(
4203       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4204   case Intrinsic::amdgcn_dispatch_id:
4205     return legalizePreloadedArgIntrin(MI, MRI, B,
4206                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4207   case Intrinsic::amdgcn_fdiv_fast:
4208     return legalizeFDIVFastIntrin(MI, MRI, B);
4209   case Intrinsic::amdgcn_is_shared:
4210     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4211   case Intrinsic::amdgcn_is_private:
4212     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4213   case Intrinsic::amdgcn_wavefrontsize: {
4214     B.setInstr(MI);
4215     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4216     MI.eraseFromParent();
4217     return true;
4218   }
4219   case Intrinsic::amdgcn_s_buffer_load:
4220     return legalizeSBufferLoad(MI, B, Observer);
4221   case Intrinsic::amdgcn_raw_buffer_store:
4222   case Intrinsic::amdgcn_struct_buffer_store:
4223     return legalizeBufferStore(MI, MRI, B, false, false);
4224   case Intrinsic::amdgcn_raw_buffer_store_format:
4225   case Intrinsic::amdgcn_struct_buffer_store_format:
4226     return legalizeBufferStore(MI, MRI, B, false, true);
4227   case Intrinsic::amdgcn_raw_tbuffer_store:
4228   case Intrinsic::amdgcn_struct_tbuffer_store:
4229     return legalizeBufferStore(MI, MRI, B, true, true);
4230   case Intrinsic::amdgcn_raw_buffer_load:
4231   case Intrinsic::amdgcn_struct_buffer_load:
4232     return legalizeBufferLoad(MI, MRI, B, false, false);
4233   case Intrinsic::amdgcn_raw_buffer_load_format:
4234   case Intrinsic::amdgcn_struct_buffer_load_format:
4235     return legalizeBufferLoad(MI, MRI, B, true, false);
4236   case Intrinsic::amdgcn_raw_tbuffer_load:
4237   case Intrinsic::amdgcn_struct_tbuffer_load:
4238     return legalizeBufferLoad(MI, MRI, B, true, true);
4239   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4240   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4241   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4242   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4243   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4244   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4245   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4246   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4247   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4248   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4249   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4250   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4251   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4252   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4253   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4254   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4255   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4256   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4257   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4258   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4259   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4260   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4261   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4262   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4263   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4264   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4265     return legalizeBufferAtomic(MI, B, IntrID);
4266   case Intrinsic::amdgcn_atomic_inc:
4267     return legalizeAtomicIncDec(MI, B, true);
4268   case Intrinsic::amdgcn_atomic_dec:
4269     return legalizeAtomicIncDec(MI, B, false);
4270   case Intrinsic::trap:
4271     return legalizeTrapIntrinsic(MI, MRI, B);
4272   case Intrinsic::debugtrap:
4273     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4274   default: {
4275     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4276             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4277       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4278     return true;
4279   }
4280   }
4281 
4282   return true;
4283 }
4284