1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two.
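// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.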
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
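// e.g. s24 -> s32, s48 -> s64, s65 -> s128.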
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
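// Match odd-element-count vectors of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.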
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     return Ty.getSizeInBits() % 32 == 0;
77   };
78 }
79 
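// Match vectors with 16-bit elements and more than two elements,
// e.g. <3 x s16> or <4 x s16>.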
80 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getScalarType();
84     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
85   };
86 }
87 
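// Pad the vector type with one extra element, e.g. <3 x s16> -> <4 x s16>.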
88 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
93   };
94 }
95 
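// Split a wide vector into pieces of at most 64 bits each,
// e.g. <6 x s32> (192 bits) -> <2 x s32>.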
96 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     unsigned Size = Ty.getSizeInBits();
101     unsigned Pieces = (Size + 63) / 64;
102     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
103     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
104   };
105 }
106 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
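// e.g. <3 x s8> (24 bits) is padded to <4 x s8> (32 bits).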
109 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
110   return [=](const LegalityQuery &Query) {
111     const LLT Ty = Query.Types[TypeIdx];
112 
113     const LLT EltTy = Ty.getElementType();
114     const int Size = Ty.getSizeInBits();
115     const int EltSize = EltTy.getSizeInBits();
116     const int NextMul32 = (Size + 31) / 32;
117 
118     assert(EltSize < 32);
119 
120     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
121     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
122   };
123 }
124 
125 static LLT getBitcastRegisterType(const LLT Ty) {
126   const unsigned Size = Ty.getSizeInBits();
127 
129   if (Size <= 32) {
130     // <2 x s8> -> s16
131     // <4 x s8> -> s32
132     return LLT::scalar(Size);
133   }
134 
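  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>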
135   return LLT::scalarOrVector(Size / 32, 32);
136 }
137 
138 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
142   };
143 }
144 
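// Bitcast to a vector of 32-bit elements; the total size must already be a
// multiple of 32 bits, e.g. s96 -> <3 x s32>.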
145 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     unsigned Size = Ty.getSizeInBits();
149     assert(Size % 32 == 0);
150     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
151   };
152 }
153 
154 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
155   return [=](const LegalityQuery &Query) {
156     const LLT QueryTy = Query.Types[TypeIdx];
157     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
158   };
159 }
160 
161 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
165   };
166 }
167 
168 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
172   };
173 }
174 
175 static bool isRegisterSize(unsigned Size) {
176   return Size % 32 == 0 && Size <= MaxRegisterSize;
177 }
178 
179 static bool isRegisterVectorElementType(LLT EltTy) {
180   const int EltSize = EltTy.getSizeInBits();
181   return EltSize == 16 || EltSize % 32 == 0;
182 }
183 
184 static bool isRegisterVectorType(LLT Ty) {
185   const int EltSize = Ty.getElementType().getSizeInBits();
186   return EltSize == 32 || EltSize == 64 ||
187          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
188          EltSize == 128 || EltSize == 256;
189 }
190 
191 static bool isRegisterType(LLT Ty) {
192   if (!isRegisterSize(Ty.getSizeInBits()))
193     return false;
194 
195   if (Ty.isVector())
196     return isRegisterVectorType(Ty);
197 
198   return true;
199 }
200 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
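// e.g. s32, s96, v2s16, v4s16 and v8s32 qualify; v3s16 (48 bits) does not.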
203 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
204   return [=](const LegalityQuery &Query) {
205     return isRegisterType(Query.Types[TypeIdx]);
206   };
207 }
208 
209 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     const LLT QueryTy = Query.Types[TypeIdx];
212     if (!QueryTy.isVector())
213       return false;
214     const LLT EltTy = QueryTy.getElementType();
215     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
216   };
217 }
218 
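// Match truncating stores of a scalar wider than 32 bits, e.g. storing only
// the low 16 bits of an s64 value.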
219 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT Ty = Query.Types[TypeIdx];
222     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
223            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
224   };
225 }
226 
227 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
228 // handle some operations by just promoting the register during
229 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
230 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
231                                     bool IsLoad) {
232   switch (AS) {
233   case AMDGPUAS::PRIVATE_ADDRESS:
234     // FIXME: Private element size.
235     return 32;
236   case AMDGPUAS::LOCAL_ADDRESS:
237     return ST.useDS128() ? 128 : 64;
238   case AMDGPUAS::GLOBAL_ADDRESS:
239   case AMDGPUAS::CONSTANT_ADDRESS:
240   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
241     // Treat constant and global as identical. SMRD loads are sometimes usable for
242     // global loads (ideally constant address space should be eliminated)
243     // depending on the context. Legality cannot be context dependent, but
244     // RegBankSelect can split the load as necessary depending on the pointer
245     // register bank/uniformity and if the memory is invariant or not written in a
246     // kernel.
247     return IsLoad ? 512 : 128;
248   default:
249     // Flat addresses may contextually need to be split to 32-bit parts if they
250     // may alias scratch depending on the subtarget.
251     return 128;
252   }
253 }
254 
255 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
256                                  const LegalityQuery &Query,
257                                  unsigned Opcode) {
258   const LLT Ty = Query.Types[0];
259 
260   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
261   const bool IsLoad = Opcode != AMDGPU::G_STORE;
262 
263   unsigned RegSize = Ty.getSizeInBits();
264   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
265   unsigned Align = Query.MMODescrs[0].AlignInBits;
266   unsigned AS = Query.Types[1].getAddressSpace();
267 
268   // All of these need to be custom lowered to cast the pointer operand.
269   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
270     return false;
271 
272   // TODO: We should be able to widen loads if the alignment is high enough, but
273   // we also need to modify the memory access size.
274 #if 0
275   // Accept widening loads based on alignment.
276   if (IsLoad && MemSize < Size)
277     MemSize = std::max(MemSize, Align);
278 #endif
279 
280   // Only 1-byte and 2-byte to 32-bit extloads are valid.
281   if (MemSize != RegSize && RegSize != 32)
282     return false;
283 
284   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
285     return false;
286 
287   switch (MemSize) {
288   case 8:
289   case 16:
290   case 32:
291   case 64:
292   case 128:
293     break;
294   case 96:
295     if (!ST.hasDwordx3LoadStores())
296       return false;
297     break;
298   case 256:
299   case 512:
300     // These may contextually need to be broken down.
301     break;
302   default:
303     return false;
304   }
305 
306   assert(RegSize >= MemSize);
307 
308   if (Align < MemSize) {
309     const SITargetLowering *TLI = ST.getTargetLowering();
310     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
311       return false;
312   }
313 
314   return true;
315 }
316 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
321 static bool loadStoreBitcastWorkaround(const LLT Ty) {
322   if (EnableNewLegality)
323     return false;
324 
325   const unsigned Size = Ty.getSizeInBits();
326   if (Size <= 64)
327     return false;
328   if (!Ty.isVector())
329     return true;
330   unsigned EltSize = Ty.getElementType().getSizeInBits();
331   return EltSize != 32 && EltSize != 64;
332 }
333 
334 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
335                              unsigned Opcode) {
336   const LLT Ty = Query.Types[0];
337   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
338          !loadStoreBitcastWorkaround(Ty);
339 }
340 
341 /// Return true if a load or store of the type should be lowered with a bitcast
342 /// to a different type.
343 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
344                                        const unsigned MemSizeInBits) {
345   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
348 
349   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
350     return true;
351   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
352          !isRegisterVectorElementType(Ty.getElementType());
353 }
354 
355 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
356                                          const GCNTargetMachine &TM)
357   :  ST(ST_) {
358   using namespace TargetOpcode;
359 
360   auto GetAddrSpacePtr = [&TM](unsigned AS) {
361     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
362   };
363 
364   const LLT S1 = LLT::scalar(1);
365   const LLT S16 = LLT::scalar(16);
366   const LLT S32 = LLT::scalar(32);
367   const LLT S64 = LLT::scalar(64);
368   const LLT S128 = LLT::scalar(128);
369   const LLT S256 = LLT::scalar(256);
370   const LLT S512 = LLT::scalar(512);
371   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
372 
373   const LLT V2S16 = LLT::vector(2, 16);
374   const LLT V4S16 = LLT::vector(4, 16);
375 
376   const LLT V2S32 = LLT::vector(2, 32);
377   const LLT V3S32 = LLT::vector(3, 32);
378   const LLT V4S32 = LLT::vector(4, 32);
379   const LLT V5S32 = LLT::vector(5, 32);
380   const LLT V6S32 = LLT::vector(6, 32);
381   const LLT V7S32 = LLT::vector(7, 32);
382   const LLT V8S32 = LLT::vector(8, 32);
383   const LLT V9S32 = LLT::vector(9, 32);
384   const LLT V10S32 = LLT::vector(10, 32);
385   const LLT V11S32 = LLT::vector(11, 32);
386   const LLT V12S32 = LLT::vector(12, 32);
387   const LLT V13S32 = LLT::vector(13, 32);
388   const LLT V14S32 = LLT::vector(14, 32);
389   const LLT V15S32 = LLT::vector(15, 32);
390   const LLT V16S32 = LLT::vector(16, 32);
391   const LLT V32S32 = LLT::vector(32, 32);
392 
393   const LLT V2S64 = LLT::vector(2, 64);
394   const LLT V3S64 = LLT::vector(3, 64);
395   const LLT V4S64 = LLT::vector(4, 64);
396   const LLT V5S64 = LLT::vector(5, 64);
397   const LLT V6S64 = LLT::vector(6, 64);
398   const LLT V7S64 = LLT::vector(7, 64);
399   const LLT V8S64 = LLT::vector(8, 64);
400   const LLT V16S64 = LLT::vector(16, 64);
401 
402   std::initializer_list<LLT> AllS32Vectors =
403     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
404      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
405   std::initializer_list<LLT> AllS64Vectors =
406     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
407 
408   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
409   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
410   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
411   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
412   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
413   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
414   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
415 
416   const LLT CodePtr = FlatPtr;
417 
418   const std::initializer_list<LLT> AddrSpaces64 = {
419     GlobalPtr, ConstantPtr, FlatPtr
420   };
421 
422   const std::initializer_list<LLT> AddrSpaces32 = {
423     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
424   };
425 
426   const std::initializer_list<LLT> FPTypesBase = {
427     S32, S64
428   };
429 
430   const std::initializer_list<LLT> FPTypes16 = {
431     S32, S64, S16
432   };
433 
434   const std::initializer_list<LLT> FPTypesPK16 = {
435     S32, S64, S16, V2S16
436   };
437 
438   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
439 
440   setAction({G_BRCOND, S1}, Legal); // VCC branches
441   setAction({G_BRCOND, S32}, Legal); // SCC branches
442 
443   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
444   // elements for v3s16
445   getActionDefinitionsBuilder(G_PHI)
446     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
447     .legalFor(AllS32Vectors)
448     .legalFor(AllS64Vectors)
449     .legalFor(AddrSpaces64)
450     .legalFor(AddrSpaces32)
451     .legalIf(isPointer(0))
452     .clampScalar(0, S32, S256)
453     .widenScalarToNextPow2(0, 32)
454     .clampMaxNumElements(0, S32, 16)
455     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
456     .scalarize(0);
457 
458   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
459     // Full set of gfx9 features.
460     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
461       .legalFor({S32, S16, V2S16})
462       .clampScalar(0, S16, S32)
463       .clampMaxNumElements(0, S16, 2)
464       .scalarize(0)
465       .widenScalarToNextPow2(0, 32);
466 
467     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
468       .legalFor({S32, S16, V2S16}) // Clamp modifier
469       .minScalar(0, S16)
470       .clampMaxNumElements(0, S16, 2)
471       .scalarize(0)
472       .widenScalarToNextPow2(0, 32)
473       .lower();
474   } else if (ST.has16BitInsts()) {
475     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
476       .legalFor({S32, S16})
477       .clampScalar(0, S16, S32)
478       .scalarize(0)
479       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
480 
481     // Technically the saturating operations require clamp bit support, but this
482     // was introduced at the same time as 16-bit operations.
483     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
484       .legalFor({S32, S16}) // Clamp modifier
485       .minScalar(0, S16)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 16)
488       .lower();
489 
490     // We're just lowering this, but it helps get a better result to try to
491     // coerce to the desired type first.
492     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
493       .minScalar(0, S16)
494       .scalarize(0)
495       .lower();
496   } else {
497     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
498       .legalFor({S32})
499       .clampScalar(0, S32, S32)
500       .scalarize(0);
501 
502     if (ST.hasIntClamp()) {
503       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
504         .legalFor({S32}) // Clamp modifier.
505         .scalarize(0)
506         .minScalarOrElt(0, S32)
507         .lower();
508     } else {
509       // Clamp bit support was added in VI, along with 16-bit operations.
510       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
511         .minScalar(0, S32)
512         .scalarize(0)
513         .lower();
514     }
515 
516     // FIXME: DAG expansion gets better results. The widening uses the smaller
517     // range values and goes for the min/max lowering directly.
518     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
519       .minScalar(0, S32)
520       .scalarize(0)
521       .lower();
522   }
523 
524   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
525     .customFor({S32, S64})
526     .clampScalar(0, S32, S64)
527     .widenScalarToNextPow2(0, 32)
528     .scalarize(0);
529 
530   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
531     .legalFor({S32})
532     .clampScalar(0, S32, S32)
533     .scalarize(0);
534 
535   // Report legal for any types we can handle anywhere. For the cases only legal
536   // on the SALU, RegBankSelect will be able to re-legalize.
537   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
538     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
539     .clampScalar(0, S32, S64)
540     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
542     .widenScalarToNextPow2(0)
543     .scalarize(0);
544 
545   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
546                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
547     .legalFor({{S32, S1}, {S32, S32}})
548     .minScalar(0, S32)
549     // TODO: .scalarize(0)
550     .lower();
551 
552   getActionDefinitionsBuilder(G_BITCAST)
553     // Don't worry about the size constraint.
554     .legalIf(all(isRegisterType(0), isRegisterType(1)))
555     .lower();
556 
557 
558   getActionDefinitionsBuilder(G_CONSTANT)
559     .legalFor({S1, S32, S64, S16, GlobalPtr,
560                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
561     .legalIf(isPointer(0))
562     .clampScalar(0, S32, S64)
563     .widenScalarToNextPow2(0);
564 
565   getActionDefinitionsBuilder(G_FCONSTANT)
566     .legalFor({S32, S64, S16})
567     .clampScalar(0, S16, S64);
568 
569   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
570       .legalIf(isRegisterType(0))
571       // s1 and s16 are special cases because they have legal operations on
572       // them, but don't really occupy registers in the normal way.
573       .legalFor({S1, S16})
574       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
575       .clampScalarOrElt(0, S32, MaxScalar)
576       .widenScalarToNextPow2(0, 32)
577       .clampMaxNumElements(0, S32, 16);
578 
579   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
580 
581   // If the amount is divergent, we have to do a wave reduction to get the
582   // maximum value, so this is expanded during RegBankSelect.
583   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
584     .legalFor({{PrivatePtr, S32}});
585 
586   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
587     .customIf(typeIsNot(0, PrivatePtr));
588 
589   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
590 
591   auto &FPOpActions = getActionDefinitionsBuilder(
592     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
593     .legalFor({S32, S64});
594   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
595     .customFor({S32, S64});
596   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
597     .customFor({S32, S64});
598 
599   if (ST.has16BitInsts()) {
600     if (ST.hasVOP3PInsts())
601       FPOpActions.legalFor({S16, V2S16});
602     else
603       FPOpActions.legalFor({S16});
604 
605     TrigActions.customFor({S16});
606     FDIVActions.customFor({S16});
607   }
608 
609   auto &MinNumMaxNum = getActionDefinitionsBuilder({
610       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
611 
612   if (ST.hasVOP3PInsts()) {
613     MinNumMaxNum.customFor(FPTypesPK16)
614       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
615       .clampMaxNumElements(0, S16, 2)
616       .clampScalar(0, S16, S64)
617       .scalarize(0);
618   } else if (ST.has16BitInsts()) {
619     MinNumMaxNum.customFor(FPTypes16)
620       .clampScalar(0, S16, S64)
621       .scalarize(0);
622   } else {
623     MinNumMaxNum.customFor(FPTypesBase)
624       .clampScalar(0, S32, S64)
625       .scalarize(0);
626   }
627 
628   if (ST.hasVOP3PInsts())
629     FPOpActions.clampMaxNumElements(0, S16, 2);
630 
631   FPOpActions
632     .scalarize(0)
633     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
634 
635   TrigActions
636     .scalarize(0)
637     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
638 
639   FDIVActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   getActionDefinitionsBuilder({G_FNEG, G_FABS})
644     .legalFor(FPTypesPK16)
645     .clampMaxNumElements(0, S16, 2)
646     .scalarize(0)
647     .clampScalar(0, S16, S64);
648 
649   if (ST.has16BitInsts()) {
650     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
651       .legalFor({S32, S64, S16})
652       .scalarize(0)
653       .clampScalar(0, S16, S64);
654   } else {
655     getActionDefinitionsBuilder(G_FSQRT)
656       .legalFor({S32, S64})
657       .scalarize(0)
658       .clampScalar(0, S32, S64);
659 
660     if (ST.hasFractBug()) {
661       getActionDefinitionsBuilder(G_FFLOOR)
662         .customFor({S64})
663         .legalFor({S32, S64})
664         .scalarize(0)
665         .clampScalar(0, S32, S64);
666     } else {
667       getActionDefinitionsBuilder(G_FFLOOR)
668         .legalFor({S32, S64})
669         .scalarize(0)
670         .clampScalar(0, S32, S64);
671     }
672   }
673 
674   getActionDefinitionsBuilder(G_FPTRUNC)
675     .legalFor({{S32, S64}, {S16, S32}})
676     .scalarize(0)
677     .lower();
678 
679   getActionDefinitionsBuilder(G_FPEXT)
680     .legalFor({{S64, S32}, {S32, S16}})
681     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
682     .scalarize(0);
683 
684   getActionDefinitionsBuilder(G_FSUB)
685       // Use actual fsub instruction
686       .legalFor({S32})
687       // Must use fadd + fneg
688       .lowerFor({S64, S16, V2S16})
689       .scalarize(0)
690       .clampScalar(0, S32, S64);
691 
692   // Whether this is legal depends on the floating point mode for the function.
693   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
694   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
695     FMad.customFor({S32, S16});
696   else if (ST.hasMadMacF32Insts())
697     FMad.customFor({S32});
698   else if (ST.hasMadF16())
699     FMad.customFor({S16});
700   FMad.scalarize(0)
701       .lower();
702 
703   // TODO: Do we need to clamp maximum bitwidth?
704   getActionDefinitionsBuilder(G_TRUNC)
705     .legalIf(isScalar(0))
706     .legalFor({{V2S16, V2S32}})
707     .clampMaxNumElements(0, S16, 2)
708     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
709     // situations (like an invalid implicit use), we don't want to infinite loop
710     // in the legalizer.
711     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
712     .alwaysLegal();
713 
714   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
715     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
716                {S32, S1}, {S64, S1}, {S16, S1}})
717     .scalarize(0)
718     .clampScalar(0, S32, S64)
719     .widenScalarToNextPow2(1, 32);
720 
721   // TODO: Split s1->s64 during regbankselect for VALU.
722   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
723     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
724     .lowerFor({{S32, S64}})
725     .lowerIf(typeIs(1, S1))
726     .customFor({{S64, S64}});
727   if (ST.has16BitInsts())
728     IToFP.legalFor({{S16, S16}});
729   IToFP.clampScalar(1, S32, S64)
730        .minScalar(0, S32)
731        .scalarize(0)
732        .widenScalarToNextPow2(1);
733 
734   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
735     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
736     .customFor({{S64, S64}})
737     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
738   if (ST.has16BitInsts())
739     FPToI.legalFor({{S16, S16}});
740   else
741     FPToI.minScalar(1, S32);
742 
743   FPToI.minScalar(0, S32)
744        .scalarize(0)
745        .lower();
746 
747   // Lower roundeven into G_FRINT
748   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
749     .scalarize(0)
750     .lower();
751 
752   if (ST.has16BitInsts()) {
753     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
754       .legalFor({S16, S32, S64})
755       .clampScalar(0, S16, S64)
756       .scalarize(0);
757   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
758     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
759       .legalFor({S32, S64})
760       .clampScalar(0, S32, S64)
761       .scalarize(0);
762   } else {
763     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
764       .legalFor({S32})
765       .customFor({S64})
766       .clampScalar(0, S32, S64)
767       .scalarize(0);
768   }
769 
770   getActionDefinitionsBuilder(G_PTR_ADD)
771     .legalIf(all(isPointer(0), sameSize(0, 1)))
772     .scalarize(0)
773     .scalarSameSizeAs(1, 0);
774 
775   getActionDefinitionsBuilder(G_PTRMASK)
776     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
777     .scalarSameSizeAs(1, 0)
778     .scalarize(0);
779 
780   auto &CmpBuilder =
781     getActionDefinitionsBuilder(G_ICMP)
782     // The compare output type differs based on the register bank of the output,
783     // so make both s1 and s32 legal.
784     //
785     // Scalar compares producing output in scc will be promoted to s32, as that
786     // is the allocatable register type that will be needed for the copy from
787     // scc. This will be promoted during RegBankSelect, and we assume something
788     // before that won't try to use s32 result types.
789     //
790     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
791     // bank.
792     .legalForCartesianProduct(
793       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
794     .legalForCartesianProduct(
795       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
796   if (ST.has16BitInsts()) {
797     CmpBuilder.legalFor({{S1, S16}});
798   }
799 
800   CmpBuilder
801     .widenScalarToNextPow2(1)
802     .clampScalar(1, S32, S64)
803     .scalarize(0)
804     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
805 
806   getActionDefinitionsBuilder(G_FCMP)
807     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
808     .widenScalarToNextPow2(1)
809     .clampScalar(1, S32, S64)
810     .scalarize(0);
811 
812   // FIXME: fpow has a selection pattern that should move to custom lowering.
813   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
814   if (ST.has16BitInsts())
815     Exp2Ops.legalFor({S32, S16});
816   else
817     Exp2Ops.legalFor({S32});
818   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
819   Exp2Ops.scalarize(0);
820 
821   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
822   if (ST.has16BitInsts())
823     ExpOps.customFor({{S32}, {S16}});
824   else
825     ExpOps.customFor({S32});
826   ExpOps.clampScalar(0, MinScalarFPTy, S32)
827         .scalarize(0);
828 
829   getActionDefinitionsBuilder(G_FPOWI)
830     .clampScalar(0, MinScalarFPTy, S32)
831     .lower();
832 
833   // The 64-bit versions produce 32-bit results, but only on the SALU.
834   getActionDefinitionsBuilder(G_CTPOP)
835     .legalFor({{S32, S32}, {S32, S64}})
836     .clampScalar(0, S32, S32)
837     .clampScalar(1, S32, S64)
838     .scalarize(0)
839     .widenScalarToNextPow2(0, 32)
840     .widenScalarToNextPow2(1, 32);
841 
842   // The hardware instructions return a different result on 0 than the generic
843   // instructions expect. The hardware produces -1, but these produce the
844   // bitwidth.
845   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
846     .scalarize(0)
847     .clampScalar(0, S32, S32)
848     .clampScalar(1, S32, S64)
849     .widenScalarToNextPow2(0, 32)
850     .widenScalarToNextPow2(1, 32)
851     .lower();
852 
853   // The 64-bit versions produce 32-bit results, but only on the SALU.
854   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
855     .legalFor({{S32, S32}, {S32, S64}})
856     .clampScalar(0, S32, S32)
857     .clampScalar(1, S32, S64)
858     .scalarize(0)
859     .widenScalarToNextPow2(0, 32)
860     .widenScalarToNextPow2(1, 32);
861 
862   getActionDefinitionsBuilder(G_BITREVERSE)
863     .legalFor({S32})
864     .clampScalar(0, S32, S32)
865     .scalarize(0);
866 
867   if (ST.has16BitInsts()) {
868     getActionDefinitionsBuilder(G_BSWAP)
869       .legalFor({S16, S32, V2S16})
870       .clampMaxNumElements(0, S16, 2)
871       // FIXME: Fixing non-power-of-2 before clamp is workaround for
872       // narrowScalar limitation.
873       .widenScalarToNextPow2(0)
874       .clampScalar(0, S16, S32)
875       .scalarize(0);
876 
877     if (ST.hasVOP3PInsts()) {
878       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
879         .legalFor({S32, S16, V2S16})
880         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
881         .clampMaxNumElements(0, S16, 2)
882         .minScalar(0, S16)
883         .widenScalarToNextPow2(0)
884         .scalarize(0)
885         .lower();
886     } else {
887       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
888         .legalFor({S32, S16})
889         .widenScalarToNextPow2(0)
890         .minScalar(0, S16)
891         .scalarize(0)
892         .lower();
893     }
894   } else {
895     // TODO: Should have same legality without v_perm_b32
896     getActionDefinitionsBuilder(G_BSWAP)
897       .legalFor({S32})
898       .lowerIf(scalarNarrowerThan(0, 32))
899       // FIXME: Fixing non-power-of-2 before clamp is workaround for
900       // narrowScalar limitation.
901       .widenScalarToNextPow2(0)
902       .maxScalar(0, S32)
903       .scalarize(0)
904       .lower();
905 
906     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
907       .legalFor({S32})
908       .minScalar(0, S32)
909       .widenScalarToNextPow2(0)
910       .scalarize(0)
911       .lower();
912   }
913 
914   getActionDefinitionsBuilder(G_INTTOPTR)
915     // List the common cases
916     .legalForCartesianProduct(AddrSpaces64, {S64})
917     .legalForCartesianProduct(AddrSpaces32, {S32})
918     .scalarize(0)
919     // Accept any address space as long as the size matches
920     .legalIf(sameSize(0, 1))
921     .widenScalarIf(smallerThan(1, 0),
922       [](const LegalityQuery &Query) {
923         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
924       })
925     .narrowScalarIf(largerThan(1, 0),
926       [](const LegalityQuery &Query) {
927         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
928       });
929 
930   getActionDefinitionsBuilder(G_PTRTOINT)
931     // List the common cases
932     .legalForCartesianProduct(AddrSpaces64, {S64})
933     .legalForCartesianProduct(AddrSpaces32, {S32})
934     .scalarize(0)
935     // Accept any address space as long as the size matches
936     .legalIf(sameSize(0, 1))
937     .widenScalarIf(smallerThan(0, 1),
938       [](const LegalityQuery &Query) {
939         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
940       })
941     .narrowScalarIf(
942       largerThan(0, 1),
943       [](const LegalityQuery &Query) {
944         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
945       });
946 
947   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
948     .scalarize(0)
949     .custom();
950 
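  // Decide whether a load/store must be split: vector extloads, accesses
  // wider than the address-space limit, sizes that don't map to a supported
  // number of 32-bit registers, and under-aligned accesses the target can't
  // handle.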
951   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
952                                     bool IsLoad) -> bool {
953     const LLT DstTy = Query.Types[0];
954 
955     // Split vector extloads.
956     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
957     unsigned Align = Query.MMODescrs[0].AlignInBits;
958 
959     if (MemSize < DstTy.getSizeInBits())
960       MemSize = std::max(MemSize, Align);
961 
962     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
963       return true;
964 
965     const LLT PtrTy = Query.Types[1];
966     unsigned AS = PtrTy.getAddressSpace();
967     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
968       return true;
969 
970     // Catch weird sized loads that don't evenly divide into the access sizes
971     // TODO: May be able to widen depending on alignment etc.
972     unsigned NumRegs = (MemSize + 31) / 32;
973     if (NumRegs == 3) {
974       if (!ST.hasDwordx3LoadStores())
975         return true;
976     } else {
977       // If the alignment allows, these should have been widened.
978       if (!isPowerOf2_32(NumRegs))
979         return true;
980     }
981 
982     if (Align < MemSize) {
983       const SITargetLowering *TLI = ST.getTargetLowering();
984       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
985     }
986 
987     return false;
988   };
989 
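  // Widen odd-sized load results when the known alignment covers the size
  // rounded up to the next power of 2, e.g. an s96 load aligned to 128 bits
  // may be widened to s128 (unless 96-bit dwordx3 loads are available).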
990   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
991                                          unsigned Opc) -> bool {
992     unsigned Size = Query.Types[0].getSizeInBits();
993     if (isPowerOf2_32(Size))
994       return false;
995 
996     if (Size == 96 && ST.hasDwordx3LoadStores())
997       return false;
998 
999     unsigned AddrSpace = Query.Types[1].getAddressSpace();
1000     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
1001       return false;
1002 
1003     unsigned Align = Query.MMODescrs[0].AlignInBits;
1004     unsigned RoundedSize = NextPowerOf2(Size);
1005     return (Align >= RoundedSize);
1006   };
1007 
1008   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1009   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1010   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1011 
1012   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1013   // LDS
1014   // TODO: Unsupported flat for SI.
1015 
1016   for (unsigned Op : {G_LOAD, G_STORE}) {
1017     const bool IsStore = Op == G_STORE;
1018 
1019     auto &Actions = getActionDefinitionsBuilder(Op);
1020     // Explicitly list some common cases.
1021     // TODO: Does this help compile time at all?
1022     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1023                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1024                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1025                                       {S64, GlobalPtr, 64, GlobalAlign32},
1026                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1027                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1028                                       {S32, GlobalPtr, 8, GlobalAlign8},
1029                                       {S32, GlobalPtr, 16, GlobalAlign16},
1030 
1031                                       {S32, LocalPtr, 32, 32},
1032                                       {S64, LocalPtr, 64, 32},
1033                                       {V2S32, LocalPtr, 64, 32},
1034                                       {S32, LocalPtr, 8, 8},
1035                                       {S32, LocalPtr, 16, 16},
1036                                       {V2S16, LocalPtr, 32, 32},
1037 
1038                                       {S32, PrivatePtr, 32, 32},
1039                                       {S32, PrivatePtr, 8, 8},
1040                                       {S32, PrivatePtr, 16, 16},
1041                                       {V2S16, PrivatePtr, 32, 32},
1042 
1043                                       {S32, ConstantPtr, 32, GlobalAlign32},
1044                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1045                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1046                                       {S64, ConstantPtr, 64, GlobalAlign32},
1047                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1048     Actions.legalIf(
1049       [=](const LegalityQuery &Query) -> bool {
1050         return isLoadStoreLegal(ST, Query, Op);
1051       });
1052 
1053     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1054     // 64-bits.
1055     //
1056     // TODO: Should generalize bitcast action into coerce, which will also cover
1057     // inserting addrspacecasts.
1058     Actions.customIf(typeIs(1, Constant32Ptr));
1059 
1060     // Turn any illegal element vectors into something easier to deal
1061     // with. These will ultimately produce 32-bit scalar shifts to extract the
1062     // parts anyway.
1063     //
1064     // For odd 16-bit element vectors, prefer to split those into pieces with
1065     // 16-bit vector parts.
1066     Actions.bitcastIf(
1067       [=](const LegalityQuery &Query) -> bool {
1068         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1069                                           Query.MMODescrs[0].SizeInBits);
1070       }, bitcastToRegisterType(0));
1071 
1072     Actions
1073         .customIf(typeIs(1, Constant32Ptr))
1074         // Widen suitably aligned loads by loading extra elements.
1075         .moreElementsIf([=](const LegalityQuery &Query) {
1076             const LLT Ty = Query.Types[0];
1077             return Op == G_LOAD && Ty.isVector() &&
1078                    shouldWidenLoadResult(Query, Op);
1079           }, moreElementsToNextPow2(0))
1080         .widenScalarIf([=](const LegalityQuery &Query) {
1081             const LLT Ty = Query.Types[0];
1082             return Op == G_LOAD && !Ty.isVector() &&
1083                    shouldWidenLoadResult(Query, Op);
1084           }, widenScalarOrEltToNextPow2(0))
1085         .narrowScalarIf(
1086             [=](const LegalityQuery &Query) -> bool {
1087               return !Query.Types[0].isVector() &&
1088                      needToSplitMemOp(Query, Op == G_LOAD);
1089             },
1090             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1091               const LLT DstTy = Query.Types[0];
1092               const LLT PtrTy = Query.Types[1];
1093 
1094               const unsigned DstSize = DstTy.getSizeInBits();
1095               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1096 
1097               // Split extloads.
1098               if (DstSize > MemSize)
1099                 return std::make_pair(0, LLT::scalar(MemSize));
1100 
1101               if (!isPowerOf2_32(DstSize)) {
1102                 // We're probably decomposing an odd sized store. Try to split
1103                 // to the widest type. TODO: Account for alignment. As-is it
1104                 // should be OK, since the new parts will be further legalized.
1105                 unsigned FloorSize = PowerOf2Floor(DstSize);
1106                 return std::make_pair(0, LLT::scalar(FloorSize));
1107               }
1108 
1109               if (DstSize > 32 && (DstSize % 32 != 0)) {
1110                 // FIXME: Need a way to specify non-extload of larger size if
1111                 // suitably aligned.
1112                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1113               }
1114 
1115               unsigned MaxSize = maxSizeForAddrSpace(ST,
1116                                                      PtrTy.getAddressSpace(),
1117                                                      Op == G_LOAD);
1118               if (MemSize > MaxSize)
1119                 return std::make_pair(0, LLT::scalar(MaxSize));
1120 
1121               unsigned Align = Query.MMODescrs[0].AlignInBits;
1122               return std::make_pair(0, LLT::scalar(Align));
1123             })
1124         .fewerElementsIf(
1125             [=](const LegalityQuery &Query) -> bool {
1126               return Query.Types[0].isVector() &&
1127                      needToSplitMemOp(Query, Op == G_LOAD);
1128             },
1129             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1130               const LLT DstTy = Query.Types[0];
1131               const LLT PtrTy = Query.Types[1];
1132 
1133               LLT EltTy = DstTy.getElementType();
1134               unsigned MaxSize = maxSizeForAddrSpace(ST,
1135                                                      PtrTy.getAddressSpace(),
1136                                                      Op == G_LOAD);
1137 
1138               // FIXME: Handle widened to power of 2 results better. This ends
1139               // up scalarizing.
1140               // FIXME: 3 element stores scalarized on SI
1141 
1142               // Split if it's too large for the address space.
1143               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1144                 unsigned NumElts = DstTy.getNumElements();
1145                 unsigned EltSize = EltTy.getSizeInBits();
1146 
1147                 if (MaxSize % EltSize == 0) {
1148                   return std::make_pair(
1149                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1150                 }
1151 
1152                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1153 
1154                 // FIXME: Refine when odd breakdowns handled
1155                 // The scalars will need to be re-legalized.
1156                 if (NumPieces == 1 || NumPieces >= NumElts ||
1157                     NumElts % NumPieces != 0)
1158                   return std::make_pair(0, EltTy);
1159 
1160                 return std::make_pair(0,
1161                                       LLT::vector(NumElts / NumPieces, EltTy));
1162               }
1163 
1164               // FIXME: We could probably handle weird extending loads better.
1165               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1166               if (DstTy.getSizeInBits() > MemSize)
1167                 return std::make_pair(0, EltTy);
1168 
1169               unsigned EltSize = EltTy.getSizeInBits();
1170               unsigned DstSize = DstTy.getSizeInBits();
1171               if (!isPowerOf2_32(DstSize)) {
1172                 // We're probably decomposing an odd sized store. Try to split
1173                 // to the widest type. TODO: Account for alignment. As-is it
1174                 // should be OK, since the new parts will be further legalized.
1175                 unsigned FloorSize = PowerOf2Floor(DstSize);
1176                 return std::make_pair(
1177                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1178               }
1179 
1180               // Need to split because of alignment.
1181               unsigned Align = Query.MMODescrs[0].AlignInBits;
1182               if (EltSize > Align &&
1183                   (EltSize / Align < DstTy.getNumElements())) {
1184                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1185               }
1186 
1187               // May need relegalization for the scalars.
1188               return std::make_pair(0, EltTy);
1189             })
1190         .minScalar(0, S32);
1191 
1192     if (IsStore)
1193       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1194 
1195     // TODO: Need a bitcast lower option?
1196     Actions
1197         .widenScalarToNextPow2(0)
1198         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1199   }
1200 
1201   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1202                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1203                                                   {S32, GlobalPtr, 16, 2 * 8},
1204                                                   {S32, LocalPtr, 8, 8},
1205                                                   {S32, LocalPtr, 16, 16},
1206                                                   {S32, PrivatePtr, 8, 8},
1207                                                   {S32, PrivatePtr, 16, 16},
1208                                                   {S32, ConstantPtr, 8, 8},
1209                                                   {S32, ConstantPtr, 16, 2 * 8}});
1210   if (ST.hasFlatAddressSpace()) {
1211     ExtLoads.legalForTypesWithMemDesc(
1212         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1213   }
1214 
1215   ExtLoads.clampScalar(0, S32, S32)
1216           .widenScalarToNextPow2(0)
1217           .unsupportedIfMemSizeNotPow2()
1218           .lower();
1219 
1220   auto &Atomics = getActionDefinitionsBuilder(
1221     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1222      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1223      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1224      G_ATOMICRMW_UMIN})
1225     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1226                {S64, GlobalPtr}, {S64, LocalPtr},
1227                {S32, RegionPtr}, {S64, RegionPtr}});
1228   if (ST.hasFlatAddressSpace()) {
1229     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1230   }
1231 
1232   if (ST.hasLDSFPAtomics()) {
1233     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1234       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1235   }
1236 
1237   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1238   // demarshalling
1239   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1240     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1241                 {S32, FlatPtr}, {S64, FlatPtr}})
1242     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1243                {S32, RegionPtr}, {S64, RegionPtr}});
1244   // TODO: Pointer types, any 32-bit or 64-bit vector
1245 
1246   // Condition should be s32 for scalar, s1 for vector.
1247   getActionDefinitionsBuilder(G_SELECT)
1248     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1249           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1250           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1251     .clampScalar(0, S16, S64)
1252     .scalarize(1)
1253     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1254     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1255     .clampMaxNumElements(0, S32, 2)
1256     .clampMaxNumElements(0, LocalPtr, 2)
1257     .clampMaxNumElements(0, PrivatePtr, 2)
1258     .scalarize(0)
1259     .widenScalarToNextPow2(0)
1260     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1261 
1262   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1263   // be more flexible with the shift amount type.
1264   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1265     .legalFor({{S32, S32}, {S64, S32}});
1266   if (ST.has16BitInsts()) {
1267     if (ST.hasVOP3PInsts()) {
1268       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1269             .clampMaxNumElements(0, S16, 2);
1270     } else
1271       Shifts.legalFor({{S16, S16}});
1272 
1273     // TODO: Support 16-bit shift amounts for all types
1274     Shifts.widenScalarIf(
1275       [=](const LegalityQuery &Query) {
1276         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1277         // 32-bit amount.
1278         const LLT ValTy = Query.Types[0];
1279         const LLT AmountTy = Query.Types[1];
1280         return ValTy.getSizeInBits() <= 16 &&
1281                AmountTy.getSizeInBits() < 16;
1282       }, changeTo(1, S16));
1283     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1284     Shifts.clampScalar(1, S32, S32);
1285     Shifts.clampScalar(0, S16, S64);
1286     Shifts.widenScalarToNextPow2(0, 16);
1287   } else {
1288     // Make sure we legalize the shift amount type first, as the general
1289     // expansion for the shifted type will produce much worse code if it hasn't
1290     // been truncated already.
1291     Shifts.clampScalar(1, S32, S32);
1292     Shifts.clampScalar(0, S32, S64);
1293     Shifts.widenScalarToNextPow2(0, 32);
1294   }
1295   Shifts.scalarize(0);
1296 
1297   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1298     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1299     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1300     unsigned IdxTypeIdx = 2;
1301 
1302     getActionDefinitionsBuilder(Op)
1303       .customIf([=](const LegalityQuery &Query) {
1304           const LLT EltTy = Query.Types[EltTypeIdx];
1305           const LLT VecTy = Query.Types[VecTypeIdx];
1306           const LLT IdxTy = Query.Types[IdxTypeIdx];
1307           const unsigned EltSize = EltTy.getSizeInBits();
1308           return (EltSize == 32 || EltSize == 64) &&
1309                   VecTy.getSizeInBits() % 32 == 0 &&
1310                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1311                   IdxTy.getSizeInBits() == 32;
1312         })
1313       .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
1314                  bitcastToVectorElement32(1))
1315       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1316       .bitcastIf(
1317         all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
1318         [=](const LegalityQuery &Query) {
1319           // For > 64-bit element types, try to turn this into a 64-bit
1320           // element vector since we may be able to do better indexing
1321           // if this is scalar. If not, fall back to 32.
1322           const LLT EltTy = Query.Types[EltTypeIdx];
1323           const LLT VecTy = Query.Types[VecTypeIdx];
1324           const unsigned DstEltSize = EltTy.getSizeInBits();
1325           const unsigned VecSize = VecTy.getSizeInBits();
1326 
1327           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1328           return std::make_pair(
1329             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1330         })
1331       .clampScalar(EltTypeIdx, S32, S64)
1332       .clampScalar(VecTypeIdx, S32, S64)
1333       .clampScalar(IdxTypeIdx, S32, S32)
1334       // TODO: Clamp the number of elements before resorting to stack lowering.
1335       // It should only be necessary with variable indexes.
1336       // As a last resort, lower to the stack
1337       .lower();
1338   }
1339 
1340   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1341     .unsupportedIf([=](const LegalityQuery &Query) {
1342         const LLT &EltTy = Query.Types[1].getElementType();
1343         return Query.Types[0] != EltTy;
1344       });
1345 
1346   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1347     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1348     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1349 
1350     // FIXME: Doesn't handle extract of illegal sizes.
1351     getActionDefinitionsBuilder(Op)
1352       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1353       // FIXME: Multiples of 16 should not be legal.
1354       .legalIf([=](const LegalityQuery &Query) {
1355           const LLT BigTy = Query.Types[BigTyIdx];
1356           const LLT LitTy = Query.Types[LitTyIdx];
1357           return (BigTy.getSizeInBits() % 32 == 0) &&
1358                  (LitTy.getSizeInBits() % 16 == 0);
1359         })
1360       .widenScalarIf(
1361         [=](const LegalityQuery &Query) {
1362           const LLT BigTy = Query.Types[BigTyIdx];
1363           return (BigTy.getScalarSizeInBits() < 16);
1364         },
1365         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1366       .widenScalarIf(
1367         [=](const LegalityQuery &Query) {
1368           const LLT LitTy = Query.Types[LitTyIdx];
1369           return (LitTy.getScalarSizeInBits() < 16);
1370         },
1371         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1372       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1373       .widenScalarToNextPow2(BigTyIdx, 32);
1374 
1375   }
1376 
1377   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1378     .legalForCartesianProduct(AllS32Vectors, {S32})
1379     .legalForCartesianProduct(AllS64Vectors, {S64})
1380     .clampNumElements(0, V16S32, V32S32)
1381     .clampNumElements(0, V2S64, V16S64)
1382     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1383 
1384   if (ST.hasScalarPackInsts()) {
1385     BuildVector
1386       // FIXME: Should probably widen s1 vectors straight to s32
1387       .minScalarOrElt(0, S16)
1388       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1389       .minScalar(1, S32);
1390 
1391     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1392       .legalFor({V2S16, S32})
1393       .lower();
1394     BuildVector.minScalarOrElt(0, S32);
1395   } else {
1396     BuildVector.customFor({V2S16, S16});
1397     BuildVector.minScalarOrElt(0, S32);
1398 
1399     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1400       .customFor({V2S16, S32})
1401       .lower();
1402   }
1403 
1404   BuildVector.legalIf(isRegisterType(0));
1405 
1406   // FIXME: Clamp maximum size
1407   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1408     .legalIf(isRegisterType(0));
1409 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1412   if (ST.hasVOP3PInsts()) {
1413     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1414       .customFor({V2S16, V2S16})
1415       .lower();
1416   } else
1417     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1418 
1419   // Merge/Unmerge
1420   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1421     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1422     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1423 
1424     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1425       const LLT Ty = Query.Types[TypeIdx];
1426       if (Ty.isVector()) {
1427         const LLT &EltTy = Ty.getElementType();
1428         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1429           return true;
1430         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1431           return true;
1432       }
1433       return false;
1434     };
1435 
1436     auto &Builder = getActionDefinitionsBuilder(Op)
1437       .lowerFor({{S16, V2S16}})
1438       .lowerIf([=](const LegalityQuery &Query) {
1439           const LLT BigTy = Query.Types[BigTyIdx];
1440           return BigTy.getSizeInBits() == 32;
1441         })
1442       // Try to widen to s16 first for small types.
1443       // TODO: Only do this on targets with legal s16 shifts
1444       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1445       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1446       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1447       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1448                            elementTypeIs(1, S16)),
1449                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1453       .clampScalar(LitTyIdx, S32, S512)
1454       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1455       // Break up vectors with weird elements into scalars
1456       .fewerElementsIf(
1457         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1458         scalarize(0))
1459       .fewerElementsIf(
1460         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1461         scalarize(1))
1462       .clampScalar(BigTyIdx, S32, MaxScalar);
1463 
1464     if (Op == G_MERGE_VALUES) {
1465       Builder.widenScalarIf(
1466         // TODO: Use 16-bit shifts if legal for 8-bit values?
1467         [=](const LegalityQuery &Query) {
1468           const LLT Ty = Query.Types[LitTyIdx];
1469           return Ty.getSizeInBits() < 32;
1470         },
1471         changeTo(LitTyIdx, S32));
1472     }
1473 
1474     Builder.widenScalarIf(
1475       [=](const LegalityQuery &Query) {
1476         const LLT Ty = Query.Types[BigTyIdx];
1477         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1478           Ty.getSizeInBits() % 16 != 0;
1479       },
1480       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or for sizes of 256 and above, the next
        // multiple of 64 if that is smaller.
1483         const LLT &Ty = Query.Types[BigTyIdx];
1484         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1485         if (NewSizeInBits >= 256) {
1486           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1487           if (RoundedTo < NewSizeInBits)
1488             NewSizeInBits = RoundedTo;
1489         }
1490         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1491       })
1492       .legalIf([=](const LegalityQuery &Query) {
1493           const LLT &BigTy = Query.Types[BigTyIdx];
1494           const LLT &LitTy = Query.Types[LitTyIdx];
1495 
1496           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1497             return false;
1498           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1499             return false;
1500 
1501           return BigTy.getSizeInBits() % 16 == 0 &&
1502                  LitTy.getSizeInBits() % 16 == 0 &&
1503                  BigTy.getSizeInBits() <= MaxRegisterSize;
1504         })
1505       // Any vectors left are the wrong size. Scalarize them.
1506       .scalarize(0)
1507       .scalarize(1);
1508   }
1509 
1510   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1511   // RegBankSelect.
1512   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1513     .legalFor({{S32}, {S64}});
1514 
1515   if (ST.hasVOP3PInsts()) {
1516     SextInReg.lowerFor({{V2S16}})
1517       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1518       // get more vector shift opportunities, since we'll get those when
1519       // expanded.
1520       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1521   } else if (ST.has16BitInsts()) {
1522     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1523   } else {
1524     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1526     SextInReg.lowerFor({{S32}, {S64}});
1527   }
1528 
1529   SextInReg
1530     .scalarize(0)
1531     .clampScalar(0, S32, S64)
1532     .lower();
1533 
1534   getActionDefinitionsBuilder(G_FSHR)
1535     .legalFor({{S32, S32}})
1536     .scalarize(0)
1537     .lower();
1538 
1539   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1540     .legalFor({S64});
1541 
1542   getActionDefinitionsBuilder(G_FENCE)
1543     .alwaysLegal();
1544 
1545   getActionDefinitionsBuilder({
1546       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1547       G_FCOPYSIGN,
1548 
1549       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1550       G_ATOMICRMW_NAND,
1551       G_ATOMICRMW_FSUB,
1552       G_READ_REGISTER,
1553       G_WRITE_REGISTER,
1554 
1555       G_SADDO, G_SSUBO,
1556 
      // TODO: Implement
1558       G_FMINIMUM, G_FMAXIMUM,
1559       G_FSHL
1560     }).lower();
1561 
1562   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1563         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1564         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1565     .unsupported();
1566 
1567   computeTables();
1568   verify(*ST.getInstrInfo());
1569 }
1570 
1571 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1572                                          MachineInstr &MI) const {
1573   MachineIRBuilder &B = Helper.MIRBuilder;
1574   MachineRegisterInfo &MRI = *B.getMRI();
1575   GISelChangeObserver &Observer = Helper.Observer;
1576 
1577   switch (MI.getOpcode()) {
1578   case TargetOpcode::G_ADDRSPACE_CAST:
1579     return legalizeAddrSpaceCast(MI, MRI, B);
1580   case TargetOpcode::G_FRINT:
1581     return legalizeFrint(MI, MRI, B);
1582   case TargetOpcode::G_FCEIL:
1583     return legalizeFceil(MI, MRI, B);
1584   case TargetOpcode::G_INTRINSIC_TRUNC:
1585     return legalizeIntrinsicTrunc(MI, MRI, B);
1586   case TargetOpcode::G_SITOFP:
1587     return legalizeITOFP(MI, MRI, B, true);
1588   case TargetOpcode::G_UITOFP:
1589     return legalizeITOFP(MI, MRI, B, false);
1590   case TargetOpcode::G_FPTOSI:
1591     return legalizeFPTOI(MI, MRI, B, true);
1592   case TargetOpcode::G_FPTOUI:
1593     return legalizeFPTOI(MI, MRI, B, false);
1594   case TargetOpcode::G_FMINNUM:
1595   case TargetOpcode::G_FMAXNUM:
1596   case TargetOpcode::G_FMINNUM_IEEE:
1597   case TargetOpcode::G_FMAXNUM_IEEE:
1598     return legalizeMinNumMaxNum(Helper, MI);
1599   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1600     return legalizeExtractVectorElt(MI, MRI, B);
1601   case TargetOpcode::G_INSERT_VECTOR_ELT:
1602     return legalizeInsertVectorElt(MI, MRI, B);
1603   case TargetOpcode::G_SHUFFLE_VECTOR:
1604     return legalizeShuffleVector(MI, MRI, B);
1605   case TargetOpcode::G_FSIN:
1606   case TargetOpcode::G_FCOS:
1607     return legalizeSinCos(MI, MRI, B);
1608   case TargetOpcode::G_GLOBAL_VALUE:
1609     return legalizeGlobalValue(MI, MRI, B);
1610   case TargetOpcode::G_LOAD:
1611     return legalizeLoad(MI, MRI, B, Observer);
1612   case TargetOpcode::G_FMAD:
1613     return legalizeFMad(MI, MRI, B);
1614   case TargetOpcode::G_FDIV:
1615     return legalizeFDIV(MI, MRI, B);
1616   case TargetOpcode::G_UDIV:
1617   case TargetOpcode::G_UREM:
1618     return legalizeUDIV_UREM(MI, MRI, B);
1619   case TargetOpcode::G_SDIV:
1620   case TargetOpcode::G_SREM:
1621     return legalizeSDIV_SREM(MI, MRI, B);
1622   case TargetOpcode::G_ATOMIC_CMPXCHG:
1623     return legalizeAtomicCmpXChg(MI, MRI, B);
1624   case TargetOpcode::G_FLOG:
1625     return legalizeFlog(MI, B, numbers::ln2f);
1626   case TargetOpcode::G_FLOG10:
1627     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1628   case TargetOpcode::G_FEXP:
1629     return legalizeFExp(MI, B);
1630   case TargetOpcode::G_FPOW:
1631     return legalizeFPow(MI, B);
1632   case TargetOpcode::G_FFLOOR:
1633     return legalizeFFloor(MI, MRI, B);
1634   case TargetOpcode::G_BUILD_VECTOR:
1635     return legalizeBuildVector(MI, MRI, B);
1636   default:
1637     return false;
1638   }
1639 
1640   llvm_unreachable("expected switch to return");
1641 }
1642 
1643 Register AMDGPULegalizerInfo::getSegmentAperture(
1644   unsigned AS,
1645   MachineRegisterInfo &MRI,
1646   MachineIRBuilder &B) const {
1647   MachineFunction &MF = B.getMF();
1648   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1649   const LLT S32 = LLT::scalar(32);
1650 
1651   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1652 
1653   if (ST.hasApertureRegs()) {
1654     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1655     // getreg.
1656     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1657         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1658         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1659     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1660         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1661         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1662     unsigned Encoding =
1663         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1664         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1665         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1666 
1667     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1668 
1669     B.buildInstr(AMDGPU::S_GETREG_B32)
1670       .addDef(GetReg)
1671       .addImm(Encoding);
1672     MRI.setType(GetReg, S32);
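    // The 16-bit field read here holds the upper bits of the aperture base
    // address; shifting left by the field width (WidthM1 + 1 == 16) produces
    // the high 32 bits of the 64-bit aperture, which callers splice in as the
    // high half of a flat pointer.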
1673 
1674     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1675     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1676   }
1677 
1678   Register QueuePtr = MRI.createGenericVirtualRegister(
1679     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1680 
1681   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1682     return Register();
1683 
1684   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1685   // private_segment_aperture_base_hi.
1686   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1687 
1688   // TODO: can we be smarter about machine pointer info?
1689   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1690   MachineMemOperand *MMO = MF.getMachineMemOperand(
1691       PtrInfo,
1692       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1693           MachineMemOperand::MOInvariant,
1694       4, commonAlignment(Align(64), StructOffset));
1695 
1696   Register LoadAddr;
1697 
1698   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1699   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1700 }
1701 
1702 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1703   MachineInstr &MI, MachineRegisterInfo &MRI,
1704   MachineIRBuilder &B) const {
1705   MachineFunction &MF = B.getMF();
1706 
1707   const LLT S32 = LLT::scalar(32);
1708   Register Dst = MI.getOperand(0).getReg();
1709   Register Src = MI.getOperand(1).getReg();
1710 
1711   LLT DstTy = MRI.getType(Dst);
1712   LLT SrcTy = MRI.getType(Src);
1713   unsigned DestAS = DstTy.getAddressSpace();
1714   unsigned SrcAS = SrcTy.getAddressSpace();
1715 
1716   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1717   // vector element.
1718   assert(!DstTy.isVector());
1719 
1720   const AMDGPUTargetMachine &TM
1721     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1722 
1723   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1724     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1725     return true;
1726   }
1727 
1728   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1729     // Truncate.
1730     B.buildExtract(Dst, Src, 0);
1731     MI.eraseFromParent();
1732     return true;
1733   }
1734 
1735   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1736     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1737     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1738 
1739     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1740     // another. Merge operands are required to be the same type, but creating an
1741     // extra ptrtoint would be kind of pointless.
1742     auto HighAddr = B.buildConstant(
1743       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1744     B.buildMerge(Dst, {Src, HighAddr});
1745     MI.eraseFromParent();
1746     return true;
1747   }
1748 
1749   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1750     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1751            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1752     unsigned NullVal = TM.getNullPointerValue(DestAS);
1753 
1754     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1755     auto FlatNull = B.buildConstant(SrcTy, 0);
1756 
1757     // Extract low 32-bits of the pointer.
1758     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1759 
1760     auto CmpRes =
1761         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1762     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1763 
1764     MI.eraseFromParent();
1765     return true;
1766   }
1767 
1768   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1769     return false;
1770 
1771   if (!ST.hasFlatAddressSpace())
1772     return false;
1773 
1774   auto SegmentNull =
1775       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1776   auto FlatNull =
1777       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1778 
1779   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1780   if (!ApertureReg.isValid())
1781     return false;
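  // A local/private address is converted to flat by pairing the 32-bit
  // segment offset (low half) with the aperture base (high half); null
  // segment pointers are mapped to the flat null value via the select below.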
1782 
1783   auto CmpRes =
1784       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1785 
1786   // Coerce the type of the low half of the result so we can use merge_values.
1787   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1788 
1789   // TODO: Should we allow mismatched types but matching sizes in merges to
1790   // avoid the ptrtoint?
1791   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1792   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1793 
1794   MI.eraseFromParent();
1795   return true;
1796 }
1797 
1798 bool AMDGPULegalizerInfo::legalizeFrint(
1799   MachineInstr &MI, MachineRegisterInfo &MRI,
1800   MachineIRBuilder &B) const {
1801   Register Src = MI.getOperand(1).getReg();
1802   LLT Ty = MRI.getType(Src);
1803   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1804 
1805   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1806   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
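  // Rounding works by adding and subtracting 2^52 with the sign of the input:
  // for |x| < 2^52 the add pushes the fractional bits out of the f64 mantissa,
  // so the subtract leaves x rounded to an integer in the default
  // round-to-nearest-even mode. Inputs with magnitude above
  // 0x1.fffffffffffffp+51 already have no fractional bits, so the original
  // value is selected instead.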
1807 
1808   auto C1 = B.buildFConstant(Ty, C1Val);
1809   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1810 
1811   // TODO: Should this propagate fast-math-flags?
1812   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1813   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1814 
1815   auto C2 = B.buildFConstant(Ty, C2Val);
1816   auto Fabs = B.buildFAbs(Ty, Src);
1817 
1818   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1819   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1820   MI.eraseFromParent();
1821   return true;
1822 }
1823 
1824 bool AMDGPULegalizerInfo::legalizeFceil(
1825   MachineInstr &MI, MachineRegisterInfo &MRI,
1826   MachineIRBuilder &B) const {
1827 
1828   const LLT S1 = LLT::scalar(1);
1829   const LLT S64 = LLT::scalar(64);
1830 
1831   Register Src = MI.getOperand(1).getReg();
1832   assert(MRI.getType(Src) == S64);
1833 
1834   // result = trunc(src)
1835   // if (src > 0.0 && src != result)
1836   //   result += 1.0
1837 
1838   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1839 
1840   const auto Zero = B.buildFConstant(S64, 0.0);
1841   const auto One = B.buildFConstant(S64, 1.0);
1842   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1843   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1844   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1845   auto Add = B.buildSelect(S64, And, One, Zero);
1846 
1847   // TODO: Should this propagate fast-math-flags?
1848   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1850 }
1851 
1852 static MachineInstrBuilder extractF64Exponent(Register Hi,
1853                                               MachineIRBuilder &B) {
1854   const unsigned FractBits = 52;
1855   const unsigned ExpBits = 11;
1856   LLT S32 = LLT::scalar(32);
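  // The f64 exponent occupies bits [62:52] of the value; bit 52 is bit 20
  // (FractBits - 32) of the high 32-bit word, so extract an ExpBits-wide field
  // starting there and subtract the bias (1023) to get the unbiased exponent.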
1857 
1858   auto Const0 = B.buildConstant(S32, FractBits - 32);
1859   auto Const1 = B.buildConstant(S32, ExpBits);
1860 
1861   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1862     .addUse(Hi)
1863     .addUse(Const0.getReg(0))
1864     .addUse(Const1.getReg(0));
1865 
1866   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1867 }
1868 
1869 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1870   MachineInstr &MI, MachineRegisterInfo &MRI,
1871   MachineIRBuilder &B) const {
1872   const LLT S1 = LLT::scalar(1);
1873   const LLT S32 = LLT::scalar(32);
1874   const LLT S64 = LLT::scalar(64);
1875 
1876   Register Src = MI.getOperand(1).getReg();
1877   assert(MRI.getType(Src) == S64);
1878 
1879   // TODO: Should this use extract since the low half is unused?
1880   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1881   Register Hi = Unmerge.getReg(1);
1882 
1883   // Extract the upper half, since this is where we will find the sign and
1884   // exponent.
1885   auto Exp = extractF64Exponent(Hi, B);
1886 
1887   const unsigned FractBits = 52;
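  // Truncation masks off the fractional bits: for unbiased exponent Exp, the
  // low (FractBits - Exp) bits of the mantissa are fractional, so clear them
  // with Src & ~(FractMask >> Exp). If Exp < 0, |Src| < 1.0 and the result is
  // just the sign bit (+/-0.0); if Exp > 51 there are no fractional bits and
  // Src is returned unchanged.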
1888 
1889   // Extract the sign bit.
1890   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1891   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1892 
1893   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1894 
1895   const auto Zero32 = B.buildConstant(S32, 0);
1896 
1897   // Extend back to 64-bits.
1898   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1899 
1900   auto Shr = B.buildAShr(S64, FractMask, Exp);
1901   auto Not = B.buildNot(S64, Shr);
1902   auto Tmp0 = B.buildAnd(S64, Src, Not);
1903   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1904 
1905   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1906   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1907 
1908   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1909   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1910   MI.eraseFromParent();
1911   return true;
1912 }
1913 
1914 bool AMDGPULegalizerInfo::legalizeITOFP(
1915   MachineInstr &MI, MachineRegisterInfo &MRI,
1916   MachineIRBuilder &B, bool Signed) const {
1917 
1918   Register Dst = MI.getOperand(0).getReg();
1919   Register Src = MI.getOperand(1).getReg();
1920 
1921   const LLT S64 = LLT::scalar(64);
1922   const LLT S32 = LLT::scalar(32);
1923 
1924   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1925 
1926   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
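  // Convert the two 32-bit halves separately: the high half keeps the
  // requested signedness, the low half is always unsigned. The result is
  // (fp)Hi * 2^32 + (fp)Lo, with the 2^32 scale applied via ldexp.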
1927 
1928   auto CvtHi = Signed ?
1929     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1930     B.buildUITOFP(S64, Unmerge.getReg(1));
1931 
1932   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1933 
1934   auto ThirtyTwo = B.buildConstant(S32, 32);
1935   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1936     .addUse(CvtHi.getReg(0))
1937     .addUse(ThirtyTwo.getReg(0));
1938 
1939   // TODO: Should this propagate fast-math-flags?
1940   B.buildFAdd(Dst, LdExp, CvtLo);
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 // TODO: Copied from DAG implementation. Verify logic and document how this
1946 // actually works.
1947 bool AMDGPULegalizerInfo::legalizeFPTOI(
1948   MachineInstr &MI, MachineRegisterInfo &MRI,
1949   MachineIRBuilder &B, bool Signed) const {
1950 
1951   Register Dst = MI.getOperand(0).getReg();
1952   Register Src = MI.getOperand(1).getReg();
1953 
1954   const LLT S64 = LLT::scalar(64);
1955   const LLT S32 = LLT::scalar(32);
1956 
1957   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1958 
1959   unsigned Flags = MI.getFlags();
1960 
1961   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1962   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1963   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
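  // K0 is 2^-32 and K1 is -(2^32). Hi is trunc(Src) scaled down by 2^32 and
  // floored; the FMA then computes Lo = trunc(Src) - Hi * 2^32 exactly, so the
  // two conversions below recover the full 64-bit integer result.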
1964 
1965   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1966   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1967   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1968 
1969   auto Hi = Signed ?
1970     B.buildFPTOSI(S32, FloorMul) :
1971     B.buildFPTOUI(S32, FloorMul);
1972   auto Lo = B.buildFPTOUI(S32, Fma);
1973 
1974   B.buildMerge(Dst, { Lo, Hi });
1975   MI.eraseFromParent();
1976 
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1981                                                MachineInstr &MI) const {
1982   MachineFunction &MF = Helper.MIRBuilder.getMF();
1983   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1984 
1985   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1986                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1987 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1990   if (!MFI->getMode().IEEE)
1991     return !IsIEEEOp;
1992 
1993   if (IsIEEEOp)
1994     return true;
1995 
1996   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002   // TODO: Should move some of this into LegalizerHelper.
2003 
2004   // TODO: Promote dynamic indexing of s16 to s32
2005 
2006   // FIXME: Artifact combiner probably should have replaced the truncated
2007   // constant before this, so we shouldn't need
2008   // getConstantVRegValWithLookThrough.
2009   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2010     MI.getOperand(2).getReg(), MRI);
2011   if (!IdxVal) // Dynamic case will be selected to register indexing.
2012     return true;
2013 
2014   Register Dst = MI.getOperand(0).getReg();
2015   Register Vec = MI.getOperand(1).getReg();
2016 
2017   LLT VecTy = MRI.getType(Vec);
2018   LLT EltTy = VecTy.getElementType();
2019   assert(EltTy == MRI.getType(Dst));
2020 
2021   if (IdxVal->Value < VecTy.getNumElements())
2022     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2023   else
2024     B.buildUndef(Dst);
2025 
2026   MI.eraseFromParent();
2027   return true;
2028 }
2029 
2030 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2031   MachineInstr &MI, MachineRegisterInfo &MRI,
2032   MachineIRBuilder &B) const {
2033   // TODO: Should move some of this into LegalizerHelper.
2034 
2035   // TODO: Promote dynamic indexing of s16 to s32
2036 
2037   // FIXME: Artifact combiner probably should have replaced the truncated
2038   // constant before this, so we shouldn't need
2039   // getConstantVRegValWithLookThrough.
2040   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2041     MI.getOperand(3).getReg(), MRI);
2042   if (!IdxVal) // Dynamic case will be selected to register indexing.
2043     return true;
2044 
2045   Register Dst = MI.getOperand(0).getReg();
2046   Register Vec = MI.getOperand(1).getReg();
2047   Register Ins = MI.getOperand(2).getReg();
2048 
2049   LLT VecTy = MRI.getType(Vec);
2050   LLT EltTy = VecTy.getElementType();
2051   assert(EltTy == MRI.getType(Ins));
2052 
2053   if (IdxVal->Value < VecTy.getNumElements())
2054     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2055   else
2056     B.buildUndef(Dst);
2057 
2058   MI.eraseFromParent();
2059   return true;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B) const {
2065   const LLT V2S16 = LLT::vector(2, 16);
2066 
2067   Register Dst = MI.getOperand(0).getReg();
2068   Register Src0 = MI.getOperand(1).getReg();
2069   LLT DstTy = MRI.getType(Dst);
2070   LLT SrcTy = MRI.getType(Src0);
2071 
2072   if (SrcTy == V2S16 && DstTy == V2S16 &&
2073       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2074     return true;
2075 
2076   MachineIRBuilder HelperBuilder(MI);
2077   GISelObserverWrapper DummyObserver;
2078   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2079   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizeSinCos(
2083   MachineInstr &MI, MachineRegisterInfo &MRI,
2084   MachineIRBuilder &B) const {
2085 
2086   Register DstReg = MI.getOperand(0).getReg();
2087   Register SrcReg = MI.getOperand(1).getReg();
2088   LLT Ty = MRI.getType(DstReg);
2089   unsigned Flags = MI.getFlags();
2090 
2091   Register TrigVal;
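  // The hardware sin/cos intrinsics take their input in units of full
  // revolutions (1.0 == 2*pi radians), so scale the operand by 1/(2*pi).
  // Subtargets with a reduced trig input range additionally need an explicit
  // fract to reduce the argument before the intrinsic.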
2092   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2093   if (ST.hasTrigReducedRange()) {
2094     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2095     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2096       .addUse(MulVal.getReg(0))
2097       .setMIFlags(Flags).getReg(0);
2098   } else
2099     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2100 
2101   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2102     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2103   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2104     .addUse(TrigVal)
2105     .setMIFlags(Flags);
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2111                                                   MachineIRBuilder &B,
2112                                                   const GlobalValue *GV,
2113                                                   int64_t Offset,
2114                                                   unsigned GAFlags) const {
2115   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2116   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2117   // to the following code sequence:
2118   //
2119   // For constant address space:
2120   //   s_getpc_b64 s[0:1]
2121   //   s_add_u32 s0, s0, $symbol
2122   //   s_addc_u32 s1, s1, 0
2123   //
2124   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2125   //   a fixup or relocation is emitted to replace $symbol with a literal
2126   //   constant, which is a pc-relative offset from the encoding of the $symbol
2127   //   operand to the global variable.
2128   //
2129   // For global address space:
2130   //   s_getpc_b64 s[0:1]
2131   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2132   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2133   //
2134   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2135   //   fixups or relocations are emitted to replace $symbol@*@lo and
2136   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2137   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2138   //   operand to the global variable.
2139   //
2140   // What we want here is an offset from the value returned by s_getpc
2141   // (which is the address of the s_add_u32 instruction) to the global
2142   // variable, but since the encoding of $symbol starts 4 bytes after the start
2143   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2144   // small. This requires us to add 4 to the global variable offset in order to
2145   // compute the correct address.
2146 
2147   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2148 
2149   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2150     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2151 
2152   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2153     .addDef(PCReg);
2154 
2155   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2156   if (GAFlags == SIInstrInfo::MO_NONE)
2157     MIB.addImm(0);
2158   else
2159     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2160 
2161   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2162 
2163   if (PtrTy.getSizeInBits() == 32)
2164     B.buildExtract(DstReg, PCReg, 0);
2165   return true;
2166  }
2167 
2168 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   Register DstReg = MI.getOperand(0).getReg();
2172   LLT Ty = MRI.getType(DstReg);
2173   unsigned AS = Ty.getAddressSpace();
2174 
2175   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2176   MachineFunction &MF = B.getMF();
2177   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2178 
2179   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2180     if (!MFI->isEntryFunction()) {
2181       const Function &Fn = MF.getFunction();
2182       DiagnosticInfoUnsupported BadLDSDecl(
2183         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2184         DS_Warning);
2185       Fn.getContext().diagnose(BadLDSDecl);
2186 
2187       // We currently don't have a way to correctly allocate LDS objects that
2188       // aren't directly associated with a kernel. We do force inlining of
2189       // functions that use local objects. However, if these dead functions are
2190       // not eliminated, we don't want a compile time error. Just emit a warning
2191       // and a trap, since there should be no callable path here.
2192       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2193       B.buildUndef(DstReg);
2194       MI.eraseFromParent();
2195       return true;
2196     }
2197 
2198     // TODO: We could emit code to handle the initialization somewhere.
2199     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2200       const SITargetLowering *TLI = ST.getTargetLowering();
2201       if (!TLI->shouldUseLDSConstAddress(GV)) {
2202         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2204       }
2205 
2206       B.buildConstant(
2207           DstReg,
2208           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2209       MI.eraseFromParent();
2210       return true;
2211     }
2212 
2213     const Function &Fn = MF.getFunction();
2214     DiagnosticInfoUnsupported BadInit(
2215       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2216     Fn.getContext().diagnose(BadInit);
2217     return true;
2218   }
2219 
2220   const SITargetLowering *TLI = ST.getTargetLowering();
2221 
2222   if (TLI->shouldEmitFixup(GV)) {
2223     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2224     MI.eraseFromParent();
2225     return true;
2226   }
2227 
2228   if (TLI->shouldEmitPCReloc(GV)) {
2229     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2230     MI.eraseFromParent();
2231     return true;
2232   }
2233 
2234   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2235   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2236 
2237   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2238       MachinePointerInfo::getGOT(MF),
2239       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2240           MachineMemOperand::MOInvariant,
2241       8 /*Size*/, Align(8));
2242 
2243   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2244 
2245   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2247     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2248     B.buildExtract(DstReg, Load, 0);
2249   } else
2250     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2251 
2252   MI.eraseFromParent();
2253   return true;
2254 }
2255 
2256 bool AMDGPULegalizerInfo::legalizeLoad(
2257   MachineInstr &MI, MachineRegisterInfo &MRI,
2258   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2259   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2260   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2261   Observer.changingInstr(MI);
2262   MI.getOperand(1).setReg(Cast.getReg(0));
2263   Observer.changedInstr(MI);
2264   return true;
2265 }
2266 
2267 bool AMDGPULegalizerInfo::legalizeFMad(
2268   MachineInstr &MI, MachineRegisterInfo &MRI,
2269   MachineIRBuilder &B) const {
2270   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2271   assert(Ty.isScalar());
2272 
2273   MachineFunction &MF = B.getMF();
2274   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2275 
2276   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
2278   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2279     return true;
2280   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2281     return true;
2282 
2283   MachineIRBuilder HelperBuilder(MI);
2284   GISelObserverWrapper DummyObserver;
2285   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2286   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2287 }
2288 
2289 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2290   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2291   Register DstReg = MI.getOperand(0).getReg();
2292   Register PtrReg = MI.getOperand(1).getReg();
2293   Register CmpVal = MI.getOperand(2).getReg();
2294   Register NewVal = MI.getOperand(3).getReg();
2295 
2296   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2297          "this should not have been custom lowered");
2298 
2299   LLT ValTy = MRI.getType(CmpVal);
2300   LLT VecTy = LLT::vector(2, ValTy);
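  // The target atomic cmpxchg pseudo takes the new value and the compare
  // value packed into a single two-element vector operand (new value first,
  // compare value second), so build that vector here.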
2301 
2302   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2303 
2304   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2305     .addDef(DstReg)
2306     .addUse(PtrReg)
2307     .addUse(PackedVal)
2308     .setMemRefs(MI.memoperands());
2309 
2310   MI.eraseFromParent();
2311   return true;
2312 }
2313 
2314 bool AMDGPULegalizerInfo::legalizeFlog(
2315   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
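  // log_b(x) == log2(x) * (1 / log2(b)); callers pass that constant as
  // Log2BaseInverted: ln(2) for G_FLOG (1/log2(e)) and ln(2)/ln(10) for
  // G_FLOG10.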
2316   Register Dst = MI.getOperand(0).getReg();
2317   Register Src = MI.getOperand(1).getReg();
2318   LLT Ty = B.getMRI()->getType(Dst);
2319   unsigned Flags = MI.getFlags();
2320 
2321   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2322   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2323 
2324   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2325   MI.eraseFromParent();
2326   return true;
2327 }
2328 
2329 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2330                                        MachineIRBuilder &B) const {
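  // exp(x) == exp2(x * log2(e)), so scale the input by log2(e) and emit
  // G_FEXP2.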
2331   Register Dst = MI.getOperand(0).getReg();
2332   Register Src = MI.getOperand(1).getReg();
2333   unsigned Flags = MI.getFlags();
2334   LLT Ty = B.getMRI()->getType(Dst);
2335 
2336   auto K = B.buildFConstant(Ty, numbers::log2e);
2337   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2338   B.buildFExp2(Dst, Mul, Flags);
2339   MI.eraseFromParent();
2340   return true;
2341 }
2342 
2343 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2344                                        MachineIRBuilder &B) const {
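  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses the
  // fmul_legacy intrinsic (where 0 * anything == 0), presumably to match the
  // DAG lowering's handling of a zero base.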
2345   Register Dst = MI.getOperand(0).getReg();
2346   Register Src0 = MI.getOperand(1).getReg();
2347   Register Src1 = MI.getOperand(2).getReg();
2348   unsigned Flags = MI.getFlags();
2349   LLT Ty = B.getMRI()->getType(Dst);
2350   const LLT S16 = LLT::scalar(16);
2351   const LLT S32 = LLT::scalar(32);
2352 
2353   if (Ty == S32) {
2354     auto Log = B.buildFLog2(S32, Src0, Flags);
2355     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2356       .addUse(Log.getReg(0))
2357       .addUse(Src1)
2358       .setMIFlags(Flags);
2359     B.buildFExp2(Dst, Mul, Flags);
2360   } else if (Ty == S16) {
2361     // There's no f16 fmul_legacy, so we need to convert for it.
2362     auto Log = B.buildFLog2(S16, Src0, Flags);
2363     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2364     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2365     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2366       .addUse(Ext0.getReg(0))
2367       .addUse(Ext1.getReg(0))
2368       .setMIFlags(Flags);
2369 
2370     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2371   } else
2372     return false;
2373 
2374   MI.eraseFromParent();
2375   return true;
2376 }
2377 
2378 // Find a source register, ignoring any possible source modifiers.
2379 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2380   Register ModSrc = OrigSrc;
2381   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2382     ModSrc = SrcFNeg->getOperand(1).getReg();
2383     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2384       ModSrc = SrcFAbs->getOperand(1).getReg();
2385   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2386     ModSrc = SrcFAbs->getOperand(1).getReg();
2387   return ModSrc;
2388 }
2389 
2390 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2391                                          MachineRegisterInfo &MRI,
2392                                          MachineIRBuilder &B) const {
2393 
2394   const LLT S1 = LLT::scalar(1);
2395   const LLT S64 = LLT::scalar(64);
2396   Register Dst = MI.getOperand(0).getReg();
2397   Register OrigSrc = MI.getOperand(1).getReg();
2398   unsigned Flags = MI.getFlags();
2399   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2400          "this should not have been custom lowered");
2401 
2402   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2403   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2404   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2405   // V_FRACT bug is:
2406   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2407   //
2408   // Convert floor(x) to (x - fract(x))
2409 
2410   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2411     .addUse(OrigSrc)
2412     .setMIFlags(Flags);
2413 
2414   // Give source modifier matching some assistance before obscuring a foldable
2415   // pattern.
2416 
2417   // TODO: We can avoid the neg on the fract? The input sign to fract
2418   // shouldn't matter?
2419   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2420 
2421   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
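  // Const is the largest f64 strictly less than 1.0 (1.0 - 2^-53); the
  // fminnum below implements the min(V_FRACT(x), 0.99999999999999999) part of
  // the workaround described above.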
2422 
2423   Register Min = MRI.createGenericVirtualRegister(S64);
2424 
2425   // We don't need to concern ourselves with the snan handling difference, so
2426   // use the one which will directly select.
2427   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2428   if (MFI->getMode().IEEE)
2429     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2430   else
2431     B.buildFMinNum(Min, Fract, Const, Flags);
2432 
2433   Register CorrectedFract = Min;
2434   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2435     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2436     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2437   }
2438 
2439   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2440   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2441 
2442   MI.eraseFromParent();
2443   return true;
2444 }
2445 
2446 // Turn an illegal packed v2s16 build vector into bit operations.
2447 // TODO: This should probably be a bitcast action in LegalizerHelper.
2448 bool AMDGPULegalizerInfo::legalizeBuildVector(
2449   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2450   Register Dst = MI.getOperand(0).getReg();
2451   const LLT S32 = LLT::scalar(32);
2452   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2453 
2454   Register Src0 = MI.getOperand(1).getReg();
2455   Register Src1 = MI.getOperand(2).getReg();
2456   assert(MRI.getType(Src0) == LLT::scalar(16));
2457 
2458   auto Merge = B.buildMerge(S32, {Src0, Src1});
2459   B.buildBitcast(Dst, Merge);
2460 
2461   MI.eraseFromParent();
2462   return true;
2463 }
2464 
2465 // Return the use branch instruction, otherwise null if the usage is invalid.
2466 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2467                                        MachineRegisterInfo &MRI,
2468                                        MachineInstr *&Br,
2469                                        MachineBasicBlock *&UncondBrTarget) {
2470   Register CondDef = MI.getOperand(0).getReg();
2471   if (!MRI.hasOneNonDBGUse(CondDef))
2472     return nullptr;
2473 
2474   MachineBasicBlock *Parent = MI.getParent();
2475   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2476   if (UseMI.getParent() != Parent ||
2477       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2478     return nullptr;
2479 
2480   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2481   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2482   if (Next == Parent->end()) {
2483     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2484     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2485       return nullptr;
2486     UncondBrTarget = &*NextMBB;
2487   } else {
2488     if (Next->getOpcode() != AMDGPU::G_BR)
2489       return nullptr;
2490     Br = &*Next;
2491     UncondBrTarget = Br->getOperand(0).getMBB();
2492   }
2493 
2494   return &UseMI;
2495 }
2496 
2497 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2498                                                MachineRegisterInfo &MRI,
2499                                                Register LiveIn,
2500                                                Register PhyReg) const {
2501   assert(PhyReg.isPhysical() && "Physical register expected");
2502 
2503   // Insert the live-in copy, if required, by defining destination virtual
2504   // register.
2505   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2506   if (!MRI.getVRegDef(LiveIn)) {
2507     // FIXME: Should have scoped insert pt
2508     MachineBasicBlock &OrigInsBB = B.getMBB();
2509     auto OrigInsPt = B.getInsertPt();
2510 
2511     MachineBasicBlock &EntryMBB = B.getMF().front();
2512     EntryMBB.addLiveIn(PhyReg);
2513     B.setInsertPt(EntryMBB, EntryMBB.begin());
2514     B.buildCopy(LiveIn, PhyReg);
2515 
2516     B.setInsertPt(OrigInsBB, OrigInsPt);
2517   }
2518 
2519   return LiveIn;
2520 }
2521 
2522 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2523                                                 MachineRegisterInfo &MRI,
2524                                                 Register PhyReg, LLT Ty,
2525                                                 bool InsertLiveInCopy) const {
2526   assert(PhyReg.isPhysical() && "Physical register expected");
2527 
  // Get or create the virtual live-in register.
2529   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2530   if (!LiveIn) {
2531     LiveIn = MRI.createGenericVirtualRegister(Ty);
2532     MRI.addLiveIn(PhyReg, LiveIn);
2533   }
2534 
  // When the actual copy that is ultimately required goes from a virtual
  // register to a physical register (to be inserted later), there is no need
  // to insert a live-in copy from the physical register to the virtual
  // register here.
2538   if (!InsertLiveInCopy)
2539     return LiveIn;
2540 
2541   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2542 }
2543 
2544 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2545                                          const ArgDescriptor *Arg,
2546                                          const TargetRegisterClass *ArgRC,
2547                                          LLT ArgTy) const {
2548   MCRegister SrcReg = Arg->getRegister();
2549   assert(SrcReg.isPhysical() && "Physical register expected");
2550   assert(DstReg.isVirtual() && "Virtual register expected");
2551 
2552   MachineRegisterInfo &MRI = *B.getMRI();
2553   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);
2554 
2555   if (Arg->isMasked()) {
2556     // TODO: Should we try to emit this once in the entry block?
2557     const LLT S32 = LLT::scalar(32);
2558     const unsigned Mask = Arg->getMask();
2559     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
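    // The argument occupies a contiguous bit-field within the packed physical
    // input register: shift it down to bit 0, then mask off the remaining
    // high bits.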
2560 
2561     Register AndMaskSrc = LiveIn;
2562 
2563     if (Shift != 0) {
2564       auto ShiftAmt = B.buildConstant(S32, Shift);
2565       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2566     }
2567 
2568     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2569   } else {
2570     B.buildCopy(DstReg, LiveIn);
2571   }
2572 
2573   return true;
2574 }
2575 
2576 bool AMDGPULegalizerInfo::loadInputValue(
2577     Register DstReg, MachineIRBuilder &B,
2578     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2579   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2580   const ArgDescriptor *Arg;
2581   const TargetRegisterClass *ArgRC;
2582   LLT ArgTy;
2583   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2584 
2585   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2586     return false; // TODO: Handle these
2587   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2591     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2592     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2593   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2594     return false;
2595 
2596   MI.eraseFromParent();
2597   return true;
2598 }
2599 
2600 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2601                                        MachineRegisterInfo &MRI,
2602                                        MachineIRBuilder &B) const {
2603   Register Dst = MI.getOperand(0).getReg();
2604   LLT DstTy = MRI.getType(Dst);
2605   LLT S16 = LLT::scalar(16);
2606   LLT S32 = LLT::scalar(32);
2607   LLT S64 = LLT::scalar(64);
2608 
2609   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2610     return true;
2611 
2612   if (DstTy == S16)
2613     return legalizeFDIV16(MI, MRI, B);
2614   if (DstTy == S32)
2615     return legalizeFDIV32(MI, MRI, B);
2616   if (DstTy == S64)
2617     return legalizeFDIV64(MI, MRI, B);
2618 
2619   return false;
2620 }
2621 
2622 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2623                                                   Register DstReg,
2624                                                   Register X,
2625                                                   Register Y,
2626                                                   bool IsDiv) const {
2627   const LLT S1 = LLT::scalar(1);
2628   const LLT S32 = LLT::scalar(32);
2629 
2630   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2631   // algorithm used here.
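  //
  // In short: form z ~= 2^32 / y from a float reciprocal (the 0x4f7ffffe
  // scale is just below 2^32, keeping the estimate from overshooting), refine
  // z with one Newton-Raphson step, compute q = (x * z) >> 32 and
  // r = x - q * y, then apply at most two conditional corrections to bring r
  // into [0, y).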
2632 
2633   // Initial estimate of inv(y).
2634   auto FloatY = B.buildUITOFP(S32, Y);
2635   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2636   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2637   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2638   auto Z = B.buildFPTOUI(S32, ScaledY);
2639 
2640   // One round of UNR.
2641   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2642   auto NegYZ = B.buildMul(S32, NegY, Z);
2643   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2644 
2645   // Quotient/remainder estimate.
2646   auto Q = B.buildUMulH(S32, X, Z);
2647   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2648 
2649   // First quotient/remainder refinement.
2650   auto One = B.buildConstant(S32, 1);
2651   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2652   if (IsDiv)
2653     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2654   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2655 
2656   // Second quotient/remainder refinement.
2657   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2658   if (IsDiv)
2659     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2660   else
2661     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2662 }
2663 
2664 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2665                                               MachineRegisterInfo &MRI,
2666                                               MachineIRBuilder &B) const {
2667   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2668   Register DstReg = MI.getOperand(0).getReg();
2669   Register Num = MI.getOperand(1).getReg();
2670   Register Den = MI.getOperand(2).getReg();
2671   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2672   MI.eraseFromParent();
2673   return true;
2674 }
2675 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2677 //
2678 // Return lo, hi of result
2679 //
2680 // %cvt.lo = G_UITOFP Val.lo
2681 // %cvt.hi = G_UITOFP Val.hi
2682 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2683 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2684 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2685 // %mul2 = G_FMUL %mul1, 2**(-32)
2686 // %trunc = G_INTRINSIC_TRUNC %mul2
2687 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2688 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2689 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2690                                                        Register Val) {
2691   const LLT S32 = LLT::scalar(32);
2692   auto Unmerge = B.buildUnmerge(S32, Val);
2693 
2694   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2695   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2696 
2697   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2698                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2699 
2700   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2701   auto Mul1 =
2702       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2703 
2704   // 2**(-32)
2705   auto Mul2 =
2706       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2707   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2708 
2709   // -(2**32)
2710   auto Mad2 = B.buildFMAD(S32, Trunc,
2711                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2712 
2713   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2714   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2715 
2716   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2717 }
2718 
2719 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2720                                                   Register DstReg,
2721                                                   Register Numer,
2722                                                   Register Denom,
2723                                                   bool IsDiv) const {
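  // 64-bit analogue of the 32-bit expansion above: start from a float-based
  // estimate of 2^64 / Denom (emitReciprocalU64), refine it with two rounds of
  // Newton-Raphson carried out on 32-bit pieces with explicit carries, take
  // the quotient as umulh(Numer, Rcp), and then apply up to two conditional
  // corrections (the C3/C6 selects below) to the quotient or remainder.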
2724   const LLT S32 = LLT::scalar(32);
2725   const LLT S64 = LLT::scalar(64);
2726   const LLT S1 = LLT::scalar(1);
2727   Register RcpLo, RcpHi;
2728 
2729   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2730 
2731   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2732 
2733   auto Zero64 = B.buildConstant(S64, 0);
2734   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2735 
2736   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2737   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2738 
2739   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2740   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2741   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2742 
2743   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2744   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2745   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2746   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2747 
2748   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2749   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2750   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2751   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2752   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2753 
2754   auto Zero32 = B.buildConstant(S32, 0);
2755   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2756   auto Add2_HiC =
2757       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2758   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2759   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2760 
2761   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2762   Register NumerLo = UnmergeNumer.getReg(0);
2763   Register NumerHi = UnmergeNumer.getReg(1);
2764 
2765   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2766   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2767   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2768   Register Mul3_Lo = UnmergeMul3.getReg(0);
2769   Register Mul3_Hi = UnmergeMul3.getReg(1);
2770   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2771   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2772   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2773   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2774 
2775   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2776   Register DenomLo = UnmergeDenom.getReg(0);
2777   Register DenomHi = UnmergeDenom.getReg(1);
2778 
2779   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2780   auto C1 = B.buildSExt(S32, CmpHi);
2781 
2782   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2783   auto C2 = B.buildSExt(S32, CmpLo);
2784 
2785   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2786   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2787 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we have four
  // selects after the potential endif to substitute for PHIs.
2791 
2792   // if C3 != 0 ...
2793   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2794   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2795   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2796   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2797 
2798   auto One64 = B.buildConstant(S64, 1);
2799   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2800 
2801   auto C4 =
2802       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2803   auto C5 =
2804       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2805   auto C6 = B.buildSelect(
2806       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2807 
2808   // if (C6 != 0)
2809   auto Add4 = B.buildAdd(S64, Add3, One64);
2810   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2811 
2812   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2813   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2814   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2815 
2816   // endif C6
2817   // endif C3
2818 
2819   if (IsDiv) {
2820     auto Sel1 = B.buildSelect(
2821         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2822     B.buildSelect(DstReg,
2823                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2824   } else {
2825     auto Sel2 = B.buildSelect(
2826         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2827     B.buildSelect(DstReg,
2828                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2829   }
2830 }
2831 
2832 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2833                                             MachineRegisterInfo &MRI,
2834                                             MachineIRBuilder &B) const {
2835   const LLT S64 = LLT::scalar(64);
2836   const LLT S32 = LLT::scalar(32);
2837   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2838   Register DstReg = MI.getOperand(0).getReg();
2839   Register Num = MI.getOperand(1).getReg();
2840   Register Den = MI.getOperand(2).getReg();
2841   LLT Ty = MRI.getType(DstReg);
2842 
2843   if (Ty == S32)
2844     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2845   else if (Ty == S64)
2846     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2847   else
2848     return false;
2849 
2850   MI.eraseFromParent();
  return true;
}
2854 
2855 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2856                                             MachineRegisterInfo &MRI,
2857                                             MachineIRBuilder &B) const {
2858   const LLT S64 = LLT::scalar(64);
2859   const LLT S32 = LLT::scalar(32);
2860 
2861   Register DstReg = MI.getOperand(0).getReg();
2862   const LLT Ty = MRI.getType(DstReg);
2863   if (Ty != S32 && Ty != S64)
2864     return false;
2865 
2866   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2867 
2868   Register LHS = MI.getOperand(1).getReg();
2869   Register RHS = MI.getOperand(2).getReg();
2870 
2871   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2872   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2873   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2874 
2875   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2876   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2877 
2878   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2879   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2880 
2881   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2882   if (Ty == S32)
2883     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2884   else
2885     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2886 
2887   Register Sign;
2888   if (IsDiv)
2889     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2890   else
2891     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2892 
2893   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2894   B.buildSub(DstReg, UDivRem, Sign);
2895 
2896   MI.eraseFromParent();
2897   return true;
2898 }
2899 
2900 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2901                                                  MachineRegisterInfo &MRI,
2902                                                  MachineIRBuilder &B) const {
2903   Register Res = MI.getOperand(0).getReg();
2904   Register LHS = MI.getOperand(1).getReg();
2905   Register RHS = MI.getOperand(2).getReg();
2906 
2907   uint16_t Flags = MI.getFlags();
2908 
2909   LLT ResTy = MRI.getType(Res);
2910   LLT S32 = LLT::scalar(32);
2911   LLT S64 = LLT::scalar(64);
2912 
2913   const MachineFunction &MF = B.getMF();
2914   bool Unsafe =
2915     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2916 
2917   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2918     return false;
2919 
2920   if (!Unsafe && ResTy == S32 &&
2921       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2922     return false;
2923 
2924   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2925     // 1 / x -> RCP(x)
2926     if (CLHS->isExactlyValue(1.0)) {
2927       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2928         .addUse(RHS)
2929         .setMIFlags(Flags);
2930 
2931       MI.eraseFromParent();
2932       return true;
2933     }
2934 
2935     // -1 / x -> RCP( FNEG(x) )
2936     if (CLHS->isExactlyValue(-1.0)) {
2937       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2938       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2939         .addUse(FNeg.getReg(0))
2940         .setMIFlags(Flags);
2941 
2942       MI.eraseFromParent();
2943       return true;
2944     }
2945   }
2946 
2947   // x / y -> x * (1.0 / y)
2948   if (Unsafe) {
2949     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2950       .addUse(RHS)
2951       .setMIFlags(Flags);
2952     B.buildFMul(Res, LHS, RCP, Flags);
2953 
2954     MI.eraseFromParent();
2955     return true;
2956   }
2957 
2958   return false;
2959 }
2960 
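// 16-bit fdiv is expanded by promoting to f32, forming an approximate quotient
// with the rcp intrinsic, truncating back to f16, and letting amdgcn.div.fixup
// handle the special cases. Informal sketch of the sequence built below:
//   lhs32 = fpext(lhs); rhs32 = fpext(rhs)
//   quot  = lhs32 * rcp(rhs32)
//   res   = div_fixup(fptrunc(quot), rhs, lhs)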
2961 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2962                                          MachineRegisterInfo &MRI,
2963                                          MachineIRBuilder &B) const {
2964   Register Res = MI.getOperand(0).getReg();
2965   Register LHS = MI.getOperand(1).getReg();
2966   Register RHS = MI.getOperand(2).getReg();
2967 
2968   uint16_t Flags = MI.getFlags();
2969 
2970   LLT S16 = LLT::scalar(16);
2971   LLT S32 = LLT::scalar(32);
2972 
2973   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2974   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2975 
2976   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2977     .addUse(RHSExt.getReg(0))
2978     .setMIFlags(Flags);
2979 
2980   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2981   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2982 
2983   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2984     .addUse(RDst.getReg(0))
2985     .addUse(RHS)
2986     .addUse(LHS)
2987     .setMIFlags(Flags);
2988 
2989   MI.eraseFromParent();
2990   return true;
2991 }
2992 
2993 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2994 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2995 static void toggleSPDenormMode(bool Enable,
2996                                MachineIRBuilder &B,
2997                                const GCNSubtarget &ST,
2998                                AMDGPU::SIModeRegisterDefaults Mode) {
2999   // Set SP denorm mode to this value.
3000   unsigned SPDenormMode =
3001     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3002 
3003   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3005     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3006 
3007     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3008     B.buildInstr(AMDGPU::S_DENORM_MODE)
3009       .addImm(NewDenormModeValue);
3010 
3011   } else {
3012     // Select FP32 bit field in mode register.
3013     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3014                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3015                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
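    // In effect this encodes hwreg(HW_REG_MODE, 4, 2): offset 4, width 2,
    // i.e. the FP32 denorm-control bits of the MODE register (informal
    // reading; the ISA documentation is authoritative for the encoding).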
3016 
3017     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3018       .addImm(SPDenormMode)
3019       .addImm(SPDenormModeBitField);
3020   }
3021 }
3022 
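// The FMA chain in the 32-bit case is essentially one Newton-Raphson
// refinement of the reciprocal followed by a refinement of the quotient.
// Informal sketch, with n/d the scaled numerator/denominator and r0 the
// initial rcp approximation:
//   e0 = 1 - d * r0        (Fma0)
//   r1 = r0 + r0 * e0      (Fma1, refined 1/d)
//   q0 = n * r1            (Mul)
//   e1 = n - d * q0        (Fma2)
//   q1 = q0 + r1 * e1      (Fma3)
//   e2 = n - d * q1        (Fma4, fed to div_fmas)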
3023 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3024                                          MachineRegisterInfo &MRI,
3025                                          MachineIRBuilder &B) const {
3026   Register Res = MI.getOperand(0).getReg();
3027   Register LHS = MI.getOperand(1).getReg();
3028   Register RHS = MI.getOperand(2).getReg();
3029   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3030   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3031 
3032   uint16_t Flags = MI.getFlags();
3033 
3034   LLT S32 = LLT::scalar(32);
3035   LLT S1 = LLT::scalar(1);
3036 
3037   auto One = B.buildFConstant(S32, 1.0f);
3038 
3039   auto DenominatorScaled =
3040     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3041       .addUse(LHS)
3042       .addUse(RHS)
3043       .addImm(0)
3044       .setMIFlags(Flags);
3045   auto NumeratorScaled =
3046     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3047       .addUse(LHS)
3048       .addUse(RHS)
3049       .addImm(1)
3050       .setMIFlags(Flags);
3051 
3052   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3053     .addUse(DenominatorScaled.getReg(0))
3054     .setMIFlags(Flags);
3055   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3056 
3057   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3058   // aren't modeled as reading it.
3059   if (!Mode.allFP32Denormals())
3060     toggleSPDenormMode(true, B, ST, Mode);
3061 
3062   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3063   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3064   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3065   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3066   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3067   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3068 
3069   if (!Mode.allFP32Denormals())
3070     toggleSPDenormMode(false, B, ST, Mode);
3071 
3072   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3073     .addUse(Fma4.getReg(0))
3074     .addUse(Fma1.getReg(0))
3075     .addUse(Fma3.getReg(0))
3076     .addUse(NumeratorScaled.getReg(1))
3077     .setMIFlags(Flags);
3078 
3079   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3080     .addUse(Fmas.getReg(0))
3081     .addUse(RHS)
3082     .addUse(LHS)
3083     .setMIFlags(Flags);
3084 
3085   MI.eraseFromParent();
3086   return true;
3087 }
3088 
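// The 64-bit case follows the same div_scale/div_fmas/div_fixup scheme as the
// 32-bit path above, but performs an extra reciprocal refinement step and does
// not bracket the sequence with FP32 denorm-mode changes.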
3089 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3090                                          MachineRegisterInfo &MRI,
3091                                          MachineIRBuilder &B) const {
3092   Register Res = MI.getOperand(0).getReg();
3093   Register LHS = MI.getOperand(1).getReg();
3094   Register RHS = MI.getOperand(2).getReg();
3095 
3096   uint16_t Flags = MI.getFlags();
3097 
3098   LLT S64 = LLT::scalar(64);
3099   LLT S1 = LLT::scalar(1);
3100 
3101   auto One = B.buildFConstant(S64, 1.0);
3102 
3103   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3104     .addUse(LHS)
3105     .addUse(RHS)
3106     .addImm(0)
3107     .setMIFlags(Flags);
3108 
3109   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3110 
3111   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3112     .addUse(DivScale0.getReg(0))
3113     .setMIFlags(Flags);
3114 
3115   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3116   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3117   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3118 
3119   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3120     .addUse(LHS)
3121     .addUse(RHS)
3122     .addImm(1)
3123     .setMIFlags(Flags);
3124 
3125   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3126   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3127   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3128 
3129   Register Scale;
3130   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3133 
3134     LLT S32 = LLT::scalar(32);
3135 
3136     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3137     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3138     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3139     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3140 
3141     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3142                               Scale1Unmerge.getReg(1));
3143     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3144                               Scale0Unmerge.getReg(1));
3145     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3146   } else {
3147     Scale = DivScale1.getReg(1);
3148   }
3149 
3150   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3151     .addUse(Fma4.getReg(0))
3152     .addUse(Fma3.getReg(0))
3153     .addUse(Mul.getReg(0))
3154     .addUse(Scale)
3155     .setMIFlags(Flags);
3156 
3157   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3158     .addUse(Fmas.getReg(0))
3159     .addUse(RHS)
3160     .addUse(LHS)
3161     .setMIFlags(Flags);
3162 
3163   MI.eraseFromParent();
3164   return true;
3165 }
3166 
3167 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3168                                                  MachineRegisterInfo &MRI,
3169                                                  MachineIRBuilder &B) const {
3170   Register Res = MI.getOperand(0).getReg();
3171   Register LHS = MI.getOperand(2).getReg();
3172   Register RHS = MI.getOperand(3).getReg();
3173   uint16_t Flags = MI.getFlags();
3174 
3175   LLT S32 = LLT::scalar(32);
3176   LLT S1 = LLT::scalar(1);
3177 
3178   auto Abs = B.buildFAbs(S32, RHS, Flags);
3179   const APFloat C0Val(1.0f);
3180 
3181   auto C0 = B.buildConstant(S32, 0x6f800000);
3182   auto C1 = B.buildConstant(S32, 0x2f800000);
3183   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
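  // Note: as f32 bit patterns, 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32.
  // If |RHS| exceeds 2^96, the denominator is pre-scaled by 2^-32 before the
  // rcp so the reciprocal stays in range, and the same scale factor is
  // re-applied to the final product below (informal reading of the constants).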
3184 
3185   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3186   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3187 
3188   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3189 
3190   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3191     .addUse(Mul0.getReg(0))
3192     .setMIFlags(Flags);
3193 
3194   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3195 
3196   B.buildFMul(Res, Sel, Mul1, Flags);
3197 
3198   MI.eraseFromParent();
3199   return true;
3200 }
3201 
3202 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3203                                             MachineRegisterInfo &MRI,
3204                                             MachineIRBuilder &B) const {
3205   uint64_t Offset =
3206     ST.getTargetLowering()->getImplicitParameterOffset(
3207       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3208   LLT DstTy = MRI.getType(DstReg);
3209   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3210 
3211   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3212   if (!loadInputValue(KernargPtrReg, B,
3213                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3214     return false;
3215 
3216   // FIXME: This should be nuw
3217   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3218   return true;
3219 }
3220 
3221 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3222                                                  MachineRegisterInfo &MRI,
3223                                                  MachineIRBuilder &B) const {
3224   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3225   if (!MFI->isEntryFunction()) {
3226     return legalizePreloadedArgIntrin(MI, MRI, B,
3227                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3228   }
3229 
3230   Register DstReg = MI.getOperand(0).getReg();
3231   if (!getImplicitArgPtr(DstReg, MRI, B))
3232     return false;
3233 
3234   MI.eraseFromParent();
3235   return true;
3236 }
3237 
3238 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3239                                               MachineRegisterInfo &MRI,
3240                                               MachineIRBuilder &B,
3241                                               unsigned AddrSpace) const {
3242   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3243   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3244   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3245   MI.eraseFromParent();
3246   return true;
3247 }
3248 
3249 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3250 // offset (the offset that is included in bounds checking and swizzling, to be
3251 // split between the instruction's voffset and immoffset fields) and soffset
3252 // (the offset that is excluded from bounds checking and swizzling, to go in
3253 // the instruction's soffset field).  This function takes the first kind of
3254 // offset and figures out how to split it between voffset and immoffset.
3255 std::tuple<Register, unsigned, unsigned>
3256 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3257                                         Register OrigOffset) const {
3258   const unsigned MaxImm = 4095;
3259   Register BaseReg;
3260   unsigned TotalConstOffset;
3261   MachineInstr *OffsetDef;
3262   const LLT S32 = LLT::scalar(32);
3263 
3264   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3265     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3266 
3267   unsigned ImmOffset = TotalConstOffset;
3268 
3269   // If the immediate value is too big for the immoffset field, put the value
3270   // and -4096 into the immoffset field so that the value that is copied/added
3271   // for the voffset field is a multiple of 4096, and it stands more chance
3272   // of being CSEd with the copy/add for another similar load/store.
3273   // However, do not do that rounding down to a multiple of 4096 if that is a
3274   // negative number, as it appears to be illegal to have a negative offset
3275   // in the vgpr, even if adding the immediate offset makes it positive.
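  // For example (illustrative): a constant offset of 4100 is split into
  // ImmOffset = 4 with 4096 folded into the voffset register, while a negative
  // constant such as -8 keeps ImmOffset = 0 and leaves the whole value in the
  // voffset register.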
3276   unsigned Overflow = ImmOffset & ~MaxImm;
3277   ImmOffset -= Overflow;
3278   if ((int32_t)Overflow < 0) {
3279     Overflow += ImmOffset;
3280     ImmOffset = 0;
3281   }
3282 
3283   if (Overflow != 0) {
3284     if (!BaseReg) {
3285       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3286     } else {
3287       auto OverflowVal = B.buildConstant(S32, Overflow);
3288       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3289     }
3290   }
3291 
3292   if (!BaseReg)
3293     BaseReg = B.buildConstant(S32, 0).getReg(0);
3294 
3295   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3296 }
3297 
3298 /// Handle register layout difference for f16 images for some subtargets.
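/// On subtargets with unpacked D16 VMEM, each 16-bit element must live in the
/// low half of its own 32-bit register. For example (illustrative), a
/// <4 x s16> store value is unmerged and each element any-extended, yielding a
/// <4 x s32> value.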
3299 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3300                                              MachineRegisterInfo &MRI,
3301                                              Register Reg) const {
3302   if (!ST.hasUnpackedD16VMem())
3303     return Reg;
3304 
3305   const LLT S16 = LLT::scalar(16);
3306   const LLT S32 = LLT::scalar(32);
3307   LLT StoreVT = MRI.getType(Reg);
3308   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3309 
3310   auto Unmerge = B.buildUnmerge(S16, Reg);
3311 
3312   SmallVector<Register, 4> WideRegs;
3313   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3314     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3315 
3316   int NumElts = StoreVT.getNumElements();
3317 
3318   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3319 }
3320 
3321 Register AMDGPULegalizerInfo::fixStoreSourceType(
3322   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3323   MachineRegisterInfo *MRI = B.getMRI();
3324   LLT Ty = MRI->getType(VData);
3325 
3326   const LLT S16 = LLT::scalar(16);
3327 
3328   // Fixup illegal register types for i8 stores.
3329   if (Ty == LLT::scalar(8) || Ty == S16) {
3330     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3331     return AnyExt;
3332   }
3333 
3334   if (Ty.isVector()) {
3335     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3336       if (IsFormat)
3337         return handleD16VData(B, *MRI, VData);
3338     }
3339   }
3340 
3341   return VData;
3342 }
3343 
3344 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3345                                               MachineRegisterInfo &MRI,
3346                                               MachineIRBuilder &B,
3347                                               bool IsTyped,
3348                                               bool IsFormat) const {
3349   Register VData = MI.getOperand(1).getReg();
3350   LLT Ty = MRI.getType(VData);
3351   LLT EltTy = Ty.getScalarType();
3352   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3353   const LLT S32 = LLT::scalar(32);
3354 
3355   VData = fixStoreSourceType(B, VData, IsFormat);
3356   Register RSrc = MI.getOperand(2).getReg();
3357 
3358   MachineMemOperand *MMO = *MI.memoperands_begin();
3359   const int MemSize = MMO->getSize();
3360 
3361   unsigned ImmOffset;
3362   unsigned TotalOffset;
3363 
3364   // The typed intrinsics add an immediate after the registers.
3365   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3366 
3367   // The struct intrinsic variants add one additional operand over raw.
3368   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3369   Register VIndex;
3370   int OpOffset = 0;
3371   if (HasVIndex) {
3372     VIndex = MI.getOperand(3).getReg();
3373     OpOffset = 1;
3374   }
3375 
3376   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3377   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3378 
3379   unsigned Format = 0;
3380   if (IsTyped) {
3381     Format = MI.getOperand(5 + OpOffset).getImm();
3382     ++OpOffset;
3383   }
3384 
3385   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3386 
3387   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3388   if (TotalOffset != 0)
3389     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3390 
3391   unsigned Opc;
3392   if (IsTyped) {
3393     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3394                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3395   } else if (IsFormat) {
3396     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3397                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3398   } else {
3399     switch (MemSize) {
3400     case 1:
3401       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3402       break;
3403     case 2:
3404       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3405       break;
3406     default:
3407       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3408       break;
3409     }
3410   }
3411 
3412   if (!VIndex)
3413     VIndex = B.buildConstant(S32, 0).getReg(0);
3414 
3415   auto MIB = B.buildInstr(Opc)
3416     .addUse(VData)              // vdata
3417     .addUse(RSrc)               // rsrc
3418     .addUse(VIndex)             // vindex
3419     .addUse(VOffset)            // voffset
3420     .addUse(SOffset)            // soffset
3421     .addImm(ImmOffset);         // offset(imm)
3422 
3423   if (IsTyped)
3424     MIB.addImm(Format);
3425 
3426   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3427      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3428      .addMemOperand(MMO);
3429 
3430   MI.eraseFromParent();
3431   return true;
3432 }
3433 
3434 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3435                                              MachineRegisterInfo &MRI,
3436                                              MachineIRBuilder &B,
3437                                              bool IsFormat,
3438                                              bool IsTyped) const {
3439   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3440   MachineMemOperand *MMO = *MI.memoperands_begin();
3441   const int MemSize = MMO->getSize();
3442   const LLT S32 = LLT::scalar(32);
3443 
3444   Register Dst = MI.getOperand(0).getReg();
3445   Register RSrc = MI.getOperand(2).getReg();
3446 
3447   // The typed intrinsics add an immediate after the registers.
3448   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3449 
3450   // The struct intrinsic variants add one additional operand over raw.
3451   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3452   Register VIndex;
3453   int OpOffset = 0;
3454   if (HasVIndex) {
3455     VIndex = MI.getOperand(3).getReg();
3456     OpOffset = 1;
3457   }
3458 
3459   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3460   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3461 
3462   unsigned Format = 0;
3463   if (IsTyped) {
3464     Format = MI.getOperand(5 + OpOffset).getImm();
3465     ++OpOffset;
3466   }
3467 
3468   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3469   unsigned ImmOffset;
3470   unsigned TotalOffset;
3471 
3472   LLT Ty = MRI.getType(Dst);
3473   LLT EltTy = Ty.getScalarType();
3474   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3475   const bool Unpacked = ST.hasUnpackedD16VMem();
3476 
3477   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3478   if (TotalOffset != 0)
3479     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3480 
3481   unsigned Opc;
3482 
3483   if (IsTyped) {
3484     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3485                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3486   } else if (IsFormat) {
3487     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3488                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3489   } else {
3490     switch (MemSize) {
3491     case 1:
3492       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3493       break;
3494     case 2:
3495       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3496       break;
3497     default:
3498       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3499       break;
3500     }
3501   }
3502 
3503   Register LoadDstReg;
3504 
3505   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3506   LLT UnpackedTy = Ty.changeElementSize(32);
3507 
3508   if (IsExtLoad)
3509     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3510   else if (Unpacked && IsD16 && Ty.isVector())
3511     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3512   else
3513     LoadDstReg = Dst;
3514 
3515   if (!VIndex)
3516     VIndex = B.buildConstant(S32, 0).getReg(0);
3517 
3518   auto MIB = B.buildInstr(Opc)
3519     .addDef(LoadDstReg)         // vdata
3520     .addUse(RSrc)               // rsrc
3521     .addUse(VIndex)             // vindex
3522     .addUse(VOffset)            // voffset
3523     .addUse(SOffset)            // soffset
3524     .addImm(ImmOffset);         // offset(imm)
3525 
3526   if (IsTyped)
3527     MIB.addImm(Format);
3528 
3529   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3530      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3531      .addMemOperand(MMO);
3532 
3533   if (LoadDstReg != Dst) {
3534     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3535 
    // The result was widened for an extending load, so truncate back down.
3537     if (IsExtLoad)
3538       B.buildTrunc(Dst, LoadDstReg);
3539     else {
3540       // Repack to original 16-bit vector result
3541       // FIXME: G_TRUNC should work, but legalization currently fails
3542       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3543       SmallVector<Register, 4> Repack;
3544       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3545         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3546       B.buildMerge(Dst, Repack);
3547     }
3548   }
3549 
3550   MI.eraseFromParent();
3551   return true;
3552 }
3553 
3554 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3555                                                MachineIRBuilder &B,
3556                                                bool IsInc) const {
3557   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3558                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3559   B.buildInstr(Opc)
3560     .addDef(MI.getOperand(0).getReg())
3561     .addUse(MI.getOperand(2).getReg())
3562     .addUse(MI.getOperand(3).getReg())
3563     .cloneMemRefs(MI);
3564   MI.eraseFromParent();
3565   return true;
3566 }
3567 
3568 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3569   switch (IntrID) {
3570   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3571   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3572     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3573   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3574   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3575     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3576   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3577   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3578     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3579   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3580   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3581     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3582   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3583   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3584     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3585   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3586   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3587     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3588   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3589   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3590     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3591   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3592   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3593     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3594   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3595   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3596     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3597   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3598   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3599     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3600   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3601   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3602     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3603   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3604   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3605     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3606   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3607   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3608     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3609   default:
3610     llvm_unreachable("unhandled atomic opcode");
3611   }
3612 }
3613 
3614 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3615                                                MachineIRBuilder &B,
3616                                                Intrinsic::ID IID) const {
3617   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3618                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3619 
3620   Register Dst = MI.getOperand(0).getReg();
3621   Register VData = MI.getOperand(2).getReg();
3622 
3623   Register CmpVal;
3624   int OpOffset = 0;
3625 
3626   if (IsCmpSwap) {
3627     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3628     ++OpOffset;
3629   }
3630 
3631   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3632   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3633 
3634   // The struct intrinsic variants add one additional operand over raw.
3635   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3636   Register VIndex;
3637   if (HasVIndex) {
3638     VIndex = MI.getOperand(4 + OpOffset).getReg();
3639     ++OpOffset;
3640   }
3641 
3642   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3643   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3644   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3645 
3646   MachineMemOperand *MMO = *MI.memoperands_begin();
3647 
3648   unsigned ImmOffset;
3649   unsigned TotalOffset;
3650   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3651   if (TotalOffset != 0)
3652     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3653 
3654   if (!VIndex)
3655     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3656 
3657   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3658     .addDef(Dst)
3659     .addUse(VData); // vdata
3660 
3661   if (IsCmpSwap)
3662     MIB.addReg(CmpVal);
3663 
3664   MIB.addUse(RSrc)               // rsrc
3665      .addUse(VIndex)             // vindex
3666      .addUse(VOffset)            // voffset
3667      .addUse(SOffset)            // soffset
3668      .addImm(ImmOffset)          // offset(imm)
3669      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3670      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3671      .addMemOperand(MMO);
3672 
3673   MI.eraseFromParent();
3674   return true;
3675 }
3676 
/// Pack the s16 typed address operands of \p MI into dword-sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
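/// For example (illustrative), a 2D sample with 16-bit coordinates packs
/// (x, y) into a single <2 x s16>, a 3D address packs (x, y) and (z, undef),
/// and gradient pairs are packed the same way with odd leftovers padded by
/// undef.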
3679 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3680                                         SmallVectorImpl<Register> &PackedAddrs,
3681                                         int AddrIdx, int DimIdx, int EndIdx,
3682                                         int NumGradients) {
3683   const LLT S16 = LLT::scalar(16);
3684   const LLT V2S16 = LLT::vector(2, 16);
3685 
3686   for (int I = AddrIdx; I < EndIdx; ++I) {
3687     MachineOperand &SrcOp = MI.getOperand(I);
3688     if (!SrcOp.isReg())
3689       continue; // _L to _LZ may have eliminated this.
3690 
3691     Register AddrReg = SrcOp.getReg();
3692 
3693     if (I < DimIdx) {
3694       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3695       PackedAddrs.push_back(AddrReg);
3696     } else {
3697       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3698       // derivatives dx/dh and dx/dv are packed with undef.
3699       if (((I + 1) >= EndIdx) ||
3700           ((NumGradients / 2) % 2 == 1 &&
3701            (I == DimIdx + (NumGradients / 2) - 1 ||
3702             I == DimIdx + NumGradients - 1)) ||
3703           // Check for _L to _LZ optimization
3704           !MI.getOperand(I + 1).isReg()) {
3705         PackedAddrs.push_back(
3706             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3707                 .getReg(0));
3708       } else {
3709         PackedAddrs.push_back(
3710             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3711                 .getReg(0));
3712         ++I;
3713       }
3714     }
3715   }
3716 }
3717 
3718 /// Convert from separate vaddr components to a single vector address register,
3719 /// and replace the remaining operands with $noreg.
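/// For example (illustrative), five s32 address components are padded with
/// undef up to eight and emitted as one <8 x s32> build_vector, and the
/// now-unused vaddr operands are replaced with $noreg.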
3720 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3721                                      int DimIdx, int NumVAddrs) {
3722   const LLT S32 = LLT::scalar(32);
3723 
3724   SmallVector<Register, 8> AddrRegs;
3725   for (int I = 0; I != NumVAddrs; ++I) {
3726     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3727     if (SrcOp.isReg()) {
3728       AddrRegs.push_back(SrcOp.getReg());
3729       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3730     }
3731   }
3732 
3733   int NumAddrRegs = AddrRegs.size();
3734   if (NumAddrRegs != 1) {
3735     // Round up to 8 elements for v5-v7
3736     // FIXME: Missing intermediate sized register classes and instructions.
3737     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3738       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3739       auto Undef = B.buildUndef(S32);
3740       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3741       NumAddrRegs = RoundedNumRegs;
3742     }
3743 
3744     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3745     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3746   }
3747 
3748   for (int I = 1; I != NumVAddrs; ++I) {
3749     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3750     if (SrcOp.isReg())
3751       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3752   }
3753 }
3754 
3755 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3756 ///
3757 /// Depending on the subtarget, load/store with 16-bit element data need to be
3758 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3759 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3760 /// registers.
3761 ///
3762 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3767 /// now unnecessary arguments with $noreg.
3768 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3769     MachineInstr &MI, MachineIRBuilder &B,
3770     GISelChangeObserver &Observer,
3771     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3772 
3773   const int NumDefs = MI.getNumExplicitDefs();
3774   bool IsTFE = NumDefs == 2;
3775   // We are only processing the operands of d16 image operations on subtargets
3776   // that use the unpacked register layout, or need to repack the TFE result.
3777 
3778   // TODO: Do we need to guard against already legalized intrinsics?
3779   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3780     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3781 
3782   MachineRegisterInfo *MRI = B.getMRI();
3783   const LLT S32 = LLT::scalar(32);
3784   const LLT S16 = LLT::scalar(16);
3785   const LLT V2S16 = LLT::vector(2, 16);
3786 
3787   // Index of first address argument
3788   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3789 
3790   int NumVAddrs, NumGradients;
3791   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3792   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3793     getDMaskIdx(BaseOpcode, NumDefs);
3794   unsigned DMask = 0;
3795 
3796   // Check for 16 bit addresses and pack if true.
3797   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3798   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3799   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3800   const bool IsG16 = GradTy == S16;
3801   const bool IsA16 = AddrTy == S16;
3802 
3803   int DMaskLanes = 0;
3804   if (!BaseOpcode->Atomic) {
3805     DMask = MI.getOperand(DMaskIdx).getImm();
3806     if (BaseOpcode->Gather4) {
3807       DMaskLanes = 4;
3808     } else if (DMask != 0) {
3809       DMaskLanes = countPopulation(DMask);
3810     } else if (!IsTFE && !BaseOpcode->Store) {
3811       // If dmask is 0, this is a no-op load. This can be eliminated.
3812       B.buildUndef(MI.getOperand(0));
3813       MI.eraseFromParent();
3814       return true;
3815     }
3816   }
3817 
3818   Observer.changingInstr(MI);
3819   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3820 
3821   unsigned NewOpcode = NumDefs == 0 ?
3822     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3823 
3824   // Track that we legalized this
3825   MI.setDesc(B.getTII().get(NewOpcode));
3826 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
3829   if (IsTFE && DMask == 0) {
3830     DMask = 0x1;
3831     DMaskLanes = 1;
3832     MI.getOperand(DMaskIdx).setImm(DMask);
3833   }
3834 
3835   if (BaseOpcode->Atomic) {
3836     Register VData0 = MI.getOperand(2).getReg();
3837     LLT Ty = MRI->getType(VData0);
3838 
3839     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3840     if (Ty.isVector())
3841       return false;
3842 
3843     if (BaseOpcode->AtomicX2) {
3844       Register VData1 = MI.getOperand(3).getReg();
3845       // The two values are packed in one register.
3846       LLT PackedTy = LLT::vector(2, Ty);
3847       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3848       MI.getOperand(2).setReg(Concat.getReg(0));
3849       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3850     }
3851   }
3852 
3853   int CorrectedNumVAddrs = NumVAddrs;
3854 
3855   // Optimize _L to _LZ when _L is zero
3856   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3857         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3858     const ConstantFP *ConstantLod;
3859     const int LodIdx = AddrIdx + NumVAddrs - 1;
3860 
3861     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3862       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3863         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3864         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3865           LZMappingInfo->LZ, ImageDimIntr->Dim);
3866 
3867         // The starting indexes should remain in the same place.
3868         --NumVAddrs;
3869         --CorrectedNumVAddrs;
3870 
3871         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3872           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3873         MI.RemoveOperand(LodIdx);
3874       }
3875     }
3876   }
3877 
3878   // Optimize _mip away, when 'lod' is zero
3879   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3880     int64_t ConstantLod;
3881     const int LodIdx = AddrIdx + NumVAddrs - 1;
3882 
3883     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3884       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3887         MI.getOperand(LodIdx).ChangeToImmediate(0);
3888         --CorrectedNumVAddrs;
3889       }
3890     }
3891   }
3892 
3893   // Rewrite the addressing register layout before doing anything else.
3894   if (IsA16 || IsG16) {
3895     if (IsA16) {
3896       // Target must support the feature and gradients need to be 16 bit too
3897       if (!ST.hasA16() || !IsG16)
3898         return false;
3899     } else if (!ST.hasG16())
3900       return false;
3901 
3902     if (NumVAddrs > 1) {
3903       SmallVector<Register, 4> PackedRegs;
3904       // Don't compress addresses for G16
3905       const int PackEndIdx =
3906           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3907       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3908                                   PackEndIdx, NumGradients);
3909 
3910       if (!IsA16) {
3911         // Add uncompressed address
3912         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
3914           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3915           PackedRegs.push_back(AddrReg);
3916         }
3917       }
3918 
3919       // See also below in the non-a16 branch
3920       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3921 
3922       if (!UseNSA && PackedRegs.size() > 1) {
3923         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3924         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3925         PackedRegs[0] = Concat.getReg(0);
3926         PackedRegs.resize(1);
3927       }
3928 
3929       const int NumPacked = PackedRegs.size();
3930       for (int I = 0; I != NumVAddrs; ++I) {
3931         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3932         if (!SrcOp.isReg()) {
3933           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3934           continue;
3935         }
3936 
3937         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3938 
3939         if (I < NumPacked)
3940           SrcOp.setReg(PackedRegs[I]);
3941         else
3942           SrcOp.setReg(AMDGPU::NoRegister);
3943       }
3944     }
3945   } else {
3946     // If the register allocator cannot place the address registers contiguously
3947     // without introducing moves, then using the non-sequential address encoding
3948     // is always preferable, since it saves VALU instructions and is usually a
3949     // wash in terms of code size or even better.
3950     //
3951     // However, we currently have no way of hinting to the register allocator
3952     // that MIMG addresses should be placed contiguously when it is possible to
3953     // do so, so force non-NSA for the common 2-address case as a heuristic.
3954     //
3955     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3956     // allocation when possible.
3957     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3958 
3959     if (!UseNSA && NumVAddrs > 1)
3960       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3961   }
3962 
3963   int Flags = 0;
3964   if (IsA16)
3965     Flags |= 1;
3966   if (IsG16)
3967     Flags |= 2;
3968   MI.addOperand(MachineOperand::CreateImm(Flags));
3969 
3970   if (BaseOpcode->Store) { // No TFE for stores?
3971     // TODO: Handle dmask trim
3972     Register VData = MI.getOperand(1).getReg();
3973     LLT Ty = MRI->getType(VData);
3974     if (!Ty.isVector() || Ty.getElementType() != S16)
3975       return true;
3976 
3977     Register RepackedReg = handleD16VData(B, *MRI, VData);
3978     if (RepackedReg != VData) {
3979       MI.getOperand(1).setReg(RepackedReg);
3980     }
3981 
3982     return true;
3983   }
3984 
3985   Register DstReg = MI.getOperand(0).getReg();
3986   LLT Ty = MRI->getType(DstReg);
3987   const LLT EltTy = Ty.getScalarType();
3988   const bool IsD16 = Ty.getScalarType() == S16;
3989   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3990 
3991   // Confirm that the return type is large enough for the dmask specified
3992   if (NumElts < DMaskLanes)
3993     return false;
3994 
3995   if (NumElts > 4 || DMaskLanes > 4)
3996     return false;
3997 
3998   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3999   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4000 
4001   // The raw dword aligned data component of the load. The only legal cases
4002   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4004   LLT RoundedTy;
4005 
  // S32 vector to cover all data, plus the TFE result element.
4007   LLT TFETy;
4008 
4009   // Register type to use for each loaded component. Will be S32 or V2S16.
4010   LLT RegTy;
4011 
4012   if (IsD16 && ST.hasUnpackedD16VMem()) {
4013     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4014     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4015     RegTy = S32;
4016   } else {
4017     unsigned EltSize = EltTy.getSizeInBits();
4018     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4019     unsigned RoundedSize = 32 * RoundedElts;
4020     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4021     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4022     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4023   }
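  // Illustrative example: for a packed-D16 <3 x s16> result with three dmask
  // lanes, RoundedTy becomes <4 x s16>, TFETy becomes <3 x s32>, and each
  // loaded component uses V2S16 (or S32 when TFE is enabled).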
4024 
4025   // The return type does not need adjustment.
4026   // TODO: Should we change s16 case to s32 or <2 x s16>?
4027   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4028     return true;
4029 
4030   Register Dst1Reg;
4031 
4032   // Insert after the instruction.
4033   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4034 
4035   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4036   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4037   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4038   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4039 
4040   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4041 
4042   MI.getOperand(0).setReg(NewResultReg);
4043 
4044   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4046   // register, with one additional dword beyond the loaded data. Rewrite the
4047   // return type to use a single register result.
4048 
4049   if (IsTFE) {
4050     Dst1Reg = MI.getOperand(1).getReg();
4051     if (MRI->getType(Dst1Reg) != S32)
4052       return false;
4053 
4054     // TODO: Make sure the TFE operand bit is set.
4055     MI.RemoveOperand(1);
4056 
4057     // Handle the easy case that requires no repack instructions.
4058     if (Ty == S32) {
4059       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4060       return true;
4061     }
4062   }
4063 
4064   // Now figure out how to copy the new result register back into the old
4065   // result.
4066   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4067 
4068   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4069 
4070   if (ResultNumRegs == 1) {
4071     assert(!IsTFE);
4072     ResultRegs[0] = NewResultReg;
4073   } else {
4074     // We have to repack into a new vector of some kind.
4075     for (int I = 0; I != NumDataRegs; ++I)
4076       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4077     B.buildUnmerge(ResultRegs, NewResultReg);
4078 
4079     // Drop the final TFE element to get the data part. The TFE result is
4080     // directly written to the right place already.
4081     if (IsTFE)
4082       ResultRegs.resize(NumDataRegs);
4083   }
4084 
4085   // For an s16 scalar result, we form an s32 result with a truncate regardless
4086   // of packed vs. unpacked.
4087   if (IsD16 && !Ty.isVector()) {
4088     B.buildTrunc(DstReg, ResultRegs[0]);
4089     return true;
4090   }
4091 
4092   // Avoid a build/concat_vector of 1 entry.
4093   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4094     B.buildBitcast(DstReg, ResultRegs[0]);
4095     return true;
4096   }
4097 
4098   assert(Ty.isVector());
4099 
4100   if (IsD16) {
4101     // For packed D16 results with TFE enabled, all the data components are
4102     // S32. Cast back to the expected type.
4103     //
    // TODO: We don't really need to load s32 elements. We would only need one
4105     // cast for the TFE result if a multiple of v2s16 was used.
4106     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4107       for (Register &Reg : ResultRegs)
4108         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4109     } else if (ST.hasUnpackedD16VMem()) {
4110       for (Register &Reg : ResultRegs)
4111         Reg = B.buildTrunc(S16, Reg).getReg(0);
4112     }
4113   }
4114 
4115   auto padWithUndef = [&](LLT Ty, int NumElts) {
4116     if (NumElts == 0)
4117       return;
4118     Register Undef = B.buildUndef(Ty).getReg(0);
4119     for (int I = 0; I != NumElts; ++I)
4120       ResultRegs.push_back(Undef);
4121   };
4122 
4123   // Pad out any elements eliminated due to the dmask.
4124   LLT ResTy = MRI->getType(ResultRegs[0]);
4125   if (!ResTy.isVector()) {
4126     padWithUndef(ResTy, NumElts - ResultRegs.size());
4127     B.buildBuildVector(DstReg, ResultRegs);
4128     return true;
4129   }
4130 
4131   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4132   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4133 
4134   // Deal with the one annoying legal case.
4135   const LLT V3S16 = LLT::vector(3, 16);
4136   if (Ty == V3S16) {
4137     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4138     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4139     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4140     return true;
4141   }
4142 
4143   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4144   B.buildConcatVectors(DstReg, ResultRegs);
4145   return true;
4146 }
4147 
4148 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4149   LegalizerHelper &Helper, MachineInstr &MI) const {
4150   MachineIRBuilder &B = Helper.MIRBuilder;
4151   GISelChangeObserver &Observer = Helper.Observer;
4152 
4153   Register Dst = MI.getOperand(0).getReg();
4154   LLT Ty = B.getMRI()->getType(Dst);
4155   unsigned Size = Ty.getSizeInBits();
4156   MachineFunction &MF = B.getMF();
4157 
4158   Observer.changingInstr(MI);
4159 
4160   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4161     Ty = getBitcastRegisterType(Ty);
4162     Helper.bitcastDst(MI, Ty, 0);
4163     Dst = MI.getOperand(0).getReg();
4164     B.setInsertPt(B.getMBB(), MI);
4165   }
4166 
4167   // FIXME: We don't really need this intermediate instruction. The intrinsic
4168   // should be fixed to have a memory operand. Since it's readnone, we're not
4169   // allowed to add one.
4170   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4171   MI.RemoveOperand(1); // Remove intrinsic ID
4172 
4173   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4174   // TODO: Should this use datalayout alignment?
4175   const unsigned MemSize = (Size + 7) / 8;
4176   const Align MemAlign(4);
4177   MachineMemOperand *MMO = MF.getMachineMemOperand(
4178       MachinePointerInfo(),
4179       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4180           MachineMemOperand::MOInvariant,
4181       MemSize, MemAlign);
4182   MI.addMemOperand(MF, MMO);
4183 
4184   // There are no 96-bit result scalar loads, but widening to 128-bit should
4185   // always be legal. We may need to restore this to a 96-bit result if it turns
4186   // out this needs to be converted to a vector load during RegBankSelect.
4187   if (!isPowerOf2_32(Size)) {
4188     if (Ty.isVector())
4189       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4190     else
4191       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4192   }
4193 
4194   Observer.changedInstr(MI);
4195   return true;
4196 }
4197 
4198 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4199                                                 MachineRegisterInfo &MRI,
4200                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4202   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4203       !ST.isTrapHandlerEnabled()) {
4204     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4205   } else {
4206     // Pass queue pointer to trap handler as input, and insert trap instruction
4207     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4208     MachineRegisterInfo &MRI = *B.getMRI();
4209     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4210     Register LiveIn = getLiveInRegister(
4211         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4212         /*InsertLiveInCopy=*/false);
4213     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4214       return false;
4215     B.buildCopy(SGPR01, LiveIn);
4216     B.buildInstr(AMDGPU::S_TRAP)
4217         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4218         .addReg(SGPR01, RegState::Implicit);
4219   }
4220 
4221   MI.eraseFromParent();
4222   return true;
4223 }
4224 
4225 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4226     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4229   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4230       !ST.isTrapHandlerEnabled()) {
4231     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4232                                      "debugtrap handler not supported",
4233                                      MI.getDebugLoc(), DS_Warning);
4234     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4235     Ctx.diagnose(NoTrap);
4236   } else {
4237     // Insert debug-trap instruction
4238     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4239   }
4240 
4241   MI.eraseFromParent();
4242   return true;
4243 }
4244 
4245 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4246                                             MachineInstr &MI) const {
4247   MachineIRBuilder &B = Helper.MIRBuilder;
4248   MachineRegisterInfo &MRI = *B.getMRI();
4249 
  // Replace the G_BRCOND use with the exec-manipulation and branch pseudos.
4251   auto IntrID = MI.getIntrinsicID();
4252   switch (IntrID) {
4253   case Intrinsic::amdgcn_if:
4254   case Intrinsic::amdgcn_else: {
4255     MachineInstr *Br = nullptr;
4256     MachineBasicBlock *UncondBrTarget = nullptr;
4257     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4258       const SIRegisterInfo *TRI
4259         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4260 
4261       Register Def = MI.getOperand(1).getReg();
4262       Register Use = MI.getOperand(3).getReg();
4263 
4264       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4265       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4266       if (IntrID == Intrinsic::amdgcn_if) {
4267         B.buildInstr(AMDGPU::SI_IF)
4268           .addDef(Def)
4269           .addUse(Use)
4270           .addMBB(UncondBrTarget);
4271       } else {
4272         B.buildInstr(AMDGPU::SI_ELSE)
4273           .addDef(Def)
4274           .addUse(Use)
4275           .addMBB(UncondBrTarget)
4276           .addImm(0);
4277       }
4278 
4279       if (Br) {
4280         Br->getOperand(0).setMBB(CondBrTarget);
4281       } else {
4282         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4283         // since we're swapping branch targets it needs to be reinserted.
4284         // FIXME: IRTranslator should probably not do this
4285         B.buildBr(*CondBrTarget);
4286       }
4287 
4288       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4289       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4290       MI.eraseFromParent();
4291       BrCond->eraseFromParent();
4292       return true;
4293     }
4294 
4295     return false;
4296   }
4297   case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
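    // Image intrinsics are not listed individually; they are recognized via
    // the ImageDimIntrinsicInfo lookup below.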
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}