1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of vector elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the scalar size in bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
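// Mutation that shrinks a vector so each resulting piece is at most 64 bits;
// used with fewerElementsIf to break up wide vectors.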
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so that the total size is the next
// multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
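// Pick a type of the same total size that maps cleanly onto registers: a
// single scalar for sizes up to 32 bits, otherwise 32-bit elements.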
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
137   if (Size <= 32) {
138     // <2 x s8> -> s16
139     // <4 x s8> -> s32
140     return LLT::scalar(Size);
141   }
142 
143   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
144 }
145 
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147   return [=](const LegalityQuery &Query) {
148     const LLT Ty = Query.Types[TypeIdx];
149     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
150   };
151 }
152 
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     unsigned Size = Ty.getSizeInBits();
157     assert(Size % 32 == 0);
158     return std::make_pair(
159         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
160   };
161 }
162 
163 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
164   return [=](const LegalityQuery &Query) {
165     const LLT QueryTy = Query.Types[TypeIdx];
166     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
167   };
168 }
169 
170 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
171   return [=](const LegalityQuery &Query) {
172     const LLT QueryTy = Query.Types[TypeIdx];
173     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
174   };
175 }
176 
177 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
178   return [=](const LegalityQuery &Query) {
179     const LLT QueryTy = Query.Types[TypeIdx];
180     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
181   };
182 }
183 
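// A size is usable as a register if it is a whole number of 32-bit registers,
// up to the 1024-bit maximum.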
184 static bool isRegisterSize(unsigned Size) {
185   return Size % 32 == 0 && Size <= MaxRegisterSize;
186 }
187 
188 static bool isRegisterVectorElementType(LLT EltTy) {
189   const int EltSize = EltTy.getSizeInBits();
190   return EltSize == 16 || EltSize % 32 == 0;
191 }
192 
193 static bool isRegisterVectorType(LLT Ty) {
194   const int EltSize = Ty.getElementType().getSizeInBits();
195   return EltSize == 32 || EltSize == 64 ||
196          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
197          EltSize == 128 || EltSize == 256;
198 }
199 
200 static bool isRegisterType(LLT Ty) {
201   if (!isRegisterSize(Ty.getSizeInBits()))
202     return false;
203 
204   if (Ty.isVector())
205     return isRegisterVectorType(Ty);
206 
207   return true;
208 }
209 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
212 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
213   return [=](const LegalityQuery &Query) {
214     return isRegisterType(Query.Types[TypeIdx]);
215   };
216 }
217 
218 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
219   return [=](const LegalityQuery &Query) {
220     const LLT QueryTy = Query.Types[TypeIdx];
221     if (!QueryTy.isVector())
222       return false;
223     const LLT EltTy = QueryTy.getElementType();
224     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
225   };
226 }
227 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
230 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
231   return [=](const LegalityQuery &Query) {
232     const LLT Ty = Query.Types[TypeIdx];
233     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
235   };
236 }
237 
238 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
239 // handle some operations by just promoting the register during
240 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
241 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
242                                     bool IsLoad) {
243   switch (AS) {
244   case AMDGPUAS::PRIVATE_ADDRESS:
245     // FIXME: Private element size.
246     return ST.enableFlatScratch() ? 128 : 32;
247   case AMDGPUAS::LOCAL_ADDRESS:
248     return ST.useDS128() ? 128 : 64;
249   case AMDGPUAS::GLOBAL_ADDRESS:
250   case AMDGPUAS::CONSTANT_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
258     return IsLoad ? 512 : 128;
259   default:
260     // Flat addresses may contextually need to be split to 32-bit parts if they
261     // may alias scratch depending on the subtarget.
262     return 128;
263   }
264 }
265 
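// Return true if a load/store of this register type, memory size, and
// alignment is directly supported in the queried address space (no splitting,
// widening, or pointer casts required).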
266 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
267                                  const LegalityQuery &Query) {
268   const LLT Ty = Query.Types[0];
269 
270   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
271   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
272 
273   unsigned RegSize = Ty.getSizeInBits();
274   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
275   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
276   unsigned AS = Query.Types[1].getAddressSpace();
277 
278   // All of these need to be custom lowered to cast the pointer operand.
279   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
280     return false;
281 
282   // Do not handle extending vector loads.
283   if (Ty.isVector() && MemSize != RegSize)
284     return false;
285 
286   // TODO: We should be able to widen loads if the alignment is high enough, but
287   // we also need to modify the memory access size.
288 #if 0
289   // Accept widening loads based on alignment.
290   if (IsLoad && MemSize < Size)
291     MemSize = std::max(MemSize, Align);
292 #endif
293 
294   // Only 1-byte and 2-byte to 32-bit extloads are valid.
295   if (MemSize != RegSize && RegSize != 32)
296     return false;
297 
298   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
299     return false;
300 
301   switch (MemSize) {
302   case 8:
303   case 16:
304   case 32:
305   case 64:
306   case 128:
307     break;
308   case 96:
309     if (!ST.hasDwordx3LoadStores())
310       return false;
311     break;
312   case 256:
313   case 512:
314     // These may contextually need to be broken down.
315     break;
316   default:
317     return false;
318   }
319 
320   assert(RegSize >= MemSize);
321 
322   if (AlignBits < MemSize) {
323     const SITargetLowering *TLI = ST.getTargetLowering();
324     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
325                                                  Align(AlignBits / 8)))
326       return false;
327   }
328 
329   return true;
330 }
331 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this here. Eventually it should ignore the type for loads and
// only care about the size. Return true in cases where we will work around
// this for now by bitcasting.
336 static bool loadStoreBitcastWorkaround(const LLT Ty) {
337   if (EnableNewLegality)
338     return false;
339 
340   const unsigned Size = Ty.getSizeInBits();
341   if (Size <= 64)
342     return false;
343   if (!Ty.isVector())
344     return true;
345 
346   LLT EltTy = Ty.getElementType();
347   if (EltTy.isPointer())
348     return true;
349 
350   unsigned EltSize = EltTy.getSizeInBits();
351   return EltSize != 32 && EltSize != 64;
352 }
353 
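// Combined check used by the load/store rules below: the result must be a
// register type, the access must be supported, and the bitcast workaround
// must not apply.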
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
355   const LLT Ty = Query.Types[0];
356   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
357          !loadStoreBitcastWorkaround(Ty);
358 }
359 
360 /// Return true if a load or store of the type should be lowered with a bitcast
361 /// to a different type.
362 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
363                                        const LLT MemTy) {
364   const unsigned MemSizeInBits = MemTy.getSizeInBits();
365   const unsigned Size = Ty.getSizeInBits();
366   if (Size != MemSizeInBits)
367     return Size <= 32 && Ty.isVector();
368 
369   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
370     return true;
371 
372   // Don't try to handle bitcasting vector ext loads for now.
373   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
374          (Size <= 32 || isRegisterSize(Size)) &&
375          !isRegisterVectorElementType(Ty.getElementType());
376 }
377 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
381 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
382                             uint64_t AlignInBits, unsigned AddrSpace,
383                             unsigned Opcode) {
384   unsigned SizeInBits = MemoryTy.getSizeInBits();
385   // We don't want to widen cases that are naturally legal.
386   if (isPowerOf2_32(SizeInBits))
387     return false;
388 
389   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
390   // end up widening these for a scalar load during RegBankSelect, since there
391   // aren't 96-bit scalar loads.
392   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
393     return false;
394 
395   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
396     return false;
397 
398   // A load is known dereferenceable up to the alignment, so it's legal to widen
399   // to it.
400   //
401   // TODO: Could check dereferenceable for less aligned cases.
402   unsigned RoundedSize = NextPowerOf2(SizeInBits);
403   if (AlignInBits < RoundedSize)
404     return false;
405 
406   // Do not widen if it would introduce a slow unaligned load.
407   const SITargetLowering *TLI = ST.getTargetLowering();
408   bool Fast = false;
409   return TLI->allowsMisalignedMemoryAccessesImpl(
410              RoundedSize, AddrSpace, Align(AlignInBits / 8),
411              MachineMemOperand::MOLoad, &Fast) &&
412          Fast;
413 }
414 
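// LegalityQuery overload of the above. Atomic accesses are never widened.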
415 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
416                             unsigned Opcode) {
417   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
418     return false;
419 
420   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
421                          Query.MMODescrs[0].AlignInBits,
422                          Query.Types[1].getAddressSpace(), Opcode);
423 }
424 
425 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
426                                          const GCNTargetMachine &TM)
  : ST(ST_) {
428   using namespace TargetOpcode;
429 
430   auto GetAddrSpacePtr = [&TM](unsigned AS) {
431     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
432   };
433 
434   const LLT S1 = LLT::scalar(1);
435   const LLT S8 = LLT::scalar(8);
436   const LLT S16 = LLT::scalar(16);
437   const LLT S32 = LLT::scalar(32);
438   const LLT S64 = LLT::scalar(64);
439   const LLT S128 = LLT::scalar(128);
440   const LLT S256 = LLT::scalar(256);
441   const LLT S512 = LLT::scalar(512);
442   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
443 
444   const LLT V2S8 = LLT::fixed_vector(2, 8);
445   const LLT V2S16 = LLT::fixed_vector(2, 16);
446   const LLT V4S16 = LLT::fixed_vector(4, 16);
447 
448   const LLT V2S32 = LLT::fixed_vector(2, 32);
449   const LLT V3S32 = LLT::fixed_vector(3, 32);
450   const LLT V4S32 = LLT::fixed_vector(4, 32);
451   const LLT V5S32 = LLT::fixed_vector(5, 32);
452   const LLT V6S32 = LLT::fixed_vector(6, 32);
453   const LLT V7S32 = LLT::fixed_vector(7, 32);
454   const LLT V8S32 = LLT::fixed_vector(8, 32);
455   const LLT V9S32 = LLT::fixed_vector(9, 32);
456   const LLT V10S32 = LLT::fixed_vector(10, 32);
457   const LLT V11S32 = LLT::fixed_vector(11, 32);
458   const LLT V12S32 = LLT::fixed_vector(12, 32);
459   const LLT V13S32 = LLT::fixed_vector(13, 32);
460   const LLT V14S32 = LLT::fixed_vector(14, 32);
461   const LLT V15S32 = LLT::fixed_vector(15, 32);
462   const LLT V16S32 = LLT::fixed_vector(16, 32);
463   const LLT V32S32 = LLT::fixed_vector(32, 32);
464 
465   const LLT V2S64 = LLT::fixed_vector(2, 64);
466   const LLT V3S64 = LLT::fixed_vector(3, 64);
467   const LLT V4S64 = LLT::fixed_vector(4, 64);
468   const LLT V5S64 = LLT::fixed_vector(5, 64);
469   const LLT V6S64 = LLT::fixed_vector(6, 64);
470   const LLT V7S64 = LLT::fixed_vector(7, 64);
471   const LLT V8S64 = LLT::fixed_vector(8, 64);
472   const LLT V16S64 = LLT::fixed_vector(16, 64);
473 
474   std::initializer_list<LLT> AllS32Vectors =
475     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
476      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
477   std::initializer_list<LLT> AllS64Vectors =
478     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
479 
480   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
481   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
482   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
483   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
484   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
485   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
486   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
487 
488   const LLT CodePtr = FlatPtr;
489 
490   const std::initializer_list<LLT> AddrSpaces64 = {
491     GlobalPtr, ConstantPtr, FlatPtr
492   };
493 
494   const std::initializer_list<LLT> AddrSpaces32 = {
495     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
496   };
497 
498   const std::initializer_list<LLT> FPTypesBase = {
499     S32, S64
500   };
501 
502   const std::initializer_list<LLT> FPTypes16 = {
503     S32, S64, S16
504   };
505 
506   const std::initializer_list<LLT> FPTypesPK16 = {
507     S32, S64, S16, V2S16
508   };
509 
510   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
511 
512   // s1 for VCC branches, s32 for SCC branches.
513   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
514 
515   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
516   // elements for v3s16
517   getActionDefinitionsBuilder(G_PHI)
518     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
519     .legalFor(AllS32Vectors)
520     .legalFor(AllS64Vectors)
521     .legalFor(AddrSpaces64)
522     .legalFor(AddrSpaces32)
523     .legalIf(isPointer(0))
524     .clampScalar(0, S16, S256)
525     .widenScalarToNextPow2(0, 32)
526     .clampMaxNumElements(0, S32, 16)
527     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
528     .scalarize(0);
529 
530   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
531     // Full set of gfx9 features.
532     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
533       .legalFor({S32, S16, V2S16})
534       .minScalar(0, S16)
535       .clampMaxNumElementsStrict(0, S16, 2)
536       .widenScalarToNextMultipleOf(0, 32)
537       .maxScalar(0, S32)
538       .scalarize(0);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElementsStrict(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .minScalar(0, S16)
551       .widenScalarToNextMultipleOf(0, 32)
552       .maxScalar(0, S32)
553       .scalarize(0);
554 
555     // Technically the saturating operations require clamp bit support, but this
556     // was introduced at the same time as 16-bit operations.
557     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
558       .legalFor({S32, S16}) // Clamp modifier
559       .minScalar(0, S16)
560       .scalarize(0)
561       .widenScalarToNextPow2(0, 16)
562       .lower();
563 
564     // We're just lowering this, but it helps get a better result to try to
565     // coerce to the desired type first.
566     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
567       .minScalar(0, S16)
568       .scalarize(0)
569       .lower();
570   } else {
571     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
572       .legalFor({S32})
573       .widenScalarToNextMultipleOf(0, 32)
574       .clampScalar(0, S32, S32)
575       .scalarize(0);
576 
577     if (ST.hasIntClamp()) {
578       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
579         .legalFor({S32}) // Clamp modifier.
580         .scalarize(0)
581         .minScalarOrElt(0, S32)
582         .lower();
583     } else {
584       // Clamp bit support was added in VI, along with 16-bit operations.
585       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
586         .minScalar(0, S32)
587         .scalarize(0)
588         .lower();
589     }
590 
591     // FIXME: DAG expansion gets better results. The widening uses the smaller
592     // range values and goes for the min/max lowering directly.
593     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
594       .minScalar(0, S32)
595       .scalarize(0)
596       .lower();
597   }
598 
599   getActionDefinitionsBuilder(
600       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
601       .customFor({S32, S64})
602       .clampScalar(0, S32, S64)
603       .widenScalarToNextPow2(0, 32)
604       .scalarize(0);
605 
606   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
607                    .legalFor({S32})
608                    .maxScalar(0, S32);
609 
610   if (ST.hasVOP3PInsts()) {
611     Mulh
612       .clampMaxNumElements(0, S8, 2)
613       .lowerFor({V2S8});
614   }
615 
616   Mulh
617     .scalarize(0)
618     .lower();
619 
620   // Report legal for any types we can handle anywhere. For the cases only legal
621   // on the SALU, RegBankSelect will be able to re-legalize.
622   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
623     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
624     .clampScalar(0, S32, S64)
625     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
626     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
627     .widenScalarToNextPow2(0)
628     .scalarize(0);
629 
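  // Both s1 and s32 are accepted for the carry type (type index 1); which one
  // is used depends on the register bank assigned later.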
630   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
631                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
632     .legalFor({{S32, S1}, {S32, S32}})
633     .minScalar(0, S32)
634     // TODO: .scalarize(0)
635     .lower();
636 
637   getActionDefinitionsBuilder(G_BITCAST)
638     // Don't worry about the size constraint.
639     .legalIf(all(isRegisterType(0), isRegisterType(1)))
640     .lower();
641 
642 
643   getActionDefinitionsBuilder(G_CONSTANT)
644     .legalFor({S1, S32, S64, S16, GlobalPtr,
645                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
646     .legalIf(isPointer(0))
647     .clampScalar(0, S32, S64)
648     .widenScalarToNextPow2(0);
649 
650   getActionDefinitionsBuilder(G_FCONSTANT)
651     .legalFor({S32, S64, S16})
652     .clampScalar(0, S16, S64);
653 
654   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
655       .legalIf(isRegisterType(0))
656       // s1 and s16 are special cases because they have legal operations on
657       // them, but don't really occupy registers in the normal way.
658       .legalFor({S1, S16})
659       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
660       .clampScalarOrElt(0, S32, MaxScalar)
661       .widenScalarToNextPow2(0, 32)
662       .clampMaxNumElements(0, S32, 16);
663 
664   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
665 
666   // If the amount is divergent, we have to do a wave reduction to get the
667   // maximum value, so this is expanded during RegBankSelect.
668   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
669     .legalFor({{PrivatePtr, S32}});
670 
671   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
672     .customIf(typeIsNot(0, PrivatePtr));
673 
674   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
675 
676   auto &FPOpActions = getActionDefinitionsBuilder(
677     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
678     .legalFor({S32, S64});
679   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
680     .customFor({S32, S64});
681   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
682     .customFor({S32, S64});
683 
684   if (ST.has16BitInsts()) {
685     if (ST.hasVOP3PInsts())
686       FPOpActions.legalFor({S16, V2S16});
687     else
688       FPOpActions.legalFor({S16});
689 
690     TrigActions.customFor({S16});
691     FDIVActions.customFor({S16});
692   }
693 
694   auto &MinNumMaxNum = getActionDefinitionsBuilder({
695       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
696 
697   if (ST.hasVOP3PInsts()) {
698     MinNumMaxNum.customFor(FPTypesPK16)
699       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
700       .clampMaxNumElements(0, S16, 2)
701       .clampScalar(0, S16, S64)
702       .scalarize(0);
703   } else if (ST.has16BitInsts()) {
704     MinNumMaxNum.customFor(FPTypes16)
705       .clampScalar(0, S16, S64)
706       .scalarize(0);
707   } else {
708     MinNumMaxNum.customFor(FPTypesBase)
709       .clampScalar(0, S32, S64)
710       .scalarize(0);
711   }
712 
713   if (ST.hasVOP3PInsts())
714     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
715 
716   FPOpActions
717     .scalarize(0)
718     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
719 
720   TrigActions
721     .scalarize(0)
722     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
723 
724   FDIVActions
725     .scalarize(0)
726     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
727 
728   getActionDefinitionsBuilder({G_FNEG, G_FABS})
729     .legalFor(FPTypesPK16)
730     .clampMaxNumElementsStrict(0, S16, 2)
731     .scalarize(0)
732     .clampScalar(0, S16, S64);
733 
734   if (ST.has16BitInsts()) {
735     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
736       .legalFor({S32, S64, S16})
737       .scalarize(0)
738       .clampScalar(0, S16, S64);
739   } else {
740     getActionDefinitionsBuilder(G_FSQRT)
741       .legalFor({S32, S64})
742       .scalarize(0)
743       .clampScalar(0, S32, S64);
744 
745     if (ST.hasFractBug()) {
746       getActionDefinitionsBuilder(G_FFLOOR)
747         .customFor({S64})
748         .legalFor({S32, S64})
749         .scalarize(0)
750         .clampScalar(0, S32, S64);
751     } else {
752       getActionDefinitionsBuilder(G_FFLOOR)
753         .legalFor({S32, S64})
754         .scalarize(0)
755         .clampScalar(0, S32, S64);
756     }
757   }
758 
759   getActionDefinitionsBuilder(G_FPTRUNC)
760     .legalFor({{S32, S64}, {S16, S32}})
761     .scalarize(0)
762     .lower();
763 
764   getActionDefinitionsBuilder(G_FPEXT)
765     .legalFor({{S64, S32}, {S32, S16}})
766     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
767     .scalarize(0);
768 
769   getActionDefinitionsBuilder(G_FSUB)
770       // Use actual fsub instruction
771       .legalFor({S32})
772       // Must use fadd + fneg
773       .lowerFor({S64, S16, V2S16})
774       .scalarize(0)
775       .clampScalar(0, S32, S64);
776 
777   // Whether this is legal depends on the floating point mode for the function.
778   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
779   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
780     FMad.customFor({S32, S16});
781   else if (ST.hasMadMacF32Insts())
782     FMad.customFor({S32});
783   else if (ST.hasMadF16())
784     FMad.customFor({S16});
785   FMad.scalarize(0)
786       .lower();
787 
788   auto &FRem = getActionDefinitionsBuilder(G_FREM);
789   if (ST.has16BitInsts()) {
790     FRem.customFor({S16, S32, S64});
791   } else {
792     FRem.minScalar(0, S32)
793         .customFor({S32, S64});
794   }
795   FRem.scalarize(0);
796 
797   // TODO: Do we need to clamp maximum bitwidth?
798   getActionDefinitionsBuilder(G_TRUNC)
799     .legalIf(isScalar(0))
800     .legalFor({{V2S16, V2S32}})
801     .clampMaxNumElements(0, S16, 2)
802     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
803     // situations (like an invalid implicit use), we don't want to infinite loop
804     // in the legalizer.
805     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
806     .alwaysLegal();
807 
808   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
809     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
810                {S32, S1}, {S64, S1}, {S16, S1}})
811     .scalarize(0)
812     .clampScalar(0, S32, S64)
813     .widenScalarToNextPow2(1, 32);
814 
815   // TODO: Split s1->s64 during regbankselect for VALU.
816   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
817                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
818                     .lowerIf(typeIs(1, S1))
819                     .customFor({{S32, S64}, {S64, S64}});
820   if (ST.has16BitInsts())
821     IToFP.legalFor({{S16, S16}});
822   IToFP.clampScalar(1, S32, S64)
823        .minScalar(0, S32)
824        .scalarize(0)
825        .widenScalarToNextPow2(1);
826 
827   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
828     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
829     .customFor({{S64, S32}, {S64, S64}})
830     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
831   if (ST.has16BitInsts())
832     FPToI.legalFor({{S16, S16}});
833   else
834     FPToI.minScalar(1, S32);
835 
836   FPToI.minScalar(0, S32)
837        .widenScalarToNextPow2(0, 32)
838        .scalarize(0)
839        .lower();
840 
841   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
842       .customFor({S16, S32})
843       .scalarize(0)
844       .lower();
845 
846   // Lower roundeven into G_FRINT
847   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
848     .scalarize(0)
849     .lower();
850 
851   if (ST.has16BitInsts()) {
852     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
853       .legalFor({S16, S32, S64})
854       .clampScalar(0, S16, S64)
855       .scalarize(0);
856   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
857     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
858       .legalFor({S32, S64})
859       .clampScalar(0, S32, S64)
860       .scalarize(0);
861   } else {
862     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
863       .legalFor({S32})
864       .customFor({S64})
865       .clampScalar(0, S32, S64)
866       .scalarize(0);
867   }
868 
869   getActionDefinitionsBuilder(G_PTR_ADD)
870     .legalIf(all(isPointer(0), sameSize(0, 1)))
871     .scalarize(0)
872     .scalarSameSizeAs(1, 0);
873 
874   getActionDefinitionsBuilder(G_PTRMASK)
875     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
876     .scalarSameSizeAs(1, 0)
877     .scalarize(0);
878 
879   auto &CmpBuilder =
880     getActionDefinitionsBuilder(G_ICMP)
881     // The compare output type differs based on the register bank of the output,
882     // so make both s1 and s32 legal.
883     //
884     // Scalar compares producing output in scc will be promoted to s32, as that
885     // is the allocatable register type that will be needed for the copy from
886     // scc. This will be promoted during RegBankSelect, and we assume something
887     // before that won't try to use s32 result types.
888     //
889     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
890     // bank.
891     .legalForCartesianProduct(
892       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
893     .legalForCartesianProduct(
894       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
895   if (ST.has16BitInsts()) {
896     CmpBuilder.legalFor({{S1, S16}});
897   }
898 
899   CmpBuilder
900     .widenScalarToNextPow2(1)
901     .clampScalar(1, S32, S64)
902     .scalarize(0)
903     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
904 
905   getActionDefinitionsBuilder(G_FCMP)
906     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
907     .widenScalarToNextPow2(1)
908     .clampScalar(1, S32, S64)
909     .scalarize(0);
910 
911   // FIXME: fpow has a selection pattern that should move to custom lowering.
912   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
913   if (ST.has16BitInsts())
914     Exp2Ops.legalFor({S32, S16});
915   else
916     Exp2Ops.legalFor({S32});
917   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
918   Exp2Ops.scalarize(0);
919 
920   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
921   if (ST.has16BitInsts())
922     ExpOps.customFor({{S32}, {S16}});
923   else
924     ExpOps.customFor({S32});
925   ExpOps.clampScalar(0, MinScalarFPTy, S32)
926         .scalarize(0);
927 
928   getActionDefinitionsBuilder(G_FPOWI)
929     .clampScalar(0, MinScalarFPTy, S32)
930     .lower();
931 
932   // The 64-bit versions produce 32-bit results, but only on the SALU.
933   getActionDefinitionsBuilder(G_CTPOP)
934     .legalFor({{S32, S32}, {S32, S64}})
935     .clampScalar(0, S32, S32)
936     .widenScalarToNextPow2(1, 32)
937     .clampScalar(1, S32, S64)
938     .scalarize(0)
939     .widenScalarToNextPow2(0, 32);
940 
941 
942   // The hardware instructions return a different result on 0 than the generic
943   // instructions expect. The hardware produces -1, but these produce the
944   // bitwidth.
945   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
946     .scalarize(0)
947     .clampScalar(0, S32, S32)
948     .clampScalar(1, S32, S64)
949     .widenScalarToNextPow2(0, 32)
950     .widenScalarToNextPow2(1, 32)
951     .custom();
952 
953   // The 64-bit versions produce 32-bit results, but only on the SALU.
954   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
955     .legalFor({{S32, S32}, {S32, S64}})
956     .clampScalar(0, S32, S32)
957     .clampScalar(1, S32, S64)
958     .scalarize(0)
959     .widenScalarToNextPow2(0, 32)
960     .widenScalarToNextPow2(1, 32);
961 
962   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
963   // RegBankSelect.
964   getActionDefinitionsBuilder(G_BITREVERSE)
965     .legalFor({S32, S64})
966     .clampScalar(0, S32, S64)
967     .scalarize(0)
968     .widenScalarToNextPow2(0);
969 
970   if (ST.has16BitInsts()) {
971     getActionDefinitionsBuilder(G_BSWAP)
972       .legalFor({S16, S32, V2S16})
973       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
976       .widenScalarToNextPow2(0)
977       .clampScalar(0, S16, S32)
978       .scalarize(0);
979 
980     if (ST.hasVOP3PInsts()) {
981       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
982         .legalFor({S32, S16, V2S16})
983         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
984         .clampMaxNumElements(0, S16, 2)
985         .minScalar(0, S16)
986         .widenScalarToNextPow2(0)
987         .scalarize(0)
988         .lower();
989     } else {
990       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
991         .legalFor({S32, S16})
992         .widenScalarToNextPow2(0)
993         .minScalar(0, S16)
994         .scalarize(0)
995         .lower();
996     }
997   } else {
998     // TODO: Should have same legality without v_perm_b32
999     getActionDefinitionsBuilder(G_BSWAP)
1000       .legalFor({S32})
1001       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1004       .widenScalarToNextPow2(0)
1005       .maxScalar(0, S32)
1006       .scalarize(0)
1007       .lower();
1008 
1009     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1010       .legalFor({S32})
1011       .minScalar(0, S32)
1012       .widenScalarToNextPow2(0)
1013       .scalarize(0)
1014       .lower();
1015   }
1016 
1017   getActionDefinitionsBuilder(G_INTTOPTR)
1018     // List the common cases
1019     .legalForCartesianProduct(AddrSpaces64, {S64})
1020     .legalForCartesianProduct(AddrSpaces32, {S32})
1021     .scalarize(0)
1022     // Accept any address space as long as the size matches
1023     .legalIf(sameSize(0, 1))
1024     .widenScalarIf(smallerThan(1, 0),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1027       })
1028     .narrowScalarIf(largerThan(1, 0),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1031       });
1032 
1033   getActionDefinitionsBuilder(G_PTRTOINT)
1034     // List the common cases
1035     .legalForCartesianProduct(AddrSpaces64, {S64})
1036     .legalForCartesianProduct(AddrSpaces32, {S32})
1037     .scalarize(0)
1038     // Accept any address space as long as the size matches
1039     .legalIf(sameSize(0, 1))
1040     .widenScalarIf(smallerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       })
1044     .narrowScalarIf(
1045       largerThan(0, 1),
1046       [](const LegalityQuery &Query) {
1047         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1048       });
1049 
1050   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1051     .scalarize(0)
1052     .custom();
1053 
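  // Whether a load/store must be split into multiple memory accesses: vector
  // extloads, accesses wider than the address space allows, and register
  // counts the hardware can't directly use (dwordx3 without support, or other
  // non-power-of-2 counts).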
1054   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1055                                     bool IsLoad) -> bool {
1056     const LLT DstTy = Query.Types[0];
1057 
1058     // Split vector extloads.
1059     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1060 
1061     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1062       return true;
1063 
1064     const LLT PtrTy = Query.Types[1];
1065     unsigned AS = PtrTy.getAddressSpace();
1066     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1067       return true;
1068 
1069     // Catch weird sized loads that don't evenly divide into the access sizes
1070     // TODO: May be able to widen depending on alignment etc.
1071     unsigned NumRegs = (MemSize + 31) / 32;
1072     if (NumRegs == 3) {
1073       if (!ST.hasDwordx3LoadStores())
1074         return true;
1075     } else {
1076       // If the alignment allows, these should have been widened.
1077       if (!isPowerOf2_32(NumRegs))
1078         return true;
1079     }
1080 
1081     return false;
1082   };
1083 
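  // Alignment (in bits) used by the explicit global/constant rules below; 0
  // when the subtarget supports unaligned buffer access.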
1084   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1085   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1086   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1087 
1088   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1089   // LDS
1090   // TODO: Unsupported flat for SI.
1091 
1092   for (unsigned Op : {G_LOAD, G_STORE}) {
1093     const bool IsStore = Op == G_STORE;
1094 
1095     auto &Actions = getActionDefinitionsBuilder(Op);
1096     // Explicitly list some common cases.
1097     // TODO: Does this help compile time at all?
1098     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1099                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1100                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1101                                       {S64, GlobalPtr, S64, GlobalAlign32},
1102                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1103                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1104                                       {S32, GlobalPtr, S8, GlobalAlign8},
1105                                       {S32, GlobalPtr, S16, GlobalAlign16},
1106 
1107                                       {S32, LocalPtr, S32, 32},
1108                                       {S64, LocalPtr, S64, 32},
1109                                       {V2S32, LocalPtr, V2S32, 32},
1110                                       {S32, LocalPtr, S8, 8},
1111                                       {S32, LocalPtr, S16, 16},
1112                                       {V2S16, LocalPtr, S32, 32},
1113 
1114                                       {S32, PrivatePtr, S32, 32},
1115                                       {S32, PrivatePtr, S8, 8},
1116                                       {S32, PrivatePtr, S16, 16},
1117                                       {V2S16, PrivatePtr, S32, 32},
1118 
1119                                       {S32, ConstantPtr, S32, GlobalAlign32},
1120                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1121                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1122                                       {S64, ConstantPtr, S64, GlobalAlign32},
1123                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1124     Actions.legalIf(
1125       [=](const LegalityQuery &Query) -> bool {
1126         return isLoadStoreLegal(ST, Query);
1127       });
1128 
1129     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1130     // 64-bits.
1131     //
1132     // TODO: Should generalize bitcast action into coerce, which will also cover
1133     // inserting addrspacecasts.
1134     Actions.customIf(typeIs(1, Constant32Ptr));
1135 
1136     // Turn any illegal element vectors into something easier to deal
1137     // with. These will ultimately produce 32-bit scalar shifts to extract the
1138     // parts anyway.
1139     //
1140     // For odd 16-bit element vectors, prefer to split those into pieces with
1141     // 16-bit vector parts.
1142     Actions.bitcastIf(
1143       [=](const LegalityQuery &Query) -> bool {
1144         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1145                                           Query.MMODescrs[0].MemoryTy);
1146       }, bitcastToRegisterType(0));
1147 
1148     if (!IsStore) {
1149       // Widen suitably aligned loads by loading extra bytes. The standard
1150       // legalization actions can't properly express widening memory operands.
1151       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1152         return shouldWidenLoad(ST, Query, G_LOAD);
1153       });
1154     }
1155 
1156     // FIXME: load/store narrowing should be moved to lower action
1157     Actions
1158         .narrowScalarIf(
1159             [=](const LegalityQuery &Query) -> bool {
1160               return !Query.Types[0].isVector() &&
1161                      needToSplitMemOp(Query, Op == G_LOAD);
1162             },
1163             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1164               const LLT DstTy = Query.Types[0];
1165               const LLT PtrTy = Query.Types[1];
1166 
1167               const unsigned DstSize = DstTy.getSizeInBits();
1168               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1169 
1170               // Split extloads.
1171               if (DstSize > MemSize)
1172                 return std::make_pair(0, LLT::scalar(MemSize));
1173 
1174               unsigned MaxSize = maxSizeForAddrSpace(ST,
1175                                                      PtrTy.getAddressSpace(),
1176                                                      Op == G_LOAD);
1177               if (MemSize > MaxSize)
1178                 return std::make_pair(0, LLT::scalar(MaxSize));
1179 
1180               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1181               return std::make_pair(0, LLT::scalar(Align));
1182             })
1183         .fewerElementsIf(
1184             [=](const LegalityQuery &Query) -> bool {
1185               return Query.Types[0].isVector() &&
1186                      needToSplitMemOp(Query, Op == G_LOAD);
1187             },
1188             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1189               const LLT DstTy = Query.Types[0];
1190               const LLT PtrTy = Query.Types[1];
1191 
1192               LLT EltTy = DstTy.getElementType();
1193               unsigned MaxSize = maxSizeForAddrSpace(ST,
1194                                                      PtrTy.getAddressSpace(),
1195                                                      Op == G_LOAD);
1196 
1197               // FIXME: Handle widened to power of 2 results better. This ends
1198               // up scalarizing.
1199               // FIXME: 3 element stores scalarized on SI
1200 
1201               // Split if it's too large for the address space.
1202               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1203               if (MemSize > MaxSize) {
1204                 unsigned NumElts = DstTy.getNumElements();
1205                 unsigned EltSize = EltTy.getSizeInBits();
1206 
1207                 if (MaxSize % EltSize == 0) {
1208                   return std::make_pair(
1209                       0, LLT::scalarOrVector(
1210                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1211                 }
1212 
1213                 unsigned NumPieces = MemSize / MaxSize;
1214 
1215                 // FIXME: Refine when odd breakdowns handled
1216                 // The scalars will need to be re-legalized.
1217                 if (NumPieces == 1 || NumPieces >= NumElts ||
1218                     NumElts % NumPieces != 0)
1219                   return std::make_pair(0, EltTy);
1220 
1221                 return std::make_pair(
1222                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1223               }
1224 
1225               // FIXME: We could probably handle weird extending loads better.
1226               if (DstTy.getSizeInBits() > MemSize)
1227                 return std::make_pair(0, EltTy);
1228 
1229               unsigned EltSize = EltTy.getSizeInBits();
1230               unsigned DstSize = DstTy.getSizeInBits();
1231               if (!isPowerOf2_32(DstSize)) {
1232                 // We're probably decomposing an odd sized store. Try to split
1233                 // to the widest type. TODO: Account for alignment. As-is it
1234                 // should be OK, since the new parts will be further legalized.
1235                 unsigned FloorSize = PowerOf2Floor(DstSize);
1236                 return std::make_pair(
1237                     0, LLT::scalarOrVector(
1238                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1239               }
1240 
1241               // May need relegalization for the scalars.
1242               return std::make_pair(0, EltTy);
1243             })
1244     .minScalar(0, S32)
1245     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1246     .widenScalarToNextPow2(0)
1247     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1248     .lower();
1249   }
1250 
1251   // FIXME: Unaligned accesses not lowered.
1252   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1253                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1254                                                   {S32, GlobalPtr, S16, 2 * 8},
1255                                                   {S32, LocalPtr, S8, 8},
1256                                                   {S32, LocalPtr, S16, 16},
1257                                                   {S32, PrivatePtr, S8, 8},
1258                                                   {S32, PrivatePtr, S16, 16},
1259                                                   {S32, ConstantPtr, S8, 8},
1260                                                   {S32, ConstantPtr, S16, 2 * 8}})
1261                        .legalIf(
1262                          [=](const LegalityQuery &Query) -> bool {
1263                            return isLoadStoreLegal(ST, Query);
1264                          });
1265 
1266   if (ST.hasFlatAddressSpace()) {
1267     ExtLoads.legalForTypesWithMemDesc(
1268         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1269   }
1270 
1271   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1272   // 64-bits.
1273   //
1274   // TODO: Should generalize bitcast action into coerce, which will also cover
1275   // inserting addrspacecasts.
1276   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1277 
1278   ExtLoads.clampScalar(0, S32, S32)
1279           .widenScalarToNextPow2(0)
1280           .lower();
1281 
1282   auto &Atomics = getActionDefinitionsBuilder(
1283     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1284      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1285      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1286      G_ATOMICRMW_UMIN})
1287     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1288                {S64, GlobalPtr}, {S64, LocalPtr},
1289                {S32, RegionPtr}, {S64, RegionPtr}});
1290   if (ST.hasFlatAddressSpace()) {
1291     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1292   }
1293 
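  // FP atomic support varies by subtarget, so build up the legal set per
  // feature.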
1294   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1295   if (ST.hasLDSFPAtomicAdd()) {
1296     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1297     if (ST.hasGFX90AInsts())
1298       Atomic.legalFor({{S64, LocalPtr}});
1299     if (ST.hasGFX940Insts())
1300       Atomic.legalFor({{V2S16, LocalPtr}});
1301   }
1302   if (ST.hasAtomicFaddInsts())
1303     Atomic.legalFor({{S32, GlobalPtr}});
1304 
1305   if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations.
1308     // TODO: Move atomic expansion into legalizer
1309     // TODO: Also supports <2 x f16>
1310     Atomic.legalFor({
1311         {S32, GlobalPtr},
1312         {S64, GlobalPtr},
1313         {S64, FlatPtr}
1314       });
1315   }
1316 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
1319   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1320     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1321                 {S32, FlatPtr}, {S64, FlatPtr}})
1322     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1323                {S32, RegionPtr}, {S64, RegionPtr}});
1324   // TODO: Pointer types, any 32-bit or 64-bit vector
1325 
1326   // Condition should be s32 for scalar, s1 for vector.
1327   getActionDefinitionsBuilder(G_SELECT)
1328       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1329                                  LocalPtr, FlatPtr, PrivatePtr,
1330                                  LLT::fixed_vector(2, LocalPtr),
1331                                  LLT::fixed_vector(2, PrivatePtr)},
1332                                 {S1, S32})
1333       .clampScalar(0, S16, S64)
1334       .scalarize(1)
1335       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1336       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1337       .clampMaxNumElements(0, S32, 2)
1338       .clampMaxNumElements(0, LocalPtr, 2)
1339       .clampMaxNumElements(0, PrivatePtr, 2)
1340       .scalarize(0)
1341       .widenScalarToNextPow2(0)
1342       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1343 
1344   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1345   // be more flexible with the shift amount type.
1346   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1347     .legalFor({{S32, S32}, {S64, S32}});
1348   if (ST.has16BitInsts()) {
1349     if (ST.hasVOP3PInsts()) {
1350       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1351             .clampMaxNumElements(0, S16, 2);
1352     } else
1353       Shifts.legalFor({{S16, S16}});
1354 
1355     // TODO: Support 16-bit shift amounts for all types
1356     Shifts.widenScalarIf(
1357       [=](const LegalityQuery &Query) {
1358         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1359         // 32-bit amount.
1360         const LLT ValTy = Query.Types[0];
1361         const LLT AmountTy = Query.Types[1];
1362         return ValTy.getSizeInBits() <= 16 &&
1363                AmountTy.getSizeInBits() < 16;
1364       }, changeTo(1, S16));
1365     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1366     Shifts.clampScalar(1, S32, S32);
1367     Shifts.widenScalarToNextPow2(0, 16);
1368     Shifts.clampScalar(0, S16, S64);
1369 
1370     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1371       .minScalar(0, S16)
1372       .scalarize(0)
1373       .lower();
1374   } else {
1375     // Make sure we legalize the shift amount type first, as the general
1376     // expansion for the shifted type will produce much worse code if it hasn't
1377     // been truncated already.
1378     Shifts.clampScalar(1, S32, S32);
1379     Shifts.widenScalarToNextPow2(0, 32);
1380     Shifts.clampScalar(0, S32, S64);
1381 
1382     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1383       .minScalar(0, S32)
1384       .scalarize(0)
1385       .lower();
1386   }
1387   Shifts.scalarize(0);
1388 
1389   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1390     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1391     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1392     unsigned IdxTypeIdx = 2;
1393 
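    // 32-bit and 64-bit element access into register-sized vectors with a
    // 32-bit index type is handled by custom lowering; narrower or wider
    // element types are first bitcast to 32-bit or 64-bit element vectors.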
1394     getActionDefinitionsBuilder(Op)
1395       .customIf([=](const LegalityQuery &Query) {
1396           const LLT EltTy = Query.Types[EltTypeIdx];
1397           const LLT VecTy = Query.Types[VecTypeIdx];
1398           const LLT IdxTy = Query.Types[IdxTypeIdx];
1399           const unsigned EltSize = EltTy.getSizeInBits();
1400           return (EltSize == 32 || EltSize == 64) &&
1401                   VecTy.getSizeInBits() % 32 == 0 &&
1402                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1403                   IdxTy.getSizeInBits() == 32;
1404         })
1405       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1406                  bitcastToVectorElement32(VecTypeIdx))
1407       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1408       .bitcastIf(
1409         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1410         [=](const LegalityQuery &Query) {
1411           // For > 64-bit element types, try to turn this into a 64-bit
1412           // element vector since we may be able to do better indexing
1413           // if this is scalar. If not, fall back to 32.
1414           const LLT EltTy = Query.Types[EltTypeIdx];
1415           const LLT VecTy = Query.Types[VecTypeIdx];
1416           const unsigned DstEltSize = EltTy.getSizeInBits();
1417           const unsigned VecSize = VecTy.getSizeInBits();
1418 
1419           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1420           return std::make_pair(
1421               VecTypeIdx,
1422               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1423         })
1424       .clampScalar(EltTypeIdx, S32, S64)
1425       .clampScalar(VecTypeIdx, S32, S64)
1426       .clampScalar(IdxTypeIdx, S32, S32)
1427       .clampMaxNumElements(VecTypeIdx, S32, 32)
1428       // TODO: Clamp elements for 64-bit vectors?
1429       // It should only be necessary with variable indexes.
1430       // As a last resort, lower to the stack
1431       .lower();
1432   }
1433 
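       // Reject extracts whose result type does not match the vector's element
       // type.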
1434   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1435     .unsupportedIf([=](const LegalityQuery &Query) {
1436         const LLT &EltTy = Query.Types[1].getElementType();
1437         return Query.Types[0] != EltTy;
1438       });
1439 
1440   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1441     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1442     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1443 
1444     // FIXME: Doesn't handle extract of illegal sizes.
1445     getActionDefinitionsBuilder(Op)
1446       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1447       .lowerIf([=](const LegalityQuery &Query) {
1448           // Sub-vector (or single element) insert and extract.
1449           // TODO: verify immediate offset here since lower only works with
1450           // whole elements.
1451           const LLT BigTy = Query.Types[BigTyIdx];
1452           return BigTy.isVector();
1453         })
1454       // FIXME: Multiples of 16 should not be legal.
1455       .legalIf([=](const LegalityQuery &Query) {
1456           const LLT BigTy = Query.Types[BigTyIdx];
1457           const LLT LitTy = Query.Types[LitTyIdx];
1458           return (BigTy.getSizeInBits() % 32 == 0) &&
1459                  (LitTy.getSizeInBits() % 16 == 0);
1460         })
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT BigTy = Query.Types[BigTyIdx];
1464           return (BigTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1467       .widenScalarIf(
1468         [=](const LegalityQuery &Query) {
1469           const LLT LitTy = Query.Types[LitTyIdx];
1470           return (LitTy.getScalarSizeInBits() < 16);
1471         },
1472         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1473       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1474       .widenScalarToNextPow2(BigTyIdx, 32);
1475 
1476   }
1477 
1478   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1479     .legalForCartesianProduct(AllS32Vectors, {S32})
1480     .legalForCartesianProduct(AllS64Vectors, {S64})
1481     .clampNumElements(0, V16S32, V32S32)
1482     .clampNumElements(0, V2S64, V16S64)
1483     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1484 
1485   if (ST.hasScalarPackInsts()) {
1486     BuildVector
1487       // FIXME: Should probably widen s1 vectors straight to s32
1488       .minScalarOrElt(0, S16)
1489       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1490       .minScalar(1, S32);
1491 
1492     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1493       .legalFor({V2S16, S32})
1494       .lower();
1495     BuildVector.minScalarOrElt(0, S32);
1496   } else {
1497     BuildVector.customFor({V2S16, S16});
1498     BuildVector.minScalarOrElt(0, S32);
1499 
1500     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1501       .customFor({V2S16, S32})
1502       .lower();
1503   }
1504 
1505   BuildVector.legalIf(isRegisterType(0));
1506 
1507   // FIXME: Clamp maximum size
1508   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1509     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1510     .clampMaxNumElements(0, S32, 32)
1511     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1512     .clampMaxNumElements(0, S16, 64);
1513 
1514   // TODO: Don't fully scalarize v2s16 pieces? Or combine them out
1515   // before legalization.
1516   if (ST.hasVOP3PInsts()) {
1517     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1518       .customFor({V2S16, V2S16})
1519       .lower();
1520   } else
1521     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1522 
1523   // Merge/Unmerge
1524   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1525     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1526     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1527 
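         // Vector elements narrower than 8 bits, wider than 512 bits, or with a
         // non-power-of-2 size are broken up into scalars by the rules below.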
1528     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1529       const LLT Ty = Query.Types[TypeIdx];
1530       if (Ty.isVector()) {
1531         const LLT &EltTy = Ty.getElementType();
1532         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1533           return true;
1534         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1535           return true;
1536       }
1537       return false;
1538     };
1539 
1540     auto &Builder = getActionDefinitionsBuilder(Op)
1541       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1542       .lowerFor({{S16, V2S16}})
1543       .lowerIf([=](const LegalityQuery &Query) {
1544           const LLT BigTy = Query.Types[BigTyIdx];
1545           return BigTy.getSizeInBits() == 32;
1546         })
1547       // Try to widen to s16 first for small types.
1548       // TODO: Only do this on targets with legal s16 shifts
1549       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1550       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1551       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1552       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1553                            elementTypeIs(1, S16)),
1554                        changeTo(1, V2S16))
1555       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1556       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1557       // valid.
1558       .clampScalar(LitTyIdx, S32, S512)
1559       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1560       // Break up vectors with weird elements into scalars
1561       .fewerElementsIf(
1562         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1563         scalarize(0))
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1566         scalarize(1))
1567       .clampScalar(BigTyIdx, S32, MaxScalar);
1568 
1569     if (Op == G_MERGE_VALUES) {
1570       Builder.widenScalarIf(
1571         // TODO: Use 16-bit shifts if legal for 8-bit values?
1572         [=](const LegalityQuery &Query) {
1573           const LLT Ty = Query.Types[LitTyIdx];
1574           return Ty.getSizeInBits() < 32;
1575         },
1576         changeTo(LitTyIdx, S32));
1577     }
1578 
1579     Builder.widenScalarIf(
1580       [=](const LegalityQuery &Query) {
1581         const LLT Ty = Query.Types[BigTyIdx];
1582         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1583           Ty.getSizeInBits() % 16 != 0;
1584       },
1585       [=](const LegalityQuery &Query) {
1586         // Pick the next power of 2, or a multiple of 64 over 128,
1587         // whichever is smaller.
1588         const LLT &Ty = Query.Types[BigTyIdx];
1589         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1590         if (NewSizeInBits >= 256) {
1591           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1592           if (RoundedTo < NewSizeInBits)
1593             NewSizeInBits = RoundedTo;
1594         }
1595         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1596       })
1597       // Any vectors left are the wrong size. Scalarize them.
1598       .scalarize(0)
1599       .scalarize(1);
1600   }
1601 
1602   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1603   // RegBankSelect.
1604   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1605     .legalFor({{S32}, {S64}});
1606 
1607   if (ST.hasVOP3PInsts()) {
1608     SextInReg.lowerFor({{V2S16}})
1609       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1610       // get more vector shift opportunities, since we'll get those when
1611       // expanded.
1612       .clampMaxNumElementsStrict(0, S16, 2);
1613   } else if (ST.has16BitInsts()) {
1614     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1615   } else {
1616     // Prefer to promote to s32 before lowering if we don't have 16-bit
1617     // shifts. This avoids a lot of intermediate truncate and extend operations.
1618     SextInReg.lowerFor({{S32}, {S64}});
1619   }
1620 
1621   SextInReg
1622     .scalarize(0)
1623     .clampScalar(0, S32, S64)
1624     .lower();
1625 
1626   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1627     .scalarize(0)
1628     .lower();
1629 
1630   // TODO: Only try to form v2s16 with legal packed instructions.
1631   getActionDefinitionsBuilder(G_FSHR)
1632     .legalFor({{S32, S32}})
1633     .lowerFor({{V2S16, V2S16}})
1634     .clampMaxNumElementsStrict(0, S16, 2)
1635     .scalarize(0)
1636     .lower();
1637 
1638   if (ST.hasVOP3PInsts()) {
1639     getActionDefinitionsBuilder(G_FSHL)
1640       .lowerFor({{V2S16, V2S16}})
1641       .clampMaxNumElementsStrict(0, S16, 2)
1642       .scalarize(0)
1643       .lower();
1644   } else {
1645     getActionDefinitionsBuilder(G_FSHL)
1646       .scalarize(0)
1647       .lower();
1648   }
1649 
1650   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1651     .legalFor({S64});
1652 
1653   getActionDefinitionsBuilder(G_FENCE)
1654     .alwaysLegal();
1655 
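       // Overflow-checking multiplies are scalarized, promoted to at least
       // 32 bits, and then expanded.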
1656   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1657       .scalarize(0)
1658       .minScalar(0, S32)
1659       .lower();
1660 
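       // Signed and unsigned bitfield extracts are legal for 32-bit and 64-bit
       // values with 32-bit offset and width operands.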
1661   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1662       .legalFor({{S32, S32}, {S64, S32}})
1663       .clampScalar(1, S32, S32)
1664       .clampScalar(0, S32, S64)
1665       .widenScalarToNextPow2(0)
1666       .scalarize(0);
1667 
1668   getActionDefinitionsBuilder({
1669       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1670       G_FCOPYSIGN,
1671 
1672       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1673       G_ATOMICRMW_NAND,
1674       G_ATOMICRMW_FSUB,
1675       G_READ_REGISTER,
1676       G_WRITE_REGISTER,
1677 
1678       G_SADDO, G_SSUBO,
1679 
1680        // TODO: Implement
1681       G_FMINIMUM, G_FMAXIMUM}).lower();
1682 
1683   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1684       .lower();
1685 
1686   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1687         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1688         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1689     .unsupported();
1690 
1691   getLegacyLegalizerInfo().computeTables();
1692   verify(*ST.getInstrInfo());
1693 }
1694 
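     // Dispatch for instructions marked Custom above. Returning true means the
     // instruction was handled (or needs no further changes); returning false
     // reports a failure to legalize.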
1695 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1696                                          MachineInstr &MI) const {
1697   MachineIRBuilder &B = Helper.MIRBuilder;
1698   MachineRegisterInfo &MRI = *B.getMRI();
1699 
1700   switch (MI.getOpcode()) {
1701   case TargetOpcode::G_ADDRSPACE_CAST:
1702     return legalizeAddrSpaceCast(MI, MRI, B);
1703   case TargetOpcode::G_FRINT:
1704     return legalizeFrint(MI, MRI, B);
1705   case TargetOpcode::G_FCEIL:
1706     return legalizeFceil(MI, MRI, B);
1707   case TargetOpcode::G_FREM:
1708     return legalizeFrem(MI, MRI, B);
1709   case TargetOpcode::G_INTRINSIC_TRUNC:
1710     return legalizeIntrinsicTrunc(MI, MRI, B);
1711   case TargetOpcode::G_SITOFP:
1712     return legalizeITOFP(MI, MRI, B, true);
1713   case TargetOpcode::G_UITOFP:
1714     return legalizeITOFP(MI, MRI, B, false);
1715   case TargetOpcode::G_FPTOSI:
1716     return legalizeFPTOI(MI, MRI, B, true);
1717   case TargetOpcode::G_FPTOUI:
1718     return legalizeFPTOI(MI, MRI, B, false);
1719   case TargetOpcode::G_FMINNUM:
1720   case TargetOpcode::G_FMAXNUM:
1721   case TargetOpcode::G_FMINNUM_IEEE:
1722   case TargetOpcode::G_FMAXNUM_IEEE:
1723     return legalizeMinNumMaxNum(Helper, MI);
1724   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1725     return legalizeExtractVectorElt(MI, MRI, B);
1726   case TargetOpcode::G_INSERT_VECTOR_ELT:
1727     return legalizeInsertVectorElt(MI, MRI, B);
1728   case TargetOpcode::G_SHUFFLE_VECTOR:
1729     return legalizeShuffleVector(MI, MRI, B);
1730   case TargetOpcode::G_FSIN:
1731   case TargetOpcode::G_FCOS:
1732     return legalizeSinCos(MI, MRI, B);
1733   case TargetOpcode::G_GLOBAL_VALUE:
1734     return legalizeGlobalValue(MI, MRI, B);
1735   case TargetOpcode::G_LOAD:
1736   case TargetOpcode::G_SEXTLOAD:
1737   case TargetOpcode::G_ZEXTLOAD:
1738     return legalizeLoad(Helper, MI);
1739   case TargetOpcode::G_FMAD:
1740     return legalizeFMad(MI, MRI, B);
1741   case TargetOpcode::G_FDIV:
1742     return legalizeFDIV(MI, MRI, B);
1743   case TargetOpcode::G_UDIV:
1744   case TargetOpcode::G_UREM:
1745   case TargetOpcode::G_UDIVREM:
1746     return legalizeUnsignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_SDIV:
1748   case TargetOpcode::G_SREM:
1749   case TargetOpcode::G_SDIVREM:
1750     return legalizeSignedDIV_REM(MI, MRI, B);
1751   case TargetOpcode::G_ATOMIC_CMPXCHG:
1752     return legalizeAtomicCmpXChg(MI, MRI, B);
1753   case TargetOpcode::G_FLOG:
1754     return legalizeFlog(MI, B, numbers::ln2f);
1755   case TargetOpcode::G_FLOG10:
1756     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1757   case TargetOpcode::G_FEXP:
1758     return legalizeFExp(MI, B);
1759   case TargetOpcode::G_FPOW:
1760     return legalizeFPow(MI, B);
1761   case TargetOpcode::G_FFLOOR:
1762     return legalizeFFloor(MI, MRI, B);
1763   case TargetOpcode::G_BUILD_VECTOR:
1764     return legalizeBuildVector(MI, MRI, B);
1765   case TargetOpcode::G_CTLZ:
1766   case TargetOpcode::G_CTTZ:
1767     return legalizeCTLZ_CTTZ(MI, MRI, B);
1768   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1769     return legalizeFPTruncRound(MI, B);
1770   default:
1771     return false;
1772   }
1773 
1774   llvm_unreachable("expected switch to return");
1775 }
1776 
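     // Returns a 32-bit register holding the aperture (high half) of the flat
     // address range for the given LDS or private address space, obtained from
     // the aperture registers, the implicit kernargs, or the queue pointer.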
1777 Register AMDGPULegalizerInfo::getSegmentAperture(
1778   unsigned AS,
1779   MachineRegisterInfo &MRI,
1780   MachineIRBuilder &B) const {
1781   MachineFunction &MF = B.getMF();
1782   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1783   const LLT S32 = LLT::scalar(32);
1784 
1785   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1786 
1787   if (ST.hasApertureRegs()) {
1788     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1789     // getreg.
1790     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1791         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1792         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1793     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1794         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1795         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1796     unsigned Encoding =
1797         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1798         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1799         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1800 
1801     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1802 
1803     B.buildInstr(AMDGPU::S_GETREG_B32)
1804       .addDef(GetReg)
1805       .addImm(Encoding);
1806     MRI.setType(GetReg, S32);
1807 
1808     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1809     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1810   }
1811 
1812   // TODO: can we be smarter about machine pointer info?
1813   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1814   Register LoadAddr = MRI.createGenericVirtualRegister(
1815     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1816   // For code object version 5, private_base and shared_base are passed through
1817   // implicit kernargs.
1818   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1819     AMDGPUTargetLowering::ImplicitParameter Param =
1820         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1821                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1822     uint64_t Offset =
1823         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1824 
1825     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1826         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1827 
1828     if (!loadInputValue(KernargPtrReg, B,
1829                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1830       return Register();
1831 
1832     MachineMemOperand *MMO = MF.getMachineMemOperand(
1833         PtrInfo,
1834         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1835             MachineMemOperand::MOInvariant,
1836         LLT::scalar(32), commonAlignment(Align(64), Offset));
1837 
1838     // Pointer address
1839     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1840                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1841     // Load address
1842     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1843   }
1844 
1845   Register QueuePtr = MRI.createGenericVirtualRegister(
1846     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1847 
1848   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1849     return Register();
1850 
1851   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1852   // private_segment_aperture_base_hi.
1853   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1854 
1855   MachineMemOperand *MMO = MF.getMachineMemOperand(
1856       PtrInfo,
1857       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1858           MachineMemOperand::MOInvariant,
1859       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1860 
1861   B.buildPtrAdd(LoadAddr, QueuePtr,
1862                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1863   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1864 }
1865 
1866 /// Return true if the value is a known valid address, such that a null check is
1867 /// not necessary.
1868 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1869                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1870   MachineInstr *Def = MRI.getVRegDef(Val);
1871   switch (Def->getOpcode()) {
1872   case AMDGPU::G_FRAME_INDEX:
1873   case AMDGPU::G_GLOBAL_VALUE:
1874   case AMDGPU::G_BLOCK_ADDR:
1875     return true;
1876   case AMDGPU::G_CONSTANT: {
1877     const ConstantInt *CI = Def->getOperand(1).getCImm();
1878     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1879   }
1880   default:
1881     return false;
1882   }
1883 
1884   return false;
1885 }
1886 
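     // No-op casts become bitcasts. Flat <-> local/private casts use the
     // aperture and null handling below, 32-bit <-> 64-bit constant-address
     // casts extract or extend the pointer, and anything else is diagnosed as an
     // invalid addrspacecast.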
1887 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1888   MachineInstr &MI, MachineRegisterInfo &MRI,
1889   MachineIRBuilder &B) const {
1890   MachineFunction &MF = B.getMF();
1891 
1892   const LLT S32 = LLT::scalar(32);
1893   Register Dst = MI.getOperand(0).getReg();
1894   Register Src = MI.getOperand(1).getReg();
1895 
1896   LLT DstTy = MRI.getType(Dst);
1897   LLT SrcTy = MRI.getType(Src);
1898   unsigned DestAS = DstTy.getAddressSpace();
1899   unsigned SrcAS = SrcTy.getAddressSpace();
1900 
1901   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1902   // vector element.
1903   assert(!DstTy.isVector());
1904 
1905   const AMDGPUTargetMachine &TM
1906     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1907 
1908   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1909     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1910     return true;
1911   }
1912 
1913   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1914       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1915        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1916     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1917       // Extract low 32-bits of the pointer.
1918       B.buildExtract(Dst, Src, 0);
1919       MI.eraseFromParent();
1920       return true;
1921     }
1922 
1923     unsigned NullVal = TM.getNullPointerValue(DestAS);
1924 
1925     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1926     auto FlatNull = B.buildConstant(SrcTy, 0);
1927 
1928     // Extract low 32-bits of the pointer.
1929     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1930 
1931     auto CmpRes =
1932         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1933     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1934 
1935     MI.eraseFromParent();
1936     return true;
1937   }
1938 
1939   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1940       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1941        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1942     if (!ST.hasFlatAddressSpace())
1943       return false;
1944 
1945     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1946     if (!ApertureReg.isValid())
1947       return false;
1948 
1949     // Coerce the type of the low half of the result so we can use merge_values.
1950     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1951 
1952     // TODO: Should we allow mismatched types but matching sizes in merges to
1953     // avoid the ptrtoint?
1954     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1955 
1956     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1957       B.buildCopy(Dst, BuildPtr);
1958       MI.eraseFromParent();
1959       return true;
1960     }
1961 
1962     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1963     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1964 
1965     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1966                               SegmentNull.getReg(0));
1967 
1968     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1969 
1970     MI.eraseFromParent();
1971     return true;
1972   }
1973 
1974   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1975       SrcTy.getSizeInBits() == 64) {
1976     // Truncate.
1977     B.buildExtract(Dst, Src, 0);
1978     MI.eraseFromParent();
1979     return true;
1980   }
1981 
1982   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1983       DstTy.getSizeInBits() == 64) {
1984     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1985     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1986 
1987     // FIXME: This is a bit ugly due to merging 2 pointers into another
1988     // pointer type. Merge operands are required to be the same type, but
1989     // creating an extra ptrtoint would be kind of pointless.
1990     auto HighAddr = B.buildConstant(
1991         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1992     B.buildMerge(Dst, {Src, HighAddr});
1993     MI.eraseFromParent();
1994     return true;
1995   }
1996 
1997   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1998       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
1999 
2000   LLVMContext &Ctx = MF.getFunction().getContext();
2001   Ctx.diagnose(InvalidAddrSpaceCast);
2002   B.buildUndef(Dst);
2003   MI.eraseFromParent();
2004   return true;
2005 }
2006 
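     // Round to the nearest integer by adding and then subtracting 2^52 with the
     // sign of the input; inputs too large to have a fractional part are
     // returned unchanged.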
2007 bool AMDGPULegalizerInfo::legalizeFrint(
2008   MachineInstr &MI, MachineRegisterInfo &MRI,
2009   MachineIRBuilder &B) const {
2010   Register Src = MI.getOperand(1).getReg();
2011   LLT Ty = MRI.getType(Src);
2012   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2013 
2014   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2015   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2016 
2017   auto C1 = B.buildFConstant(Ty, C1Val);
2018   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2019 
2020   // TODO: Should this propagate fast-math-flags?
2021   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2022   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2023 
2024   auto C2 = B.buildFConstant(Ty, C2Val);
2025   auto Fabs = B.buildFAbs(Ty, Src);
2026 
2027   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2028   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2029   MI.eraseFromParent();
2030   return true;
2031 }
2032 
2033 bool AMDGPULegalizerInfo::legalizeFceil(
2034   MachineInstr &MI, MachineRegisterInfo &MRI,
2035   MachineIRBuilder &B) const {
2036 
2037   const LLT S1 = LLT::scalar(1);
2038   const LLT S64 = LLT::scalar(64);
2039 
2040   Register Src = MI.getOperand(1).getReg();
2041   assert(MRI.getType(Src) == S64);
2042 
2043   // result = trunc(src)
2044   // if (src > 0.0 && src != result)
2045   //   result += 1.0
2046 
2047   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2048 
2049   const auto Zero = B.buildFConstant(S64, 0.0);
2050   const auto One = B.buildFConstant(S64, 1.0);
2051   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2052   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2053   auto And = B.buildAnd(S1, Gt0, NeTrunc);
2054   auto Add = B.buildSelect(S64, And, One, Zero);
2055 
2056   // TODO: Should this propagate fast-math-flags?
2057   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2058   MI.eraseFromParent();
2059   return true;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeFrem(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B) const {
2065     Register DstReg = MI.getOperand(0).getReg();
2066     Register Src0Reg = MI.getOperand(1).getReg();
2067     Register Src1Reg = MI.getOperand(2).getReg();
2068     auto Flags = MI.getFlags();
2069     LLT Ty = MRI.getType(DstReg);
2070 
2071     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2072     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2073     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2074     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2075     MI.eraseFromParent();
2076     return true;
2077 }
2078 
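     // Extracts the 11-bit biased exponent from the high 32 bits of an f64 and
     // subtracts the exponent bias (1023).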
2079 static MachineInstrBuilder extractF64Exponent(Register Hi,
2080                                               MachineIRBuilder &B) {
2081   const unsigned FractBits = 52;
2082   const unsigned ExpBits = 11;
2083   LLT S32 = LLT::scalar(32);
2084 
2085   auto Const0 = B.buildConstant(S32, FractBits - 32);
2086   auto Const1 = B.buildConstant(S32, ExpBits);
2087 
2088   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2089     .addUse(Hi)
2090     .addUse(Const0.getReg(0))
2091     .addUse(Const1.getReg(0));
2092 
2093   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2094 }
2095 
2096 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2097   MachineInstr &MI, MachineRegisterInfo &MRI,
2098   MachineIRBuilder &B) const {
2099   const LLT S1 = LLT::scalar(1);
2100   const LLT S32 = LLT::scalar(32);
2101   const LLT S64 = LLT::scalar(64);
2102 
2103   Register Src = MI.getOperand(1).getReg();
2104   assert(MRI.getType(Src) == S64);
2105 
2106   // TODO: Should this use extract since the low half is unused?
2107   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2108   Register Hi = Unmerge.getReg(1);
2109 
2110   // Extract the upper half, since this is where we will find the sign and
2111   // exponent.
2112   auto Exp = extractF64Exponent(Hi, B);
2113 
2114   const unsigned FractBits = 52;
2115 
2116   // Extract the sign bit.
2117   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2118   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2119 
2120   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2121 
2122   const auto Zero32 = B.buildConstant(S32, 0);
2123 
2124   // Extend back to 64-bits.
2125   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2126 
2127   auto Shr = B.buildAShr(S64, FractMask, Exp);
2128   auto Not = B.buildNot(S64, Shr);
2129   auto Tmp0 = B.buildAnd(S64, Src, Not);
2130   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2131 
2132   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2133   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2134 
2135   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2136   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeITOFP(
2142   MachineInstr &MI, MachineRegisterInfo &MRI,
2143   MachineIRBuilder &B, bool Signed) const {
2144 
2145   Register Dst = MI.getOperand(0).getReg();
2146   Register Src = MI.getOperand(1).getReg();
2147 
2148   const LLT S64 = LLT::scalar(64);
2149   const LLT S32 = LLT::scalar(32);
2150 
2151   assert(MRI.getType(Src) == S64);
2152 
2153   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2154   auto ThirtyTwo = B.buildConstant(S32, 32);
2155 
2156   if (MRI.getType(Dst) == S64) {
2157     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2158                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2159 
2160     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2161     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2162                      .addUse(CvtHi.getReg(0))
2163                      .addUse(ThirtyTwo.getReg(0));
2164 
2165     // TODO: Should this propagate fast-math-flags?
2166     B.buildFAdd(Dst, LdExp, CvtLo);
2167     MI.eraseFromParent();
2168     return true;
2169   }
2170 
2171   assert(MRI.getType(Dst) == S32);
2172 
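       // i64 -> f32: shift the value left so the most significant set bit lands
       // in the high word, fold the remaining low bits into bit 0 of the high
       // word as a sticky bit for correct rounding, convert that word, and scale
       // the result back with ldexp.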
2173   auto One = B.buildConstant(S32, 1);
2174 
2175   MachineInstrBuilder ShAmt;
2176   if (Signed) {
2177     auto ThirtyOne = B.buildConstant(S32, 31);
2178     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2179     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2180     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2181     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2182                                /*HasSideEffects=*/false)
2183                   .addUse(Unmerge.getReg(1));
2184     auto LS2 = B.buildSub(S32, LS, One);
2185     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2186   } else
2187     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2188   auto Norm = B.buildShl(S64, Src, ShAmt);
2189   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2190   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2191   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2192   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2193   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2194   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2195                    /*HasSideEffects=*/false)
2196       .addUse(FVal.getReg(0))
2197       .addUse(Scale.getReg(0));
2198   MI.eraseFromParent();
2199   return true;
2200 }
2201 
2202 // TODO: Copied from DAG implementation. Verify logic and document how this
2203 // actually works.
2204 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2205                                         MachineRegisterInfo &MRI,
2206                                         MachineIRBuilder &B,
2207                                         bool Signed) const {
2208 
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Src = MI.getOperand(1).getReg();
2211 
2212   const LLT S64 = LLT::scalar(64);
2213   const LLT S32 = LLT::scalar(32);
2214 
2215   const LLT SrcLT = MRI.getType(Src);
2216   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2217 
2218   unsigned Flags = MI.getFlags();
2219 
2220   // The basic idea of converting a floating point number into a pair of 32-bit
2221   // integers is illustrated as follows:
2222   //
2223   //     tf := trunc(val);
2224   //    hif := floor(tf * 2^-32);
2225   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2226   //     hi := fptoi(hif);
2227   //     lo := fptoi(lof);
2228   //
2229   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2230   MachineInstrBuilder Sign;
2231   if (Signed && SrcLT == S32) {
2232     // However, a 32-bit floating point number has only a 23-bit mantissa,
2233     // which is not enough to hold all the significant bits of `lof` if val is
2234     // negative. To avoid the loss of precision, we need to take the absolute
2235     // value after truncating and flip the result back based on the original
2236     // signedness.
2237     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2238     Trunc = B.buildFAbs(S32, Trunc, Flags);
2239   }
2240   MachineInstrBuilder K0, K1;
2241   if (SrcLT == S64) {
2242     K0 = B.buildFConstant(S64,
2243                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2244     K1 = B.buildFConstant(S64,
2245                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2246   } else {
2247     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2248     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2249   }
2250 
2251   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2252   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2253   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2254 
2255   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2256                                      : B.buildFPTOUI(S32, FloorMul);
2257   auto Lo = B.buildFPTOUI(S32, Fma);
2258 
2259   if (Signed && SrcLT == S32) {
2260     // Flip the result based on the signedness, which is either all 0s or all 1s.
2261     Sign = B.buildMerge(S64, {Sign, Sign});
2262     // r := xor({lo, hi}, sign) - sign;
2263     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2264   } else
2265     B.buildMerge(Dst, {Lo, Hi});
2266   MI.eraseFromParent();
2267 
2268   return true;
2269 }
2270 
2271 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2272                                                MachineInstr &MI) const {
2273   MachineFunction &MF = Helper.MIRBuilder.getMF();
2274   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2275 
2276   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2277                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2278 
2279   // With ieee_mode disabled, the instructions have the correct behavior
2280   // already for G_FMINNUM/G_FMAXNUM
2281   if (!MFI->getMode().IEEE)
2282     return !IsIEEEOp;
2283 
2284   if (IsIEEEOp)
2285     return true;
2286 
2287   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2288 }
2289 
2290 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2291   MachineInstr &MI, MachineRegisterInfo &MRI,
2292   MachineIRBuilder &B) const {
2293   // TODO: Should move some of this into LegalizerHelper.
2294 
2295   // TODO: Promote dynamic indexing of s16 to s32
2296 
2297   // FIXME: Artifact combiner probably should have replaced the truncated
2298   // constant before this, so we shouldn't need
2299   // getIConstantVRegValWithLookThrough.
2300   Optional<ValueAndVReg> MaybeIdxVal =
2301       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2302   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2303     return true;
2304   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2305 
2306   Register Dst = MI.getOperand(0).getReg();
2307   Register Vec = MI.getOperand(1).getReg();
2308 
2309   LLT VecTy = MRI.getType(Vec);
2310   LLT EltTy = VecTy.getElementType();
2311   assert(EltTy == MRI.getType(Dst));
2312 
2313   if (IdxVal < VecTy.getNumElements()) {
2314     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2315     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2316   } else {
2317     B.buildUndef(Dst);
2318   }
2319 
2320   MI.eraseFromParent();
2321   return true;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2325   MachineInstr &MI, MachineRegisterInfo &MRI,
2326   MachineIRBuilder &B) const {
2327   // TODO: Should move some of this into LegalizerHelper.
2328 
2329   // TODO: Promote dynamic indexing of s16 to s32
2330 
2331   // FIXME: Artifact combiner probably should have replaced the truncated
2332   // constant before this, so we shouldn't need
2333   // getIConstantVRegValWithLookThrough.
2334   Optional<ValueAndVReg> MaybeIdxVal =
2335       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2336   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2337     return true;
2338 
2339   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2340   Register Dst = MI.getOperand(0).getReg();
2341   Register Vec = MI.getOperand(1).getReg();
2342   Register Ins = MI.getOperand(2).getReg();
2343 
2344   LLT VecTy = MRI.getType(Vec);
2345   LLT EltTy = VecTy.getElementType();
2346   assert(EltTy == MRI.getType(Ins));
2347   (void)Ins;
2348 
2349   unsigned NumElts = VecTy.getNumElements();
2350   if (IdxVal < NumElts) {
2351     SmallVector<Register, 8> SrcRegs;
2352     for (unsigned i = 0; i < NumElts; ++i)
2353       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2354     B.buildUnmerge(SrcRegs, Vec);
2355 
2356     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2357     B.buildMerge(Dst, SrcRegs);
2358   } else {
2359     B.buildUndef(Dst);
2360   }
2361 
2362   MI.eraseFromParent();
2363   return true;
2364 }
2365 
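     // v2s16 shuffles whose mask maps onto a legal VOP3P permute are kept as-is;
     // everything else goes through the generic shuffle lowering.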
2366 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2367   MachineInstr &MI, MachineRegisterInfo &MRI,
2368   MachineIRBuilder &B) const {
2369   const LLT V2S16 = LLT::fixed_vector(2, 16);
2370 
2371   Register Dst = MI.getOperand(0).getReg();
2372   Register Src0 = MI.getOperand(1).getReg();
2373   LLT DstTy = MRI.getType(Dst);
2374   LLT SrcTy = MRI.getType(Src0);
2375 
2376   if (SrcTy == V2S16 && DstTy == V2S16 &&
2377       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2378     return true;
2379 
2380   MachineIRBuilder HelperBuilder(MI);
2381   GISelObserverWrapper DummyObserver;
2382   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2383   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2384 }
2385 
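     // Scale the input by 1/(2*pi) (through fract on subtargets with a reduced
     // trig range) before emitting the amdgcn sin/cos intrinsics.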
2386 bool AMDGPULegalizerInfo::legalizeSinCos(
2387   MachineInstr &MI, MachineRegisterInfo &MRI,
2388   MachineIRBuilder &B) const {
2389 
2390   Register DstReg = MI.getOperand(0).getReg();
2391   Register SrcReg = MI.getOperand(1).getReg();
2392   LLT Ty = MRI.getType(DstReg);
2393   unsigned Flags = MI.getFlags();
2394 
2395   Register TrigVal;
2396   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2397   if (ST.hasTrigReducedRange()) {
2398     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2399     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2400       .addUse(MulVal.getReg(0))
2401       .setMIFlags(Flags).getReg(0);
2402   } else
2403     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2404 
2405   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2406     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2407   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2408     .addUse(TrigVal)
2409     .setMIFlags(Flags);
2410   MI.eraseFromParent();
2411   return true;
2412 }
2413 
2414 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2415                                                   MachineIRBuilder &B,
2416                                                   const GlobalValue *GV,
2417                                                   int64_t Offset,
2418                                                   unsigned GAFlags) const {
2419   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2420   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2421   // to the following code sequence:
2422   //
2423   // For constant address space:
2424   //   s_getpc_b64 s[0:1]
2425   //   s_add_u32 s0, s0, $symbol
2426   //   s_addc_u32 s1, s1, 0
2427   //
2428   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2429   //   a fixup or relocation is emitted to replace $symbol with a literal
2430   //   constant, which is a pc-relative offset from the encoding of the $symbol
2431   //   operand to the global variable.
2432   //
2433   // For global address space:
2434   //   s_getpc_b64 s[0:1]
2435   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2436   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2437   //
2438   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2439   //   fixups or relocations are emitted to replace $symbol@*@lo and
2440   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2441   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2442   //   operand to the global variable.
2443   //
2444   // What we want here is an offset from the value returned by s_getpc
2445   // (which is the address of the s_add_u32 instruction) to the global
2446   // variable, but since the encoding of $symbol starts 4 bytes after the start
2447   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2448   // small. This requires us to add 4 to the global variable offset in order to
2449   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2450   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2451   // instruction.
2452 
2453   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2454 
2455   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2456     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2457 
2458   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2459     .addDef(PCReg);
2460 
2461   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2462   if (GAFlags == SIInstrInfo::MO_NONE)
2463     MIB.addImm(0);
2464   else
2465     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2466 
2467   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2468 
2469   if (PtrTy.getSizeInBits() == 32)
2470     B.buildExtract(DstReg, PCReg, 0);
2471   return true;
2472 }
2473 
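     // LDS globals are lowered to their allocated offsets (dynamically sized
     // externals get the group static size as their offset); other globals use
     // PC-relative addressing, either directly or through a GOT load.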
2474 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2475   MachineInstr &MI, MachineRegisterInfo &MRI,
2476   MachineIRBuilder &B) const {
2477   Register DstReg = MI.getOperand(0).getReg();
2478   LLT Ty = MRI.getType(DstReg);
2479   unsigned AS = Ty.getAddressSpace();
2480 
2481   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2482   MachineFunction &MF = B.getMF();
2483   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2484 
2485   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2486     if (!MFI->isModuleEntryFunction() &&
2487         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2488       const Function &Fn = MF.getFunction();
2489       DiagnosticInfoUnsupported BadLDSDecl(
2490         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2491         DS_Warning);
2492       Fn.getContext().diagnose(BadLDSDecl);
2493 
2494       // We currently don't have a way to correctly allocate LDS objects that
2495       // aren't directly associated with a kernel. We do force inlining of
2496       // functions that use local objects. However, if these dead functions are
2497       // not eliminated, we don't want a compile time error. Just emit a warning
2498       // and a trap, since there should be no callable path here.
2499       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2500       B.buildUndef(DstReg);
2501       MI.eraseFromParent();
2502       return true;
2503     }
2504 
2505     // TODO: We could emit code to handle the initialization somewhere.
2506     // We ignore the initializer for now and legalize it to allow selection.
2507     // The initializer will be diagnosed during assembly emission anyway.
2508     const SITargetLowering *TLI = ST.getTargetLowering();
2509     if (!TLI->shouldUseLDSConstAddress(GV)) {
2510       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2511       return true; // Leave in place.
2512     }
2513 
2514     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2515       Type *Ty = GV->getValueType();
2516       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2517       // zero-sized type in other languages to declare dynamic shared
2518       // memory whose size is not known at compile time. Such arrays are
2519       // allocated by the runtime and placed directly after the statically
2520       // allocated ones. They all share the same offset.
2521       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2522         // Adjust alignment for that dynamic shared memory array.
2523         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2524         LLT S32 = LLT::scalar(32);
2525         auto Sz =
2526             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2527         B.buildIntToPtr(DstReg, Sz);
2528         MI.eraseFromParent();
2529         return true;
2530       }
2531     }
2532 
2533     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2534                                                    *cast<GlobalVariable>(GV)));
2535     MI.eraseFromParent();
2536     return true;
2537   }
2538 
2539   const SITargetLowering *TLI = ST.getTargetLowering();
2540 
2541   if (TLI->shouldEmitFixup(GV)) {
2542     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2543     MI.eraseFromParent();
2544     return true;
2545   }
2546 
2547   if (TLI->shouldEmitPCReloc(GV)) {
2548     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2549     MI.eraseFromParent();
2550     return true;
2551   }
2552 
2553   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2554   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2555 
2556   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2557   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2558       MachinePointerInfo::getGOT(MF),
2559       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2560           MachineMemOperand::MOInvariant,
2561       LoadTy, Align(8));
2562 
2563   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2564 
2565   if (Ty.getSizeInBits() == 32) {
2566     // Truncate if this is a 32-bit constant address.
2567     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2568     B.buildExtract(DstReg, Load, 0);
2569   } else
2570     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2571 
2572   MI.eraseFromParent();
2573   return true;
2574 }
2575 
2576 static LLT widenToNextPowerOf2(LLT Ty) {
2577   if (Ty.isVector())
2578     return Ty.changeElementCount(
2579         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2580   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2581 }
2582 
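     // Custom load handling: cast 32-bit constant-address pointers to 64-bit
     // constant pointers, and widen non-power-of-2 loads up to the alignment
     // when profitable.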
2583 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2584                                        MachineInstr &MI) const {
2585   MachineIRBuilder &B = Helper.MIRBuilder;
2586   MachineRegisterInfo &MRI = *B.getMRI();
2587   GISelChangeObserver &Observer = Helper.Observer;
2588 
2589   Register PtrReg = MI.getOperand(1).getReg();
2590   LLT PtrTy = MRI.getType(PtrReg);
2591   unsigned AddrSpace = PtrTy.getAddressSpace();
2592 
2593   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2594     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2595     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2596     Observer.changingInstr(MI);
2597     MI.getOperand(1).setReg(Cast.getReg(0));
2598     Observer.changedInstr(MI);
2599     return true;
2600   }
2601 
2602   if (MI.getOpcode() != AMDGPU::G_LOAD)
2603     return false;
2604 
2605   Register ValReg = MI.getOperand(0).getReg();
2606   LLT ValTy = MRI.getType(ValReg);
2607 
2608   MachineMemOperand *MMO = *MI.memoperands_begin();
2609   const unsigned ValSize = ValTy.getSizeInBits();
2610   const LLT MemTy = MMO->getMemoryType();
2611   const Align MemAlign = MMO->getAlign();
2612   const unsigned MemSize = MemTy.getSizeInBits();
2613   const uint64_t AlignInBits = 8 * MemAlign.value();
2614 
2615   // Widen non-power-of-2 loads to the alignment if needed
2616   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2617     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2618 
2619     // This was already the correct extending load result type, so just adjust
2620     // the memory type.
2621     if (WideMemSize == ValSize) {
2622       MachineFunction &MF = B.getMF();
2623 
2624       MachineMemOperand *WideMMO =
2625           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2626       Observer.changingInstr(MI);
2627       MI.setMemRefs(MF, {WideMMO});
2628       Observer.changedInstr(MI);
2629       return true;
2630     }
2631 
2632     // Don't bother handling an edge case that should probably never be produced.
2633     if (ValSize > WideMemSize)
2634       return false;
2635 
2636     LLT WideTy = widenToNextPowerOf2(ValTy);
2637 
2638     Register WideLoad;
2639     if (!WideTy.isVector()) {
2640       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2641       B.buildTrunc(ValReg, WideLoad).getReg(0);
2642     } else {
2643       // Extract the subvector.
2644 
2645       if (isRegisterType(ValTy)) {
2646         // If this is a case where G_EXTRACT is legal, use it.
2647         // (e.g. <3 x s32> -> <4 x s32>)
2648         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2649         B.buildExtract(ValReg, WideLoad, 0);
2650       } else {
2651         // For cases where the widened type isn't a nice register value, unmerge
2652         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2653         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2654         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2655       }
2656     }
2657 
2658     MI.eraseFromParent();
2659     return true;
2660   }
2661 
2662   return false;
2663 }
2664 
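     // G_FMAD stays legal unless denormals are enabled for the type, in which
     // case it is expanded by the generic FMAD lowering.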
2665 bool AMDGPULegalizerInfo::legalizeFMad(
2666   MachineInstr &MI, MachineRegisterInfo &MRI,
2667   MachineIRBuilder &B) const {
2668   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2669   assert(Ty.isScalar());
2670 
2671   MachineFunction &MF = B.getMF();
2672   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2673 
2674   // TODO: Always legal with future ftz flag.
2675   // FIXME: Do we need just output?
2676   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2677     return true;
2678   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2679     return true;
2680 
2681   MachineIRBuilder HelperBuilder(MI);
2682   GISelObserverWrapper DummyObserver;
2683   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2684   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2685 }
2686 
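     // Pack the new value and compare value into a two-element vector operand,
     // as expected by the target G_AMDGPU_ATOMIC_CMPXCHG instruction.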
2687 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2688   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2689   Register DstReg = MI.getOperand(0).getReg();
2690   Register PtrReg = MI.getOperand(1).getReg();
2691   Register CmpVal = MI.getOperand(2).getReg();
2692   Register NewVal = MI.getOperand(3).getReg();
2693 
2694   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2695          "this should not have been custom lowered");
2696 
2697   LLT ValTy = MRI.getType(CmpVal);
2698   LLT VecTy = LLT::fixed_vector(2, ValTy);
2699 
2700   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2701 
2702   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2703     .addDef(DstReg)
2704     .addUse(PtrReg)
2705     .addUse(PackedVal)
2706     .setMemRefs(MI.memoperands());
2707 
2708   MI.eraseFromParent();
2709   return true;
2710 }
2711 
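     // Expand log as log2(x) * C, where C is the log of 2 in the requested base
     // (passed in by the caller as Log2BaseInverted).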
2712 bool AMDGPULegalizerInfo::legalizeFlog(
2713   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2714   Register Dst = MI.getOperand(0).getReg();
2715   Register Src = MI.getOperand(1).getReg();
2716   LLT Ty = B.getMRI()->getType(Dst);
2717   unsigned Flags = MI.getFlags();
2718 
2719   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2720   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2721 
2722   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2723   MI.eraseFromParent();
2724   return true;
2725 }
2726 
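     // Expand exp(x) as exp2(x * log2(e)).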
2727 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2728                                        MachineIRBuilder &B) const {
2729   Register Dst = MI.getOperand(0).getReg();
2730   Register Src = MI.getOperand(1).getReg();
2731   unsigned Flags = MI.getFlags();
2732   LLT Ty = B.getMRI()->getType(Dst);
2733 
2734   auto K = B.buildFConstant(Ty, numbers::log2e);
2735   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2736   B.buildFExp2(Dst, Mul, Flags);
2737   MI.eraseFromParent();
2738   return true;
2739 }
2740 
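     // Expand pow(x, y) as exp2(log2(x) * y), with the multiply performed by the
     // amdgcn fmul_legacy intrinsic.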
2741 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2742                                        MachineIRBuilder &B) const {
2743   Register Dst = MI.getOperand(0).getReg();
2744   Register Src0 = MI.getOperand(1).getReg();
2745   Register Src1 = MI.getOperand(2).getReg();
2746   unsigned Flags = MI.getFlags();
2747   LLT Ty = B.getMRI()->getType(Dst);
2748   const LLT S16 = LLT::scalar(16);
2749   const LLT S32 = LLT::scalar(32);
2750 
2751   if (Ty == S32) {
2752     auto Log = B.buildFLog2(S32, Src0, Flags);
2753     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2754       .addUse(Log.getReg(0))
2755       .addUse(Src1)
2756       .setMIFlags(Flags);
2757     B.buildFExp2(Dst, Mul, Flags);
2758   } else if (Ty == S16) {
2759     // There's no f16 fmul_legacy, so we need to convert for it.
2760     auto Log = B.buildFLog2(S16, Src0, Flags);
2761     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2762     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2763     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2764       .addUse(Ext0.getReg(0))
2765       .addUse(Ext1.getReg(0))
2766       .setMIFlags(Flags);
2767 
2768     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2769   } else
2770     return false;
2771 
2772   MI.eraseFromParent();
2773   return true;
2774 }
2775 
2776 // Find a source register, ignoring any possible source modifiers.
2777 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2778   Register ModSrc = OrigSrc;
2779   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2780     ModSrc = SrcFNeg->getOperand(1).getReg();
2781     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2782       ModSrc = SrcFAbs->getOperand(1).getReg();
2783   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2784     ModSrc = SrcFAbs->getOperand(1).getReg();
2785   return ModSrc;
2786 }
2787 
2788 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2789                                          MachineRegisterInfo &MRI,
2790                                          MachineIRBuilder &B) const {
2791 
2792   const LLT S1 = LLT::scalar(1);
2793   const LLT S64 = LLT::scalar(64);
2794   Register Dst = MI.getOperand(0).getReg();
2795   Register OrigSrc = MI.getOperand(1).getReg();
2796   unsigned Flags = MI.getFlags();
2797   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2798          "this should not have been custom lowered");
2799 
2800   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2801   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2802   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2803   // V_FRACT bug is:
2804   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2805   //
2806   // Convert floor(x) to (x - fract(x))
2807 
2808   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2809     .addUse(OrigSrc)
2810     .setMIFlags(Flags);
2811 
2812   // Give source modifier matching some assistance before obscuring a foldable
2813   // pattern.
2814 
2815   // TODO: We can avoid the neg on the fract? The input sign to fract
2816   // shouldn't matter?
2817   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2818 
2819   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2820 
2821   Register Min = MRI.createGenericVirtualRegister(S64);
2822 
2823   // We don't need to concern ourselves with the snan handling difference, so
2824   // use the one which will directly select.
2825   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2826   if (MFI->getMode().IEEE)
2827     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2828   else
2829     B.buildFMinNum(Min, Fract, Const, Flags);
2830 
2831   Register CorrectedFract = Min;
2832   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2833     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2834     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2835   }
2836 
2837   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2838   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2839 
2840   MI.eraseFromParent();
2841   return true;
2842 }
2843 
2844 // Turn an illegal packed v2s16 build vector into bit operations.
2845 // TODO: This should probably be a bitcast action in LegalizerHelper.
2846 bool AMDGPULegalizerInfo::legalizeBuildVector(
2847   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2848   Register Dst = MI.getOperand(0).getReg();
2849   const LLT S32 = LLT::scalar(32);
2850   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2851 
2852   Register Src0 = MI.getOperand(1).getReg();
2853   Register Src1 = MI.getOperand(2).getReg();
2854   assert(MRI.getType(Src0) == LLT::scalar(16));
2855 
2856   auto Merge = B.buildMerge(S32, {Src0, Src1});
2857   B.buildBitcast(Dst, Merge);
2858 
2859   MI.eraseFromParent();
2860   return true;
2861 }
2862 
2863 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2864 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2865 // case with a single min instruction instead of a compare+select.
2866 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2867                                             MachineRegisterInfo &MRI,
2868                                             MachineIRBuilder &B) const {
2869   Register Dst = MI.getOperand(0).getReg();
2870   Register Src = MI.getOperand(1).getReg();
2871   LLT DstTy = MRI.getType(Dst);
2872   LLT SrcTy = MRI.getType(Src);
2873 
2874   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2875                         ? AMDGPU::G_AMDGPU_FFBH_U32
2876                         : AMDGPU::G_AMDGPU_FFBL_B32;
2877   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
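  // The FFBH/FFBL pseudos return -1 (all ones) for a zero input, so a single
  // unsigned min against the source bit width yields the well-defined
  // ctlz/cttz result for zero.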
2878   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2879 
2880   MI.eraseFromParent();
2881   return true;
2882 }
2883 
2884 // Check that this is a G_XOR x, -1
2885 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2886   if (MI.getOpcode() != TargetOpcode::G_XOR)
2887     return false;
2888   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2889   return ConstVal && *ConstVal == -1;
2890 }
2891 
// Return the use branch instruction, or null if the usage is invalid.
2893 static MachineInstr *
2894 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2895                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2896   Register CondDef = MI.getOperand(0).getReg();
2897   if (!MRI.hasOneNonDBGUse(CondDef))
2898     return nullptr;
2899 
2900   MachineBasicBlock *Parent = MI.getParent();
2901   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2902 
2903   if (isNot(MRI, *UseMI)) {
2904     Register NegatedCond = UseMI->getOperand(0).getReg();
2905     if (!MRI.hasOneNonDBGUse(NegatedCond))
2906       return nullptr;
2907 
    // The caller accounts for the negation via \p Negated, so this G_XOR is
    // dead; erase it before looking through to its user.
2909     eraseInstr(*UseMI, MRI);
2910 
2911     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2912     Negated = true;
2913   }
2914 
2915   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2916     return nullptr;
2917 
2918   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2919   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2920   if (Next == Parent->end()) {
2921     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2922     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2923       return nullptr;
2924     UncondBrTarget = &*NextMBB;
2925   } else {
2926     if (Next->getOpcode() != AMDGPU::G_BR)
2927       return nullptr;
2928     Br = &*Next;
2929     UncondBrTarget = Br->getOperand(0).getMBB();
2930   }
2931 
2932   return UseMI;
2933 }
2934 
2935 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2936                                          const ArgDescriptor *Arg,
2937                                          const TargetRegisterClass *ArgRC,
2938                                          LLT ArgTy) const {
2939   MCRegister SrcReg = Arg->getRegister();
2940   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2941   assert(DstReg.isVirtual() && "Virtual register expected");
2942 
2943   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2944                                              *ArgRC, B.getDebugLoc(), ArgTy);
2945   if (Arg->isMasked()) {
2946     // TODO: Should we try to emit this once in the entry block?
2947     const LLT S32 = LLT::scalar(32);
2948     const unsigned Mask = Arg->getMask();
2949     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2950 
2951     Register AndMaskSrc = LiveIn;
2952 
2953     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2954     // 0.
2955     if (Shift != 0) {
2956       auto ShiftAmt = B.buildConstant(S32, Shift);
2957       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2958     }
2959 
2960     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2961   } else {
2962     B.buildCopy(DstReg, LiveIn);
2963   }
2964 
2965   return true;
2966 }
2967 
2968 bool AMDGPULegalizerInfo::loadInputValue(
2969     Register DstReg, MachineIRBuilder &B,
2970     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2971   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2972   const ArgDescriptor *Arg;
2973   const TargetRegisterClass *ArgRC;
2974   LLT ArgTy;
2975   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2976 
2977   if (!Arg) {
2978     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
2981       B.buildConstant(DstReg, 0);
2982       return true;
2983     }
2984 
2985     // It's undefined behavior if a function marked with the amdgpu-no-*
2986     // attributes uses the corresponding intrinsic.
2987     B.buildUndef(DstReg);
2988     return true;
2989   }
2990 
2991   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2992     return false; // TODO: Handle these
2993   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2994 }
2995 
2996 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2997     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2998     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2999   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3000     return false;
3001 
3002   MI.eraseFromParent();
3003   return true;
3004 }
3005 
3006 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3007                                        MachineRegisterInfo &MRI,
3008                                        MachineIRBuilder &B) const {
3009   Register Dst = MI.getOperand(0).getReg();
3010   LLT DstTy = MRI.getType(Dst);
3011   LLT S16 = LLT::scalar(16);
3012   LLT S32 = LLT::scalar(32);
3013   LLT S64 = LLT::scalar(64);
3014 
3015   if (DstTy == S16)
3016     return legalizeFDIV16(MI, MRI, B);
3017   if (DstTy == S32)
3018     return legalizeFDIV32(MI, MRI, B);
3019   if (DstTy == S64)
3020     return legalizeFDIV64(MI, MRI, B);
3021 
3022   return false;
3023 }
3024 
3025 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3026                                                         Register DstDivReg,
3027                                                         Register DstRemReg,
3028                                                         Register X,
3029                                                         Register Y) const {
3030   const LLT S1 = LLT::scalar(1);
3031   const LLT S32 = LLT::scalar(32);
3032 
3033   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3034   // algorithm used here.
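  //
  // In short: compute z ~= 2^32 / y from a scaled v_rcp_iflag_f32 estimate,
  // sharpen it with one Newton-Raphson step in integer arithmetic, form
  // q = umulh(x, z) and r = x - q * y, then apply at most two corrections
  // of the form "if (r >= y) { r -= y; q += 1; }", emitted as selects.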
3035 
3036   // Initial estimate of inv(y).
3037   auto FloatY = B.buildUITOFP(S32, Y);
3038   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3039   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3040   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3041   auto Z = B.buildFPTOUI(S32, ScaledY);
3042 
  // One round of UNR (unsigned Newton-Raphson) refinement.
3044   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3045   auto NegYZ = B.buildMul(S32, NegY, Z);
3046   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3047 
3048   // Quotient/remainder estimate.
3049   auto Q = B.buildUMulH(S32, X, Z);
3050   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3051 
3052   // First quotient/remainder refinement.
3053   auto One = B.buildConstant(S32, 1);
3054   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3055   if (DstDivReg)
3056     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3057   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3058 
3059   // Second quotient/remainder refinement.
3060   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3061   if (DstDivReg)
3062     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3063 
3064   if (DstRemReg)
3065     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3066 }
3067 
3068 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3069 //
3070 // Return lo, hi of result
3071 //
3072 // %cvt.lo = G_UITOFP Val.lo
3073 // %cvt.hi = G_UITOFP Val.hi
3074 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3075 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3076 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3077 // %mul2 = G_FMUL %mul1, 2**(-32)
3078 // %trunc = G_INTRINSIC_TRUNC %mul2
3079 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3080 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3081 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3082                                                        Register Val) {
3083   const LLT S32 = LLT::scalar(32);
3084   auto Unmerge = B.buildUnmerge(S32, Val);
3085 
3086   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3087   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3088 
3089   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3090                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3091 
3092   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3093   auto Mul1 =
3094       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3095 
3096   // 2**(-32)
3097   auto Mul2 =
3098       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3099   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3100 
3101   // -(2**32)
3102   auto Mad2 = B.buildFMAD(S32, Trunc,
3103                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3104 
3105   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3106   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3107 
3108   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3109 }
3110 
3111 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3112                                                         Register DstDivReg,
3113                                                         Register DstRemReg,
3114                                                         Register Numer,
3115                                                         Register Denom) const {
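  // 64-bit variant of the algorithm above: start from an f32-based estimate
  // of 2^64 / denom (emitReciprocalU64), refine it twice with 64-bit integer
  // Newton-Raphson steps, form the quotient estimate with umulh and the
  // remainder as numer - denom * quotient, then conditionally correct both at
  // most twice via the selects at the end.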
3116   const LLT S32 = LLT::scalar(32);
3117   const LLT S64 = LLT::scalar(64);
3118   const LLT S1 = LLT::scalar(1);
3119   Register RcpLo, RcpHi;
3120 
3121   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3122 
3123   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3124 
3125   auto Zero64 = B.buildConstant(S64, 0);
3126   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3127 
3128   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3129   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3130 
3131   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3132   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3133   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3134 
3135   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3136   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3137   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3138 
3139   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3140   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3141   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3142   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3143   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3144 
3145   auto Zero32 = B.buildConstant(S32, 0);
3146   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3147   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3148   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3149 
3150   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3151   Register NumerLo = UnmergeNumer.getReg(0);
3152   Register NumerHi = UnmergeNumer.getReg(1);
3153 
3154   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3155   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3156   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3157   Register Mul3_Lo = UnmergeMul3.getReg(0);
3158   Register Mul3_Hi = UnmergeMul3.getReg(1);
3159   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3160   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3161   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3162   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3163 
3164   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3165   Register DenomLo = UnmergeDenom.getReg(0);
3166   Register DenomHi = UnmergeDenom.getReg(1);
3167 
3168   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3169   auto C1 = B.buildSExt(S32, CmpHi);
3170 
3171   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3172   auto C2 = B.buildSExt(S32, CmpLo);
3173 
3174   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3175   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3176 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we have 4 selects
  // after the potential endif to substitute for PHIs.
3180 
3181   // if C3 != 0 ...
3182   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3183   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3184   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3185   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3186 
3187   auto One64 = B.buildConstant(S64, 1);
3188   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3189 
3190   auto C4 =
3191       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3192   auto C5 =
3193       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3194   auto C6 = B.buildSelect(
3195       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3196 
3197   // if (C6 != 0)
3198   auto Add4 = B.buildAdd(S64, Add3, One64);
3199   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3200 
3201   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3202   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3203   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3204 
3205   // endif C6
3206   // endif C3
3207 
3208   if (DstDivReg) {
3209     auto Sel1 = B.buildSelect(
3210         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3211     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3212                   Sel1, MulHi3);
3213   }
3214 
3215   if (DstRemReg) {
3216     auto Sel2 = B.buildSelect(
3217         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3218     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3219                   Sel2, Sub1);
3220   }
3221 }
3222 
3223 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3224                                                   MachineRegisterInfo &MRI,
3225                                                   MachineIRBuilder &B) const {
3226   Register DstDivReg, DstRemReg;
3227   switch (MI.getOpcode()) {
3228   default:
3229     llvm_unreachable("Unexpected opcode!");
3230   case AMDGPU::G_UDIV: {
3231     DstDivReg = MI.getOperand(0).getReg();
3232     break;
3233   }
3234   case AMDGPU::G_UREM: {
3235     DstRemReg = MI.getOperand(0).getReg();
3236     break;
3237   }
3238   case AMDGPU::G_UDIVREM: {
3239     DstDivReg = MI.getOperand(0).getReg();
3240     DstRemReg = MI.getOperand(1).getReg();
3241     break;
3242   }
3243   }
3244 
3245   const LLT S64 = LLT::scalar(64);
3246   const LLT S32 = LLT::scalar(32);
3247   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3248   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3249   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3250   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3251 
3252   if (Ty == S32)
3253     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3254   else if (Ty == S64)
3255     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3256   else
3257     return false;
3258 
3259   MI.eraseFromParent();
3260   return true;
3261 }
3262 
3263 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3264                                                 MachineRegisterInfo &MRI,
3265                                                 MachineIRBuilder &B) const {
3266   const LLT S64 = LLT::scalar(64);
3267   const LLT S32 = LLT::scalar(32);
3268 
3269   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3270   if (Ty != S32 && Ty != S64)
3271     return false;
3272 
3273   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3274   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3275   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3276 
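  // Branchlessly take the absolute value of both operands: with
  // sign = v >> (bits - 1), abs(v) = (v + sign) ^ sign. The saved signs are
  // used below to restore the signs of the quotient and remainder.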
3277   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3278   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3279   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3280 
3281   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3282   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3283 
3284   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3285   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3286 
3287   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3288   switch (MI.getOpcode()) {
3289   default:
3290     llvm_unreachable("Unexpected opcode!");
3291   case AMDGPU::G_SDIV: {
3292     DstDivReg = MI.getOperand(0).getReg();
3293     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3294     break;
3295   }
3296   case AMDGPU::G_SREM: {
3297     DstRemReg = MI.getOperand(0).getReg();
3298     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3299     break;
3300   }
3301   case AMDGPU::G_SDIVREM: {
3302     DstDivReg = MI.getOperand(0).getReg();
3303     DstRemReg = MI.getOperand(1).getReg();
3304     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3305     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3306     break;
3307   }
3308   }
3309 
3310   if (Ty == S32)
3311     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3312   else
3313     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3314 
3315   if (DstDivReg) {
3316     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3317     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3318     B.buildSub(DstDivReg, SignXor, Sign);
3319   }
3320 
3321   if (DstRemReg) {
3322     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3323     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3324     B.buildSub(DstRemReg, SignXor, Sign);
3325   }
3326 
3327   MI.eraseFromParent();
3328   return true;
3329 }
3330 
3331 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3332                                                  MachineRegisterInfo &MRI,
3333                                                  MachineIRBuilder &B) const {
3334   Register Res = MI.getOperand(0).getReg();
3335   Register LHS = MI.getOperand(1).getReg();
3336   Register RHS = MI.getOperand(2).getReg();
3337   uint16_t Flags = MI.getFlags();
3338   LLT ResTy = MRI.getType(Res);
3339 
3340   const MachineFunction &MF = B.getMF();
3341   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3342                             MI.getFlag(MachineInstr::FmAfn);
3343 
3344   if (!AllowInaccurateRcp)
3345     return false;
3346 
3347   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3348     // 1 / x -> RCP(x)
3349     if (CLHS->isExactlyValue(1.0)) {
3350       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3351         .addUse(RHS)
3352         .setMIFlags(Flags);
3353 
3354       MI.eraseFromParent();
3355       return true;
3356     }
3357 
3358     // -1 / x -> RCP( FNEG(x) )
3359     if (CLHS->isExactlyValue(-1.0)) {
3360       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3361       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3362         .addUse(FNeg.getReg(0))
3363         .setMIFlags(Flags);
3364 
3365       MI.eraseFromParent();
3366       return true;
3367     }
3368   }
3369 
3370   // x / y -> x * (1.0 / y)
3371   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3372     .addUse(RHS)
3373     .setMIFlags(Flags);
3374   B.buildFMul(Res, LHS, RCP, Flags);
3375 
3376   MI.eraseFromParent();
3377   return true;
3378 }
3379 
3380 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3381                                                    MachineRegisterInfo &MRI,
3382                                                    MachineIRBuilder &B) const {
3383   Register Res = MI.getOperand(0).getReg();
3384   Register X = MI.getOperand(1).getReg();
3385   Register Y = MI.getOperand(2).getReg();
3386   uint16_t Flags = MI.getFlags();
3387   LLT ResTy = MRI.getType(Res);
3388 
3389   const MachineFunction &MF = B.getMF();
3390   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3391                             MI.getFlag(MachineInstr::FmAfn);
3392 
3393   if (!AllowInaccurateRcp)
3394     return false;
3395 
3396   auto NegY = B.buildFNeg(ResTy, Y);
3397   auto One = B.buildFConstant(ResTy, 1.0);
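  // Newton-Raphson refinement of r ~= 1/y: r' = fma(fma(-y, r, 1), r, r),
  // i.e. r + r * (1 - y * r). Two iterations are emitted, followed by a
  // residual correction on the product x * r.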
3398 
3399   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3400     .addUse(Y)
3401     .setMIFlags(Flags);
3402 
3403   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3404   R = B.buildFMA(ResTy, Tmp0, R, R);
3405 
3406   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3407   R = B.buildFMA(ResTy, Tmp1, R, R);
3408 
3409   auto Ret = B.buildFMul(ResTy, X, R);
3410   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3411 
3412   B.buildFMA(Res, Tmp2, R, Ret);
3413   MI.eraseFromParent();
3414   return true;
3415 }
3416 
3417 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3418                                          MachineRegisterInfo &MRI,
3419                                          MachineIRBuilder &B) const {
3420   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3421     return true;
3422 
3423   Register Res = MI.getOperand(0).getReg();
3424   Register LHS = MI.getOperand(1).getReg();
3425   Register RHS = MI.getOperand(2).getReg();
3426 
3427   uint16_t Flags = MI.getFlags();
3428 
3429   LLT S16 = LLT::scalar(16);
3430   LLT S32 = LLT::scalar(32);
3431 
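  // Expand the f16 division in f32: extend both operands, multiply the
  // numerator by an f32 rcp of the denominator, truncate the quotient back to
  // f16, and let amdgcn.div.fixup handle the special-case inputs.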
3432   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3433   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3434 
3435   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3436     .addUse(RHSExt.getReg(0))
3437     .setMIFlags(Flags);
3438 
3439   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3440   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3441 
3442   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3443     .addUse(RDst.getReg(0))
3444     .addUse(RHS)
3445     .addUse(LHS)
3446     .setMIFlags(Flags);
3447 
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; when it is false, restore the function's default FP32
// denorm mode from \p Mode.
3454 static void toggleSPDenormMode(bool Enable,
3455                                MachineIRBuilder &B,
3456                                const GCNSubtarget &ST,
3457                                AMDGPU::SIModeRegisterDefaults Mode) {
3458   // Set SP denorm mode to this value.
3459   unsigned SPDenormMode =
3460     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3461 
3462   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3464     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3465 
3466     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3467     B.buildInstr(AMDGPU::S_DENORM_MODE)
3468       .addImm(NewDenormModeValue);
3469 
3470   } else {
3471     // Select FP32 bit field in mode register.
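    // This is hwreg(HW_REG_MODE, 4, 2): a two-bit field at offset 4, which
    // holds the FP32 denorm controls (WIDTH_M1 encodes width - 1).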
3472     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3473                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3474                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3475 
3476     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3477       .addImm(SPDenormMode)
3478       .addImm(SPDenormModeBitField);
3479   }
3480 }
3481 
3482 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3483                                          MachineRegisterInfo &MRI,
3484                                          MachineIRBuilder &B) const {
3485   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3486     return true;
3487 
3488   Register Res = MI.getOperand(0).getReg();
3489   Register LHS = MI.getOperand(1).getReg();
3490   Register RHS = MI.getOperand(2).getReg();
3491   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3492   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3493 
3494   uint16_t Flags = MI.getFlags();
3495 
3496   LLT S32 = LLT::scalar(32);
3497   LLT S1 = LLT::scalar(1);
3498 
3499   auto One = B.buildFConstant(S32, 1.0f);
3500 
3501   auto DenominatorScaled =
3502     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3503       .addUse(LHS)
3504       .addUse(RHS)
3505       .addImm(0)
3506       .setMIFlags(Flags);
3507   auto NumeratorScaled =
3508     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3509       .addUse(LHS)
3510       .addUse(RHS)
3511       .addImm(1)
3512       .setMIFlags(Flags);
3513 
3514   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3515     .addUse(DenominatorScaled.getReg(0))
3516     .setMIFlags(Flags);
3517   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
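  // The FMA chain below is a Newton-Raphson refinement of the scaled
  // reciprocal; FP32 denormals are temporarily enabled around it so the
  // intermediate results are not flushed. div_fmas and div_fixup then combine
  // the pieces and handle the special cases.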
3518 
3519   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3520   // aren't modeled as reading it.
3521   if (!Mode.allFP32Denormals())
3522     toggleSPDenormMode(true, B, ST, Mode);
3523 
3524   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3525   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3526   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3527   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3528   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3529   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3530 
3531   if (!Mode.allFP32Denormals())
3532     toggleSPDenormMode(false, B, ST, Mode);
3533 
3534   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3535     .addUse(Fma4.getReg(0))
3536     .addUse(Fma1.getReg(0))
3537     .addUse(Fma3.getReg(0))
3538     .addUse(NumeratorScaled.getReg(1))
3539     .setMIFlags(Flags);
3540 
3541   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3542     .addUse(Fmas.getReg(0))
3543     .addUse(RHS)
3544     .addUse(LHS)
3545     .setMIFlags(Flags);
3546 
3547   MI.eraseFromParent();
3548   return true;
3549 }
3550 
3551 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3552                                          MachineRegisterInfo &MRI,
3553                                          MachineIRBuilder &B) const {
3554   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3555     return true;
3556 
3557   Register Res = MI.getOperand(0).getReg();
3558   Register LHS = MI.getOperand(1).getReg();
3559   Register RHS = MI.getOperand(2).getReg();
3560 
3561   uint16_t Flags = MI.getFlags();
3562 
3563   LLT S64 = LLT::scalar(64);
3564   LLT S1 = LLT::scalar(1);
3565 
3566   auto One = B.buildFConstant(S64, 1.0);
3567 
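  // Same overall structure as the f32 path: refine rcp(div_scale(rhs)) with a
  // chain of FMAs, multiply by the scaled numerator from the second div_scale,
  // and let div_fmas/div_fixup produce the final result.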
3568   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3569     .addUse(LHS)
3570     .addUse(RHS)
3571     .addImm(0)
3572     .setMIFlags(Flags);
3573 
3574   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3575 
3576   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3577     .addUse(DivScale0.getReg(0))
3578     .setMIFlags(Flags);
3579 
3580   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3581   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3582   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3583 
3584   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3585     .addUse(LHS)
3586     .addUse(RHS)
3587     .addImm(1)
3588     .setMIFlags(Flags);
3589 
3590   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3591   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3592   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3593 
3594   Register Scale;
3595   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3598 
3599     LLT S32 = LLT::scalar(32);
3600 
3601     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3602     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3603     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3604     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3605 
3606     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3607                               Scale1Unmerge.getReg(1));
3608     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3609                               Scale0Unmerge.getReg(1));
3610     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3611   } else {
3612     Scale = DivScale1.getReg(1);
3613   }
3614 
3615   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3616     .addUse(Fma4.getReg(0))
3617     .addUse(Fma3.getReg(0))
3618     .addUse(Mul.getReg(0))
3619     .addUse(Scale)
3620     .setMIFlags(Flags);
3621 
3622   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3623     .addUse(Fmas.getReg(0))
3624     .addUse(RHS)
3625     .addUse(LHS)
3626     .setMIFlags(Flags);
3627 
3628   MI.eraseFromParent();
3629   return true;
3630 }
3631 
3632 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3633                                                  MachineRegisterInfo &MRI,
3634                                                  MachineIRBuilder &B) const {
3635   Register Res = MI.getOperand(0).getReg();
3636   Register LHS = MI.getOperand(2).getReg();
3637   Register RHS = MI.getOperand(3).getReg();
3638   uint16_t Flags = MI.getFlags();
3639 
3640   LLT S32 = LLT::scalar(32);
3641   LLT S1 = LLT::scalar(1);
3642 
3643   auto Abs = B.buildFAbs(S32, RHS, Flags);
3644   const APFloat C0Val(1.0f);
3645 
3646   auto C0 = B.buildConstant(S32, 0x6f800000);
3647   auto C1 = B.buildConstant(S32, 0x2f800000);
3648   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
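  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |rhs| is very large,
  // pre-scale it by 2^-32 so the rcp result stays well away from the denormal
  // range, then multiply the final product by the same factor to compensate.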
3649 
3650   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3651   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3652 
3653   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3654 
3655   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3656     .addUse(Mul0.getReg(0))
3657     .setMIFlags(Flags);
3658 
3659   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3660 
3661   B.buildFMul(Res, Sel, Mul1, Flags);
3662 
3663   MI.eraseFromParent();
3664   return true;
3665 }
3666 
3667 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3668 // FIXME: Why do we handle this one but not other removed instructions?
3669 //
3670 // Reciprocal square root.  The clamp prevents infinite results, clamping
3671 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3672 // +-max_float.
3673 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3674                                                     MachineRegisterInfo &MRI,
3675                                                     MachineIRBuilder &B) const {
3676   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3677     return true;
3678 
3679   Register Dst = MI.getOperand(0).getReg();
3680   Register Src = MI.getOperand(2).getReg();
3681   auto Flags = MI.getFlags();
3682 
3683   LLT Ty = MRI.getType(Dst);
3684 
3685   const fltSemantics *FltSemantics;
3686   if (Ty == LLT::scalar(32))
3687     FltSemantics = &APFloat::IEEEsingle();
3688   else if (Ty == LLT::scalar(64))
3689     FltSemantics = &APFloat::IEEEdouble();
3690   else
3691     return false;
3692 
3693   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3694     .addUse(Src)
3695     .setMIFlags(Flags);
3696 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq result is already quieted (or not); use the variant that
  // will directly select.
3699   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3700   const bool UseIEEE = MFI->getMode().IEEE;
3701 
3702   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3703   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3704                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3705 
3706   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3707 
3708   if (UseIEEE)
3709     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3710   else
3711     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3712   MI.eraseFromParent();
3713   return true;
3714 }
3715 
3716 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3717   switch (IID) {
3718   case Intrinsic::amdgcn_ds_fadd:
3719     return AMDGPU::G_ATOMICRMW_FADD;
3720   case Intrinsic::amdgcn_ds_fmin:
3721     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3722   case Intrinsic::amdgcn_ds_fmax:
3723     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3724   default:
3725     llvm_unreachable("not a DS FP intrinsic");
3726   }
3727 }
3728 
3729 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3730                                                       MachineInstr &MI,
3731                                                       Intrinsic::ID IID) const {
3732   GISelChangeObserver &Observer = Helper.Observer;
3733   Observer.changingInstr(MI);
3734 
3735   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3736 
3737   // The remaining operands were used to set fields in the MemOperand on
3738   // construction.
3739   for (int I = 6; I > 3; --I)
3740     MI.removeOperand(I);
3741 
3742   MI.removeOperand(1); // Remove the intrinsic ID.
3743   Observer.changedInstr(MI);
3744   return true;
3745 }
3746 
3747 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3748                                             MachineRegisterInfo &MRI,
3749                                             MachineIRBuilder &B) const {
3750   uint64_t Offset =
3751     ST.getTargetLowering()->getImplicitParameterOffset(
3752       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3753   LLT DstTy = MRI.getType(DstReg);
3754   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3755 
3756   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3757   if (!loadInputValue(KernargPtrReg, B,
3758                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3759     return false;
3760 
3761   // FIXME: This should be nuw
3762   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3763   return true;
3764 }
3765 
3766 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3767                                                  MachineRegisterInfo &MRI,
3768                                                  MachineIRBuilder &B) const {
3769   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3770   if (!MFI->isEntryFunction()) {
3771     return legalizePreloadedArgIntrin(MI, MRI, B,
3772                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3773   }
3774 
3775   Register DstReg = MI.getOperand(0).getReg();
3776   if (!getImplicitArgPtr(DstReg, MRI, B))
3777     return false;
3778 
3779   MI.eraseFromParent();
3780   return true;
3781 }
3782 
3783 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3784                                               MachineRegisterInfo &MRI,
3785                                               MachineIRBuilder &B,
3786                                               unsigned AddrSpace) const {
3787   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3788   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3789   Register Hi32 = Unmerge.getReg(1);
3790 
3791   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3792   MI.eraseFromParent();
3793   return true;
3794 }
3795 
3796 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3797 // offset (the offset that is included in bounds checking and swizzling, to be
3798 // split between the instruction's voffset and immoffset fields) and soffset
3799 // (the offset that is excluded from bounds checking and swizzling, to go in
3800 // the instruction's soffset field).  This function takes the first kind of
3801 // offset and figures out how to split it between voffset and immoffset.
3802 std::pair<Register, unsigned>
3803 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3804                                         Register OrigOffset) const {
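  // 4095 is the largest value that fits in the 12-bit unsigned immediate
  // offset field of the buffer instructions.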
3805   const unsigned MaxImm = 4095;
3806   Register BaseReg;
3807   unsigned ImmOffset;
3808   const LLT S32 = LLT::scalar(32);
3809   MachineRegisterInfo &MRI = *B.getMRI();
3810 
3811   std::tie(BaseReg, ImmOffset) =
3812       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3813 
3814   // If BaseReg is a pointer, convert it to int.
3815   if (MRI.getType(BaseReg).isPointer())
3816     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3817 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field, so that the value that
  // is copied/added for the voffset field is a multiple of 4096. A multiple of
  // 4096 stands more chance of being CSEd with the copy/add for another
  // similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
3825   unsigned Overflow = ImmOffset & ~MaxImm;
3826   ImmOffset -= Overflow;
3827   if ((int32_t)Overflow < 0) {
3828     Overflow += ImmOffset;
3829     ImmOffset = 0;
3830   }
3831 
3832   if (Overflow != 0) {
3833     if (!BaseReg) {
3834       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3835     } else {
3836       auto OverflowVal = B.buildConstant(S32, Overflow);
3837       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3838     }
3839   }
3840 
3841   if (!BaseReg)
3842     BaseReg = B.buildConstant(S32, 0).getReg(0);
3843 
3844   return std::make_pair(BaseReg, ImmOffset);
3845 }
3846 
3847 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3848 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3849                                           Register VOffset, Register SOffset,
3850                                           unsigned ImmOffset, Register VIndex,
3851                                           MachineRegisterInfo &MRI) const {
3852   Optional<ValueAndVReg> MaybeVOffsetVal =
3853       getIConstantVRegValWithLookThrough(VOffset, MRI);
3854   Optional<ValueAndVReg> MaybeSOffsetVal =
3855       getIConstantVRegValWithLookThrough(SOffset, MRI);
3856   Optional<ValueAndVReg> MaybeVIndexVal =
3857       getIConstantVRegValWithLookThrough(VIndex, MRI);
3858   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3859   // update the MMO with that offset. The stride is unknown so we can only do
3860   // this if VIndex is constant 0.
3861   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3862       MaybeVIndexVal->Value == 0) {
3863     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3864                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3865     MMO->setOffset(TotalOffset);
3866   } else {
3867     // We don't have a constant combined offset to use in the MMO. Give up.
3868     MMO->setValue((Value *)nullptr);
3869   }
3870 }
3871 
3872 /// Handle register layout difference for f16 images for some subtargets.
3873 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3874                                              MachineRegisterInfo &MRI,
3875                                              Register Reg,
3876                                              bool ImageStore) const {
3877   const LLT S16 = LLT::scalar(16);
3878   const LLT S32 = LLT::scalar(32);
3879   LLT StoreVT = MRI.getType(Reg);
3880   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3881 
3882   if (ST.hasUnpackedD16VMem()) {
3883     auto Unmerge = B.buildUnmerge(S16, Reg);
3884 
3885     SmallVector<Register, 4> WideRegs;
3886     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3887       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3888 
3889     int NumElts = StoreVT.getNumElements();
3890 
3891     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3892         .getReg(0);
3893   }
3894 
3895   if (ImageStore && ST.hasImageStoreD16Bug()) {
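    // The workaround widens the packed store data with undef padding so it
    // always occupies a whole number of 32-bit registers.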
3896     if (StoreVT.getNumElements() == 2) {
3897       SmallVector<Register, 4> PackedRegs;
3898       Reg = B.buildBitcast(S32, Reg).getReg(0);
3899       PackedRegs.push_back(Reg);
3900       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3901       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3902           .getReg(0);
3903     }
3904 
3905     if (StoreVT.getNumElements() == 3) {
3906       SmallVector<Register, 4> PackedRegs;
3907       auto Unmerge = B.buildUnmerge(S16, Reg);
3908       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3909         PackedRegs.push_back(Unmerge.getReg(I));
3910       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3911       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3912       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3913     }
3914 
3915     if (StoreVT.getNumElements() == 4) {
3916       SmallVector<Register, 4> PackedRegs;
3917       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3918       auto Unmerge = B.buildUnmerge(S32, Reg);
3919       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3920         PackedRegs.push_back(Unmerge.getReg(I));
3921       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3922       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3923           .getReg(0);
3924     }
3925 
3926     llvm_unreachable("invalid data type");
3927   }
3928 
3929   if (StoreVT == LLT::fixed_vector(3, S16)) {
3930     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3931               .getReg(0);
3932   }
3933   return Reg;
3934 }
3935 
3936 Register AMDGPULegalizerInfo::fixStoreSourceType(
3937   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3938   MachineRegisterInfo *MRI = B.getMRI();
3939   LLT Ty = MRI->getType(VData);
3940 
3941   const LLT S16 = LLT::scalar(16);
3942 
3943   // Fixup illegal register types for i8 stores.
3944   if (Ty == LLT::scalar(8) || Ty == S16) {
3945     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3946     return AnyExt;
3947   }
3948 
3949   if (Ty.isVector()) {
3950     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3951       if (IsFormat)
3952         return handleD16VData(B, *MRI, VData);
3953     }
3954   }
3955 
3956   return VData;
3957 }
3958 
3959 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3960                                               MachineRegisterInfo &MRI,
3961                                               MachineIRBuilder &B,
3962                                               bool IsTyped,
3963                                               bool IsFormat) const {
3964   Register VData = MI.getOperand(1).getReg();
3965   LLT Ty = MRI.getType(VData);
3966   LLT EltTy = Ty.getScalarType();
3967   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3968   const LLT S32 = LLT::scalar(32);
3969 
3970   VData = fixStoreSourceType(B, VData, IsFormat);
3971   Register RSrc = MI.getOperand(2).getReg();
3972 
3973   MachineMemOperand *MMO = *MI.memoperands_begin();
3974   const int MemSize = MMO->getSize();
3975 
3976   unsigned ImmOffset;
3977 
3978   // The typed intrinsics add an immediate after the registers.
3979   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3980 
3981   // The struct intrinsic variants add one additional operand over raw.
3982   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3983   Register VIndex;
3984   int OpOffset = 0;
3985   if (HasVIndex) {
3986     VIndex = MI.getOperand(3).getReg();
3987     OpOffset = 1;
3988   } else {
3989     VIndex = B.buildConstant(S32, 0).getReg(0);
3990   }
3991 
3992   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3993   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3994 
3995   unsigned Format = 0;
3996   if (IsTyped) {
3997     Format = MI.getOperand(5 + OpOffset).getImm();
3998     ++OpOffset;
3999   }
4000 
4001   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4002 
4003   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4004   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4005 
4006   unsigned Opc;
4007   if (IsTyped) {
4008     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4009                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4010   } else if (IsFormat) {
4011     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4012                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4013   } else {
4014     switch (MemSize) {
4015     case 1:
4016       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4017       break;
4018     case 2:
4019       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4020       break;
4021     default:
4022       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4023       break;
4024     }
4025   }
4026 
4027   auto MIB = B.buildInstr(Opc)
4028     .addUse(VData)              // vdata
4029     .addUse(RSrc)               // rsrc
4030     .addUse(VIndex)             // vindex
4031     .addUse(VOffset)            // voffset
4032     .addUse(SOffset)            // soffset
4033     .addImm(ImmOffset);         // offset(imm)
4034 
4035   if (IsTyped)
4036     MIB.addImm(Format);
4037 
4038   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4039      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4040      .addMemOperand(MMO);
4041 
4042   MI.eraseFromParent();
4043   return true;
4044 }
4045 
4046 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4047                                              MachineRegisterInfo &MRI,
4048                                              MachineIRBuilder &B,
4049                                              bool IsFormat,
4050                                              bool IsTyped) const {
4051   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4052   MachineMemOperand *MMO = *MI.memoperands_begin();
4053   const LLT MemTy = MMO->getMemoryType();
4054   const LLT S32 = LLT::scalar(32);
4055 
4056   Register Dst = MI.getOperand(0).getReg();
4057   Register RSrc = MI.getOperand(2).getReg();
4058 
4059   // The typed intrinsics add an immediate after the registers.
4060   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4061 
4062   // The struct intrinsic variants add one additional operand over raw.
4063   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4064   Register VIndex;
4065   int OpOffset = 0;
4066   if (HasVIndex) {
4067     VIndex = MI.getOperand(3).getReg();
4068     OpOffset = 1;
4069   } else {
4070     VIndex = B.buildConstant(S32, 0).getReg(0);
4071   }
4072 
4073   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4074   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4075 
4076   unsigned Format = 0;
4077   if (IsTyped) {
4078     Format = MI.getOperand(5 + OpOffset).getImm();
4079     ++OpOffset;
4080   }
4081 
4082   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4083   unsigned ImmOffset;
4084 
4085   LLT Ty = MRI.getType(Dst);
4086   LLT EltTy = Ty.getScalarType();
4087   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4088   const bool Unpacked = ST.hasUnpackedD16VMem();
4089 
4090   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4091   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4092 
4093   unsigned Opc;
4094 
4095   if (IsTyped) {
4096     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4097                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4098   } else if (IsFormat) {
4099     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4100                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4101   } else {
4102     switch (MemTy.getSizeInBits()) {
4103     case 8:
4104       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4105       break;
4106     case 16:
4107       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4108       break;
4109     default:
4110       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4111       break;
4112     }
4113   }
4114 
4115   Register LoadDstReg;
4116 
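  // Sub-dword results (and scalar D16 results) are produced in a 32-bit
  // register first and truncated back to the requested type after the load;
  // packed D16 vectors on unpacked subtargets are repacked below.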
4117   bool IsExtLoad =
4118       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4119   LLT UnpackedTy = Ty.changeElementSize(32);
4120 
4121   if (IsExtLoad)
4122     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4123   else if (Unpacked && IsD16 && Ty.isVector())
4124     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4125   else
4126     LoadDstReg = Dst;
4127 
4128   auto MIB = B.buildInstr(Opc)
4129     .addDef(LoadDstReg)         // vdata
4130     .addUse(RSrc)               // rsrc
4131     .addUse(VIndex)             // vindex
4132     .addUse(VOffset)            // voffset
4133     .addUse(SOffset)            // soffset
4134     .addImm(ImmOffset);         // offset(imm)
4135 
4136   if (IsTyped)
4137     MIB.addImm(Format);
4138 
4139   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4140      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4141      .addMemOperand(MMO);
4142 
4143   if (LoadDstReg != Dst) {
4144     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4145 
    // The result was widened for the extending load; truncate back to the
    // original type.
4147     if (IsExtLoad)
4148       B.buildTrunc(Dst, LoadDstReg);
4149     else {
4150       // Repack to original 16-bit vector result
4151       // FIXME: G_TRUNC should work, but legalization currently fails
4152       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4153       SmallVector<Register, 4> Repack;
4154       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4155         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4156       B.buildMerge(Dst, Repack);
4157     }
4158   }
4159 
4160   MI.eraseFromParent();
4161   return true;
4162 }
4163 
4164 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4165                                                MachineIRBuilder &B,
4166                                                bool IsInc) const {
4167   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4168                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4169   B.buildInstr(Opc)
4170     .addDef(MI.getOperand(0).getReg())
4171     .addUse(MI.getOperand(2).getReg())
4172     .addUse(MI.getOperand(3).getReg())
4173     .cloneMemRefs(MI);
4174   MI.eraseFromParent();
4175   return true;
4176 }
4177 
4178 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4179   switch (IntrID) {
4180   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4181   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4182     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4183   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4184   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4185     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4186   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4187   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4188     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4189   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4190   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4191     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4192   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4193   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4194     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4195   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4196   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4197     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4198   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4199   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4200     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4201   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4202   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4203     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4204   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4205   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4206     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4207   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4208   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4209     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4210   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4211   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4212     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4213   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4214   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4215     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4216   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4217   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4218     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4219   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4220   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4221     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4222   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4223   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4224     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4225   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4226   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4227     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4228   default:
4229     llvm_unreachable("unhandled atomic opcode");
4230   }
4231 }
4232 
4233 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4234                                                MachineIRBuilder &B,
4235                                                Intrinsic::ID IID) const {
4236   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4237                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4238   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4239 
4240   Register Dst;
4241 
4242   int OpOffset = 0;
4243   if (HasReturn) {
4244     // A few FP atomics do not support return values.
4245     Dst = MI.getOperand(0).getReg();
4246   } else {
4247     OpOffset = -1;
4248   }
4249 
4250   Register VData = MI.getOperand(2 + OpOffset).getReg();
4251   Register CmpVal;
4252 
4253   if (IsCmpSwap) {
4254     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4255     ++OpOffset;
4256   }
4257 
4258   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4259   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4260 
4261   // The struct intrinsic variants add one additional operand over raw.
4262   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4263   Register VIndex;
4264   if (HasVIndex) {
4265     VIndex = MI.getOperand(4 + OpOffset).getReg();
4266     ++OpOffset;
4267   } else {
4268     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4269   }
4270 
4271   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4272   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4273   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4274 
4275   MachineMemOperand *MMO = *MI.memoperands_begin();
4276 
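  // Split the variable offset into a register component and an immediate that
  // fits the instruction encoding, then update the memory operand to reflect
  // the final addressing operands.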
4277   unsigned ImmOffset;
4278   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4279   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4280 
4281   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4282 
4283   if (HasReturn)
4284     MIB.addDef(Dst);
4285 
4286   MIB.addUse(VData); // vdata
4287 
4288   if (IsCmpSwap)
4289     MIB.addReg(CmpVal);
4290 
4291   MIB.addUse(RSrc)               // rsrc
4292      .addUse(VIndex)             // vindex
4293      .addUse(VOffset)            // voffset
4294      .addUse(SOffset)            // soffset
4295      .addImm(ImmOffset)          // offset(imm)
4296      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4297      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4298      .addMemOperand(MMO);
4299 
4300   MI.eraseFromParent();
4301   return true;
4302 }
4303 
/// Pack the s16-typed address operands of \p MI into dword-sized registers
/// with s16-typed elements, appending the results to \p PackedAddrs.
4306 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4307                                       SmallVectorImpl<Register> &PackedAddrs,
4308                                       unsigned ArgOffset,
4309                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4310                                       bool IsA16, bool IsG16) {
4311   const LLT S16 = LLT::scalar(16);
4312   const LLT V2S16 = LLT::fixed_vector(2, 16);
4313   auto EndIdx = Intr->VAddrEnd;
4314 
4315   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4316     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4317     if (!SrcOp.isReg())
4318       continue; // _L to _LZ may have eliminated this.
4319 
4320     Register AddrReg = SrcOp.getReg();
4321 
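    // Operands before the gradients, 32-bit gradients when G16 is off, and
    // 32-bit coordinates when A16 is off already occupy a full dword and are
    // not paired with a neighboring operand.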
4322     if ((I < Intr->GradientStart) ||
4323         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4324         (I >= Intr->CoordStart && !IsA16)) {
4325       if ((I < Intr->GradientStart) && IsA16 &&
4326           (B.getMRI()->getType(AddrReg) == S16)) {
4327         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. The bias is of type half,
        // but it occupies a full 32-bit dword.
4330         PackedAddrs.push_back(
4331             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4332                 .getReg(0));
4333       } else {
4334         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4335                "Bias needs to be converted to 16 bit in A16 mode");
4336         // Handle any gradient or coordinate operands that should not be packed
4337         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4338         PackedAddrs.push_back(AddrReg);
4339       }
4340     } else {
4341       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4342       // derivatives dx/dh and dx/dv are packed with undef.
4343       if (((I + 1) >= EndIdx) ||
4344           ((Intr->NumGradients / 2) % 2 == 1 &&
4345            (I == static_cast<unsigned>(Intr->GradientStart +
4346                                        (Intr->NumGradients / 2) - 1) ||
4347             I == static_cast<unsigned>(Intr->GradientStart +
4348                                        Intr->NumGradients - 1))) ||
4349           // Check for _L to _LZ optimization
4350           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4351         PackedAddrs.push_back(
4352             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4353                 .getReg(0));
4354       } else {
4355         PackedAddrs.push_back(
4356             B.buildBuildVector(
4357                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4358                 .getReg(0));
4359         ++I;
4360       }
4361     }
4362   }
4363 }
4364 
4365 /// Convert from separate vaddr components to a single vector address register,
4366 /// and replace the remaining operands with $noreg.
4367 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4368                                      int DimIdx, int NumVAddrs) {
4369   const LLT S32 = LLT::scalar(32);
4370 
4371   SmallVector<Register, 8> AddrRegs;
4372   for (int I = 0; I != NumVAddrs; ++I) {
4373     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4374     if (SrcOp.isReg()) {
4375       AddrRegs.push_back(SrcOp.getReg());
4376       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4377     }
4378   }
4379 
4380   int NumAddrRegs = AddrRegs.size();
4381   if (NumAddrRegs != 1) {
    // With more than 8 elements, round up to the next power of 2 (i.e. 16).
4383     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4384       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4385       auto Undef = B.buildUndef(S32);
4386       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4387       NumAddrRegs = RoundedNumRegs;
4388     }
4389 
4390     auto VAddr =
4391         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4392     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4393   }
4394 
4395   for (int I = 1; I != NumVAddrs; ++I) {
4396     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4397     if (SrcOp.isReg())
4398       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4399   }
4400 }
4401 
4402 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4403 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
4408 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
4415 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4416     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4417     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4418 
4419   const unsigned NumDefs = MI.getNumExplicitDefs();
4420   const unsigned ArgOffset = NumDefs + 1;
4421   bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or that need to repack the TFE
  // result.
4424 
4425   // TODO: Do we need to guard against already legalized intrinsics?
4426   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4427       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4428 
4429   MachineRegisterInfo *MRI = B.getMRI();
4430   const LLT S32 = LLT::scalar(32);
4431   const LLT S16 = LLT::scalar(16);
4432   const LLT V2S16 = LLT::fixed_vector(2, 16);
4433 
4434   unsigned DMask = 0;
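  // For stores the data operand immediately follows the intrinsic ID;
  // otherwise use the result def to determine the data type.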
4435   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4436   LLT Ty = MRI->getType(VData);
4437 
  // Check for 16-bit addresses and pack them if found.
4439   LLT GradTy =
4440       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4441   LLT AddrTy =
4442       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4443   const bool IsG16 = GradTy == S16;
4444   const bool IsA16 = AddrTy == S16;
4445   const bool IsD16 = Ty.getScalarType() == S16;
4446 
4447   int DMaskLanes = 0;
4448   if (!BaseOpcode->Atomic) {
4449     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4450     if (BaseOpcode->Gather4) {
4451       DMaskLanes = 4;
4452     } else if (DMask != 0) {
4453       DMaskLanes = countPopulation(DMask);
4454     } else if (!IsTFE && !BaseOpcode->Store) {
4455       // If dmask is 0, this is a no-op load. This can be eliminated.
4456       B.buildUndef(MI.getOperand(0));
4457       MI.eraseFromParent();
4458       return true;
4459     }
4460   }
4461 
4462   Observer.changingInstr(MI);
4463   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4464 
4465   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4466                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4467   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4468                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4469   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4470 
4471   // Track that we legalized this
4472   MI.setDesc(B.getTII().get(NewOpcode));
4473 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4476   if (IsTFE && DMask == 0) {
4477     DMask = 0x1;
4478     DMaskLanes = 1;
4479     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4480   }
4481 
4482   if (BaseOpcode->Atomic) {
4483     Register VData0 = MI.getOperand(2).getReg();
4484     LLT Ty = MRI->getType(VData0);
4485 
4486     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4487     if (Ty.isVector())
4488       return false;
4489 
4490     if (BaseOpcode->AtomicX2) {
4491       Register VData1 = MI.getOperand(3).getReg();
4492       // The two values are packed in one register.
4493       LLT PackedTy = LLT::fixed_vector(2, Ty);
4494       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4495       MI.getOperand(2).setReg(Concat.getReg(0));
4496       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4497     }
4498   }
4499 
4500   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4501 
4502   // Rewrite the addressing register layout before doing anything else.
4503   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control, so
    // both gradients and addresses must be 16-bit.
4506     return false;
4507   }
4508 
4509   if (IsA16 && !ST.hasA16()) {
4510     // A16 not supported
4511     return false;
4512   }
4513 
4514   if (IsA16 || IsG16) {
4515     if (Intr->NumVAddrs > 1) {
4516       SmallVector<Register, 4> PackedRegs;
4517 
4518       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4519                                 IsG16);
4520 
4521       // See also below in the non-a16 branch
4522       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4523                           PackedRegs.size() <= ST.getNSAMaxSize();
4524 
4525       if (!UseNSA && PackedRegs.size() > 1) {
4526         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4527         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4528         PackedRegs[0] = Concat.getReg(0);
4529         PackedRegs.resize(1);
4530       }
4531 
4532       const unsigned NumPacked = PackedRegs.size();
4533       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4534         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4535         if (!SrcOp.isReg()) {
4536           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4537           continue;
4538         }
4539 
4540         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4541 
4542         if (I - Intr->VAddrStart < NumPacked)
4543           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4544         else
4545           SrcOp.setReg(AMDGPU::NoRegister);
4546       }
4547     }
4548   } else {
4549     // If the register allocator cannot place the address registers contiguously
4550     // without introducing moves, then using the non-sequential address encoding
4551     // is always preferable, since it saves VALU instructions and is usually a
4552     // wash in terms of code size or even better.
4553     //
4554     // However, we currently have no way of hinting to the register allocator
4555     // that MIMG addresses should be placed contiguously when it is possible to
4556     // do so, so force non-NSA for the common 2-address case as a heuristic.
4557     //
4558     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4559     // allocation when possible.
4560     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4561                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4562 
4563     if (!UseNSA && Intr->NumVAddrs > 1)
4564       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4565                                Intr->NumVAddrs);
4566   }
4567 
4568   int Flags = 0;
4569   if (IsA16)
4570     Flags |= 1;
4571   if (IsG16)
4572     Flags |= 2;
4573   MI.addOperand(MachineOperand::CreateImm(Flags));
4574 
4575   if (BaseOpcode->Store) { // No TFE for stores?
4576     // TODO: Handle dmask trim
4577     if (!Ty.isVector() || !IsD16)
4578       return true;
4579 
4580     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4581     if (RepackedReg != VData) {
4582       MI.getOperand(1).setReg(RepackedReg);
4583     }
4584 
4585     return true;
4586   }
4587 
4588   Register DstReg = MI.getOperand(0).getReg();
4589   const LLT EltTy = Ty.getScalarType();
4590   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4591 
4592   // Confirm that the return type is large enough for the dmask specified
4593   if (NumElts < DMaskLanes)
4594     return false;
4595 
4596   if (NumElts > 4 || DMaskLanes > 4)
4597     return false;
4598 
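  // Trim the result type down to the number of components the dmask actually
  // enables (treating a dmask of 0 as a single component).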
4599   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4600   const LLT AdjustedTy =
4601       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4602 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4606   LLT RoundedTy;
4607 
  // S32 vector to cover all data, plus the TFE result element.
4609   LLT TFETy;
4610 
4611   // Register type to use for each loaded component. Will be S32 or V2S16.
4612   LLT RegTy;
4613 
4614   if (IsD16 && ST.hasUnpackedD16VMem()) {
4615     RoundedTy =
4616         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4617     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4618     RegTy = S32;
4619   } else {
4620     unsigned EltSize = EltTy.getSizeInBits();
4621     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4622     unsigned RoundedSize = 32 * RoundedElts;
4623     RoundedTy = LLT::scalarOrVector(
4624         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4625     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4626     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4627   }
4628 
4629   // The return type does not need adjustment.
4630   // TODO: Should we change s16 case to s32 or <2 x s16>?
4631   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4632     return true;
4633 
4634   Register Dst1Reg;
4635 
4636   // Insert after the instruction.
4637   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4638 
4639   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4640   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
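  // LoadResultTy covers the raw data registers (plus the status dword when TFE
  // is on); ResultNumRegs is how many 32-bit registers that occupies.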
4641   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4642   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4643 
4644   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4645 
4646   MI.getOperand(0).setReg(NewResultReg);
4647 
4648   // In the IR, TFE is supposed to be used with a 2 element struct return
4649   // type. The instruction really returns these two values in one contiguous
4650   // register, with one additional dword beyond the loaded data. Rewrite the
4651   // return type to use a single register result.
4652 
4653   if (IsTFE) {
4654     Dst1Reg = MI.getOperand(1).getReg();
4655     if (MRI->getType(Dst1Reg) != S32)
4656       return false;
4657 
4658     // TODO: Make sure the TFE operand bit is set.
4659     MI.removeOperand(1);
4660 
4661     // Handle the easy case that requires no repack instructions.
4662     if (Ty == S32) {
4663       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4664       return true;
4665     }
4666   }
4667 
4668   // Now figure out how to copy the new result register back into the old
4669   // result.
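  // Initialize every slot with Dst1Reg so that, with TFE, the trailing status
  // dword is unmerged directly into its final destination below.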
4670   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4671 
4672   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4673 
4674   if (ResultNumRegs == 1) {
4675     assert(!IsTFE);
4676     ResultRegs[0] = NewResultReg;
4677   } else {
4678     // We have to repack into a new vector of some kind.
4679     for (int I = 0; I != NumDataRegs; ++I)
4680       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4681     B.buildUnmerge(ResultRegs, NewResultReg);
4682 
4683     // Drop the final TFE element to get the data part. The TFE result is
4684     // directly written to the right place already.
4685     if (IsTFE)
4686       ResultRegs.resize(NumDataRegs);
4687   }
4688 
4689   // For an s16 scalar result, we form an s32 result with a truncate regardless
4690   // of packed vs. unpacked.
4691   if (IsD16 && !Ty.isVector()) {
4692     B.buildTrunc(DstReg, ResultRegs[0]);
4693     return true;
4694   }
4695 
4696   // Avoid a build/concat_vector of 1 entry.
4697   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4698     B.buildBitcast(DstReg, ResultRegs[0]);
4699     return true;
4700   }
4701 
4702   assert(Ty.isVector());
4703 
4704   if (IsD16) {
4705     // For packed D16 results with TFE enabled, all the data components are
4706     // S32. Cast back to the expected type.
4707     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4710     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4711       for (Register &Reg : ResultRegs)
4712         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4713     } else if (ST.hasUnpackedD16VMem()) {
4714       for (Register &Reg : ResultRegs)
4715         Reg = B.buildTrunc(S16, Reg).getReg(0);
4716     }
4717   }
4718 
4719   auto padWithUndef = [&](LLT Ty, int NumElts) {
4720     if (NumElts == 0)
4721       return;
4722     Register Undef = B.buildUndef(Ty).getReg(0);
4723     for (int I = 0; I != NumElts; ++I)
4724       ResultRegs.push_back(Undef);
4725   };
4726 
4727   // Pad out any elements eliminated due to the dmask.
4728   LLT ResTy = MRI->getType(ResultRegs[0]);
4729   if (!ResTy.isVector()) {
4730     padWithUndef(ResTy, NumElts - ResultRegs.size());
4731     B.buildBuildVector(DstReg, ResultRegs);
4732     return true;
4733   }
4734 
4735   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4736   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4737 
4738   // Deal with the one annoying legal case.
4739   const LLT V3S16 = LLT::fixed_vector(3, 16);
4740   if (Ty == V3S16) {
4741     if (IsTFE) {
4742       if (ResultRegs.size() == 1) {
4743         NewResultReg = ResultRegs[0];
4744       } else if (ResultRegs.size() == 2) {
4745         LLT V4S16 = LLT::fixed_vector(4, 16);
4746         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4747       } else {
4748         return false;
4749       }
4750     }
4751 
4752     if (MRI->getType(DstReg).getNumElements() <
4753         MRI->getType(NewResultReg).getNumElements()) {
4754       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4755     } else {
4756       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4757     }
4758     return true;
4759   }
4760 
4761   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4762   B.buildConcatVectors(DstReg, ResultRegs);
4763   return true;
4764 }
4765 
4766 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4767   LegalizerHelper &Helper, MachineInstr &MI) const {
4768   MachineIRBuilder &B = Helper.MIRBuilder;
4769   GISelChangeObserver &Observer = Helper.Observer;
4770 
4771   Register Dst = MI.getOperand(0).getReg();
4772   LLT Ty = B.getMRI()->getType(Dst);
4773   unsigned Size = Ty.getSizeInBits();
4774   MachineFunction &MF = B.getMF();
4775 
4776   Observer.changingInstr(MI);
4777 
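  // If the result type is better handled as a 32-bit based register layout,
  // bitcast the destination first so the load itself produces that layout.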
4778   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4779     Ty = getBitcastRegisterType(Ty);
4780     Helper.bitcastDst(MI, Ty, 0);
4781     Dst = MI.getOperand(0).getReg();
4782     B.setInsertPt(B.getMBB(), MI);
4783   }
4784 
4785   // FIXME: We don't really need this intermediate instruction. The intrinsic
4786   // should be fixed to have a memory operand. Since it's readnone, we're not
4787   // allowed to add one.
4788   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4789   MI.removeOperand(1); // Remove intrinsic ID
4790 
4791   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4792   // TODO: Should this use datalayout alignment?
4793   const unsigned MemSize = (Size + 7) / 8;
4794   const Align MemAlign(4);
4795   MachineMemOperand *MMO = MF.getMachineMemOperand(
4796       MachinePointerInfo(),
4797       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4798           MachineMemOperand::MOInvariant,
4799       MemSize, MemAlign);
4800   MI.addMemOperand(MF, MMO);
4801 
4802   // There are no 96-bit result scalar loads, but widening to 128-bit should
4803   // always be legal. We may need to restore this to a 96-bit result if it turns
4804   // out this needs to be converted to a vector load during RegBankSelect.
4805   if (!isPowerOf2_32(Size)) {
4806     if (Ty.isVector())
4807       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4808     else
4809       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4810   }
4811 
4812   Observer.changedInstr(MI);
4813   return true;
4814 }
4815 
4816 // TODO: Move to selection
4817 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4818                                                 MachineRegisterInfo &MRI,
4819                                                 MachineIRBuilder &B) const {
4820   if (!ST.isTrapHandlerEnabled() ||
4821       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4822     return legalizeTrapEndpgm(MI, MRI, B);
4823 
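  // Code object v2/v3 always pass the queue pointer to the trap handler;
  // v4/v5 use the doorbell-based lowering when the subtarget can provide the
  // doorbell ID, and otherwise fall back to the queue pointer.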
4824   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4825     switch (*HsaAbiVer) {
4826     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4827     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4828       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4829     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4830     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4831       return ST.supportsGetDoorbellID() ?
4832           legalizeTrapHsa(MI, MRI, B) :
4833           legalizeTrapHsaQueuePtr(MI, MRI, B);
4834     }
4835   }
4836 
4837   llvm_unreachable("Unknown trap handler");
4838 }
4839 
4840 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4841     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4842   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4843   MI.eraseFromParent();
4844   return true;
4845 }
4846 
4847 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4848     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4849   MachineFunction &MF = B.getMF();
4850   const LLT S64 = LLT::scalar(64);
4851 
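  // The trap handler reads its input from SGPR0_SGPR1 (see the ABI reference
  // below).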
4852   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4853   // For code object version 5, queue_ptr is passed through implicit kernarg.
4854   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
4855     AMDGPUTargetLowering::ImplicitParameter Param =
4856         AMDGPUTargetLowering::QUEUE_PTR;
4857     uint64_t Offset =
4858         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
4859 
4860     Register KernargPtrReg = MRI.createGenericVirtualRegister(
4861         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4862 
4863     if (!loadInputValue(KernargPtrReg, B,
4864                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4865       return false;
4866 
4867     // TODO: can we be smarter about machine pointer info?
4868     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4869     MachineMemOperand *MMO = MF.getMachineMemOperand(
4870         PtrInfo,
4871         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4872             MachineMemOperand::MOInvariant,
4873         LLT::scalar(64), commonAlignment(Align(64), Offset));
4874 
    // Compute the address of the queue_ptr implicit kernel argument.
4876     Register LoadAddr = MRI.createGenericVirtualRegister(
4877         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4878     B.buildPtrAdd(LoadAddr, KernargPtrReg,
4879                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load the queue pointer and pass it to the trap handler in SGPR0_SGPR1.
4881     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
4882     B.buildCopy(SGPR01, Temp);
4883     B.buildInstr(AMDGPU::S_TRAP)
4884         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4885         .addReg(SGPR01, RegState::Implicit);
4886     MI.eraseFromParent();
4887     return true;
4888   }
4889 
4890   // Pass queue pointer to trap handler as input, and insert trap instruction
4891   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4892   Register LiveIn =
4893     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4894   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4895     return false;
4896 
4897   B.buildCopy(SGPR01, LiveIn);
4898   B.buildInstr(AMDGPU::S_TRAP)
4899       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4900       .addReg(SGPR01, RegState::Implicit);
4901 
4902   MI.eraseFromParent();
4903   return true;
4904 }
4905 
4906 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4907     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4908   B.buildInstr(AMDGPU::S_TRAP)
4909       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4910   MI.eraseFromParent();
4911   return true;
4912 }
4913 
4914 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4915     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4918   if (!ST.isTrapHandlerEnabled() ||
4919       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4920     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4921                                      "debugtrap handler not supported",
4922                                      MI.getDebugLoc(), DS_Warning);
4923     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4924     Ctx.diagnose(NoTrap);
4925   } else {
4926     // Insert debug-trap instruction
4927     B.buildInstr(AMDGPU::S_TRAP)
4928         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4929   }
4930 
4931   MI.eraseFromParent();
4932   return true;
4933 }
4934 
4935 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4936                                                MachineIRBuilder &B) const {
4937   MachineRegisterInfo &MRI = *B.getMRI();
4938   const LLT S16 = LLT::scalar(16);
4939   const LLT S32 = LLT::scalar(32);
4940 
4941   Register DstReg = MI.getOperand(0).getReg();
4942   Register NodePtr = MI.getOperand(2).getReg();
4943   Register RayExtent = MI.getOperand(3).getReg();
4944   Register RayOrigin = MI.getOperand(4).getReg();
4945   Register RayDir = MI.getOperand(5).getReg();
4946   Register RayInvDir = MI.getOperand(6).getReg();
4947   Register TDescr = MI.getOperand(7).getReg();
4948 
4949   if (!ST.hasGFX10_AEncoding()) {
4950     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4951                                         "intrinsic not supported on subtarget",
4952                                         MI.getDebugLoc());
4953     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4954     return false;
4955   }
4956 
4957   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4958   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4959   const unsigned NumVDataDwords = 4;
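  // Address dwords: node pointer (1 or 2), ray extent (1), ray origin (3),
  // then either three dwords of packed 16-bit dir/inv_dir or three dwords each
  // for the 32-bit dir and inv_dir.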
4960   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4961   const bool UseNSA =
4962       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4963   const unsigned BaseOpcodes[2][2] = {
4964       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4965       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4966        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4967   int Opcode;
4968   if (UseNSA) {
4969     Opcode =
4970         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4971                               NumVDataDwords, NumVAddrDwords);
4972   } else {
4973     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4974                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4975                                    PowerOf2Ceil(NumVAddrDwords));
4976   }
4977   assert(Opcode != -1);
4978 
4979   SmallVector<Register, 12> Ops;
4980   if (Is64) {
4981     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4982     Ops.push_back(Unmerge.getReg(0));
4983     Ops.push_back(Unmerge.getReg(1));
4984   } else {
4985     Ops.push_back(NodePtr);
4986   }
4987   Ops.push_back(RayExtent);
4988 
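  // Split a <3 x s32> source into three scalar dword operands.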
4989   auto packLanes = [&Ops, &S32, &B](Register Src) {
4990     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4991     Ops.push_back(Unmerge.getReg(0));
4992     Ops.push_back(Unmerge.getReg(1));
4993     Ops.push_back(Unmerge.getReg(2));
4994   };
4995 
4996   packLanes(RayOrigin);
4997   if (IsA16) {
4998     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4999     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5000     Register R1 = MRI.createGenericVirtualRegister(S32);
5001     Register R2 = MRI.createGenericVirtualRegister(S32);
5002     Register R3 = MRI.createGenericVirtualRegister(S32);
5003     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5004     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5005     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5006     Ops.push_back(R1);
5007     Ops.push_back(R2);
5008     Ops.push_back(R3);
5009   } else {
5010     packLanes(RayDir);
5011     packLanes(RayInvDir);
5012   }
5013 
5014   if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
5016     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5017     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5018     Ops.clear();
5019     Ops.push_back(MergedOps);
5020   }
5021 
5022   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5023     .addDef(DstReg)
5024     .addImm(Opcode);
5025 
5026   for (Register R : Ops) {
5027     MIB.addUse(R);
5028   }
5029 
5030   MIB.addUse(TDescr)
5031      .addImm(IsA16 ? 1 : 0)
5032      .cloneMemRefs(MI);
5033 
5034   MI.eraseFromParent();
5035   return true;
5036 }
5037 
5038 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
5039   B.buildConstant(MI.getOperand(0).getReg(), C);
5040   MI.eraseFromParent();
5041   return true;
5042 }
5043 
5044 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5045                                                MachineIRBuilder &B) const {
5046   unsigned Opc;
5047   int RoundMode = MI.getOperand(2).getImm();
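  // Only the towards-positive and towards-negative rounding modes have
  // dedicated pseudos; all other modes are rejected here.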
5048 
5049   if (RoundMode == (int)RoundingMode::TowardPositive)
5050     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5051   else if (RoundMode == (int)RoundingMode::TowardNegative)
5052     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5053   else
5054     return false;
5055 
5056   B.buildInstr(Opc)
5057       .addDef(MI.getOperand(0).getReg())
5058       .addUse(MI.getOperand(1).getReg());
5059 
5060   MI.eraseFromParent();
5061 
5062   return true;
5063 }
5064 
5065 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5066                                             MachineInstr &MI) const {
5067   MachineIRBuilder &B = Helper.MIRBuilder;
5068   MachineRegisterInfo &MRI = *B.getMRI();
5069 
  // Replace the G_BRCOND use with the exec-manipulating and branch pseudos.
5071   auto IntrID = MI.getIntrinsicID();
5072   switch (IntrID) {
5073   case Intrinsic::amdgcn_if:
5074   case Intrinsic::amdgcn_else: {
5075     MachineInstr *Br = nullptr;
5076     MachineBasicBlock *UncondBrTarget = nullptr;
5077     bool Negated = false;
5078     if (MachineInstr *BrCond =
5079             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5080       const SIRegisterInfo *TRI
5081         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5082 
5083       Register Def = MI.getOperand(1).getReg();
5084       Register Use = MI.getOperand(3).getReg();
5085 
5086       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5087 
5088       if (Negated)
5089         std::swap(CondBrTarget, UncondBrTarget);
5090 
5091       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5092       if (IntrID == Intrinsic::amdgcn_if) {
5093         B.buildInstr(AMDGPU::SI_IF)
5094           .addDef(Def)
5095           .addUse(Use)
5096           .addMBB(UncondBrTarget);
5097       } else {
5098         B.buildInstr(AMDGPU::SI_ELSE)
5099             .addDef(Def)
5100             .addUse(Use)
5101             .addMBB(UncondBrTarget);
5102       }
5103 
5104       if (Br) {
5105         Br->getOperand(0).setMBB(CondBrTarget);
5106       } else {
5107         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5108         // since we're swapping branch targets it needs to be reinserted.
5109         // FIXME: IRTranslator should probably not do this
5110         B.buildBr(*CondBrTarget);
5111       }
5112 
5113       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5114       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5115       MI.eraseFromParent();
5116       BrCond->eraseFromParent();
5117       return true;
5118     }
5119 
5120     return false;
5121   }
5122   case Intrinsic::amdgcn_loop: {
5123     MachineInstr *Br = nullptr;
5124     MachineBasicBlock *UncondBrTarget = nullptr;
5125     bool Negated = false;
5126     if (MachineInstr *BrCond =
5127             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5128       const SIRegisterInfo *TRI
5129         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5130 
5131       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5132       Register Reg = MI.getOperand(2).getReg();
5133 
5134       if (Negated)
5135         std::swap(CondBrTarget, UncondBrTarget);
5136 
5137       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5138       B.buildInstr(AMDGPU::SI_LOOP)
5139         .addUse(Reg)
5140         .addMBB(UncondBrTarget);
5141 
5142       if (Br)
5143         Br->getOperand(0).setMBB(CondBrTarget);
5144       else
5145         B.buildBr(*CondBrTarget);
5146 
5147       MI.eraseFromParent();
5148       BrCond->eraseFromParent();
5149       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5150       return true;
5151     }
5152 
5153     return false;
5154   }
5155   case Intrinsic::amdgcn_kernarg_segment_ptr:
5156     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5157       // This only makes sense to call in a kernel, so just lower to null.
5158       B.buildConstant(MI.getOperand(0).getReg(), 0);
5159       MI.eraseFromParent();
5160       return true;
5161     }
5162 
5163     return legalizePreloadedArgIntrin(
5164       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5165   case Intrinsic::amdgcn_implicitarg_ptr:
5166     return legalizeImplicitArgPtr(MI, MRI, B);
5167   case Intrinsic::amdgcn_workitem_id_x:
5168     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5169       return replaceWithConstant(B, MI, 0);
5170     return legalizePreloadedArgIntrin(MI, MRI, B,
5171                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5172   case Intrinsic::amdgcn_workitem_id_y:
5173     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5174       return replaceWithConstant(B, MI, 0);
5175 
5176     return legalizePreloadedArgIntrin(MI, MRI, B,
5177                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5178   case Intrinsic::amdgcn_workitem_id_z:
5179     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5180       return replaceWithConstant(B, MI, 0);
5181 
5182     return legalizePreloadedArgIntrin(MI, MRI, B,
5183                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5184   case Intrinsic::amdgcn_workgroup_id_x:
5185     return legalizePreloadedArgIntrin(MI, MRI, B,
5186                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5187   case Intrinsic::amdgcn_workgroup_id_y:
5188     return legalizePreloadedArgIntrin(MI, MRI, B,
5189                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5190   case Intrinsic::amdgcn_workgroup_id_z:
5191     return legalizePreloadedArgIntrin(MI, MRI, B,
5192                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5193   case Intrinsic::amdgcn_dispatch_ptr:
5194     return legalizePreloadedArgIntrin(MI, MRI, B,
5195                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5196   case Intrinsic::amdgcn_queue_ptr:
5197     return legalizePreloadedArgIntrin(MI, MRI, B,
5198                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5199   case Intrinsic::amdgcn_implicit_buffer_ptr:
5200     return legalizePreloadedArgIntrin(
5201       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5202   case Intrinsic::amdgcn_dispatch_id:
5203     return legalizePreloadedArgIntrin(MI, MRI, B,
5204                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5205   case Intrinsic::amdgcn_fdiv_fast:
5206     return legalizeFDIVFastIntrin(MI, MRI, B);
5207   case Intrinsic::amdgcn_is_shared:
5208     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5209   case Intrinsic::amdgcn_is_private:
5210     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5211   case Intrinsic::amdgcn_wavefrontsize: {
5212     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5213     MI.eraseFromParent();
5214     return true;
5215   }
5216   case Intrinsic::amdgcn_s_buffer_load:
5217     return legalizeSBufferLoad(Helper, MI);
5218   case Intrinsic::amdgcn_raw_buffer_store:
5219   case Intrinsic::amdgcn_struct_buffer_store:
5220     return legalizeBufferStore(MI, MRI, B, false, false);
5221   case Intrinsic::amdgcn_raw_buffer_store_format:
5222   case Intrinsic::amdgcn_struct_buffer_store_format:
5223     return legalizeBufferStore(MI, MRI, B, false, true);
5224   case Intrinsic::amdgcn_raw_tbuffer_store:
5225   case Intrinsic::amdgcn_struct_tbuffer_store:
5226     return legalizeBufferStore(MI, MRI, B, true, true);
5227   case Intrinsic::amdgcn_raw_buffer_load:
5228   case Intrinsic::amdgcn_struct_buffer_load:
5229     return legalizeBufferLoad(MI, MRI, B, false, false);
5230   case Intrinsic::amdgcn_raw_buffer_load_format:
5231   case Intrinsic::amdgcn_struct_buffer_load_format:
5232     return legalizeBufferLoad(MI, MRI, B, true, false);
5233   case Intrinsic::amdgcn_raw_tbuffer_load:
5234   case Intrinsic::amdgcn_struct_tbuffer_load:
5235     return legalizeBufferLoad(MI, MRI, B, true, true);
5236   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5237   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5238   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5239   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5240   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5241   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5242   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5243   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5244   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5245   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5246   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5247   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5248   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5249   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5250   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5251   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5252   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5253   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5254   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5255   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5256   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5257   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5258   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5259   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5260   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5261   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5262   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5263   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5264   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5265   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5266     return legalizeBufferAtomic(MI, B, IntrID);
5267   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5268   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5269     Register DstReg = MI.getOperand(0).getReg();
5270     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5271       Function &F = B.getMF().getFunction();
5272       DiagnosticInfoUnsupported NoFpRet(
5273           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5274           DS_Error);
5275       F.getContext().diagnose(NoFpRet);
5276       B.buildUndef(DstReg);
5277       MI.eraseFromParent();
5278       return true;
5279     }
5280 
5281     return legalizeBufferAtomic(MI, B, IntrID);
5282   }
5283   case Intrinsic::amdgcn_atomic_inc:
5284     return legalizeAtomicIncDec(MI, B, true);
5285   case Intrinsic::amdgcn_atomic_dec:
5286     return legalizeAtomicIncDec(MI, B, false);
5287   case Intrinsic::trap:
5288     return legalizeTrapIntrinsic(MI, MRI, B);
5289   case Intrinsic::debugtrap:
5290     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5291   case Intrinsic::amdgcn_rsq_clamp:
5292     return legalizeRsqClampIntrinsic(MI, MRI, B);
5293   case Intrinsic::amdgcn_ds_fadd:
5294   case Intrinsic::amdgcn_ds_fmin:
5295   case Intrinsic::amdgcn_ds_fmax:
5296     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5297   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5298     return legalizeBVHIntrinsic(MI, B);
5299   default: {
5300     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5301             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5302       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5303     return true;
5304   }
5305   }
5306 
5307   return true;
5308 }
5309