1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
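    // Break the value into roughly 64-bit pieces, rounding the size up, and
    // spread the elements across those pieces.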
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
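    // NextMul32 is the number of 32-bit words needed to cover the current
    // size; pad the element count so the new vector covers at least that many
    // words.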
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
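    // With packed (VOP3P) instructions, clamp 8-bit element vectors to two
    // elements and lower v2s8 directly rather than scalarizing it first.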
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
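    // Like the compares below, the boolean carry output is legal as either s1
    // (for VCC) or s32 (for SCC).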
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop forever
    // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
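    // Otherwise resize the integer operand to match the pointer size.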
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
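    // Otherwise resize the integer result to match the pointer size.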
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
  // With unaligned buffer access enabled there is no required alignment (0);
  // otherwise these global/constant cases require the listed alignment in
  // bits.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175               unsigned Align = Query.MMODescrs[0].AlignInBits;
1176               return std::make_pair(0, LLT::scalar(Align));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
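    // A 16-bit shift never needs an amount wider than 16 bits; everything else
    // uses a 32-bit amount.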
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
  // The result of G_EXTRACT_VECTOR_ELT must be the vector's element type.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1430           // TODO: verify immediate offset here since lower only works with
1431           // whole elements.
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return BigTy.isVector();
1434         })
1435       // FIXME: Multiples of 16 should not be legal.
1436       .legalIf([=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (BigTy.getSizeInBits() % 32 == 0) &&
1440                  (LitTy.getSizeInBits() % 16 == 0);
1441         })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
1456 
1457   }
1458 
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472 
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1476     BuildVector.minScalarOrElt(0, S32);
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480 
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485 
1486   BuildVector.legalIf(isRegisterType(0));
1487 
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine them out
  // pre-legalization.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520 
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525           const LLT BigTy = Query.Types[BigTyIdx];
1526           return BigTy.getSizeInBits() == 32;
1527         })
1528       // Try to widen to s16 first for small types.
1529       // TODO: Only do this on targets with legal s16 shifts
1530       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534                            elementTypeIs(1, S16)),
1535                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1539       .clampScalar(LitTyIdx, S32, S512)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541       // Break up vectors with weird elements into scalars
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544         scalarize(0))
1545       .fewerElementsIf(
1546         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547         scalarize(1))
1548       .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550     if (Op == G_MERGE_VALUES) {
1551       Builder.widenScalarIf(
1552         // TODO: Use 16-bit shifts if legal for 8-bit values?
1553         [=](const LegalityQuery &Query) {
1554           const LLT Ty = Query.Types[LitTyIdx];
1555           return Ty.getSizeInBits() < 32;
1556         },
1557         changeTo(LitTyIdx, S32));
1558     }
1559 
1560     Builder.widenScalarIf(
1561       [=](const LegalityQuery &Query) {
1562         const LLT Ty = Query.Types[BigTyIdx];
1563         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564           Ty.getSizeInBits() % 16 != 0;
1565       },
1566       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
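        // e.g. s65 widens to s128, but s300 rounds to s320 rather than s512.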
1569         const LLT &Ty = Query.Types[BigTyIdx];
1570         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571         if (NewSizeInBits >= 256) {
1572           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573           if (RoundedTo < NewSizeInBits)
1574             NewSizeInBits = RoundedTo;
1575         }
1576         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577       })
1578       // Any vectors left are the wrong size. Scalarize them.
1579       .scalarize(0)
1580       .scalarize(1);
1581   }
1582 
1583   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584   // RegBankSelect.
1585   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586     .legalFor({{S32}, {S64}});
1587 
1588   if (ST.hasVOP3PInsts()) {
1589     SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since the lowering expands to
      // shifts anyway.
1593       .clampMaxNumElementsStrict(0, S16, 2);
1594   } else if (ST.has16BitInsts()) {
1595     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1599     SextInReg.lowerFor({{S32}, {S64}});
1600   }
1601 
1602   SextInReg
1603     .scalarize(0)
1604     .clampScalar(0, S32, S64)
1605     .lower();
1606 
1607   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608     .scalarize(0)
1609     .lower();
1610 
  // TODO: Only try to form v2s16 with legal packed instructions.
1612   getActionDefinitionsBuilder(G_FSHR)
1613     .legalFor({{S32, S32}})
1614     .lowerFor({{V2S16, V2S16}})
1615     .clampMaxNumElementsStrict(0, S16, 2)
1616     .scalarize(0)
1617     .lower();
1618 
1619   if (ST.hasVOP3PInsts()) {
1620     getActionDefinitionsBuilder(G_FSHL)
1621       .lowerFor({{V2S16, V2S16}})
1622       .clampMaxNumElementsStrict(0, S16, 2)
1623       .scalarize(0)
1624       .lower();
1625   } else {
1626     getActionDefinitionsBuilder(G_FSHL)
1627       .scalarize(0)
1628       .lower();
1629   }
1630 
1631   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632     .legalFor({S64});
1633 
1634   getActionDefinitionsBuilder(G_FENCE)
1635     .alwaysLegal();
1636 
1637   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638       .scalarize(0)
1639       .minScalar(0, S32)
1640       .lower();
1641 
1642   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643       .legalFor({{S32, S32}, {S64, S32}})
1644       .clampScalar(1, S32, S32)
1645       .clampScalar(0, S32, S64)
1646       .widenScalarToNextPow2(0)
1647       .scalarize(0);
1648 
1649   getActionDefinitionsBuilder({
1650       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651       G_FCOPYSIGN,
1652 
1653       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654       G_ATOMICRMW_NAND,
1655       G_ATOMICRMW_FSUB,
1656       G_READ_REGISTER,
1657       G_WRITE_REGISTER,
1658 
1659       G_SADDO, G_SSUBO,
1660 
      // TODO: Implement
1662       G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   case TargetOpcode::G_CTLZ:
1747   case TargetOpcode::G_CTTZ:
1748     return legalizeCTLZ_CTTZ(MI, MRI, B);
1749   default:
1750     return false;
1751   }
1752 
1753   llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757   unsigned AS,
1758   MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762   const LLT S32 = LLT::scalar(32);
1763 
1764   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766   if (ST.hasApertureRegs()) {
1767     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768     // getreg.
1769     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775     unsigned Encoding =
1776         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782     B.buildInstr(AMDGPU::S_GETREG_B32)
1783       .addDef(GetReg)
1784       .addImm(Encoding);
1785     MRI.setType(GetReg, S32);
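    // The hardware register field holds the upper bits of the 32-bit aperture
    // base, so shifting the extracted value left by the field width
    // (WidthM1 + 1) reconstructs the aperture base used as the high half of
    // the flat pointer in legalizeAddrSpaceCast.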
1786 
1787     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789   }
1790 
1791   Register QueuePtr = MRI.createGenericVirtualRegister(
1792     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795     return Register();
1796 
1797   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798   // private_segment_aperture_base_hi.
1799   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
1801   // TODO: can we be smarter about machine pointer info?
1802   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803   MachineMemOperand *MMO = MF.getMachineMemOperand(
1804       PtrInfo,
1805       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806           MachineMemOperand::MOInvariant,
1807       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809   Register LoadAddr;
1810 
1811   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   MachineFunction &MF = B.getMF();
1819 
1820   const LLT S32 = LLT::scalar(32);
1821   Register Dst = MI.getOperand(0).getReg();
1822   Register Src = MI.getOperand(1).getReg();
1823 
1824   LLT DstTy = MRI.getType(Dst);
1825   LLT SrcTy = MRI.getType(Src);
1826   unsigned DestAS = DstTy.getAddressSpace();
1827   unsigned SrcAS = SrcTy.getAddressSpace();
1828 
1829   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1830   // vector element.
1831   assert(!DstTy.isVector());
1832 
1833   const AMDGPUTargetMachine &TM
1834     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1835 
1836   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1837     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1838     return true;
1839   }
1840 
1841   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1842     // Truncate.
1843     B.buildExtract(Dst, Src, 0);
1844     MI.eraseFromParent();
1845     return true;
1846   }
1847 
1848   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1849     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1850     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1851 
1852     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1853     // another. Merge operands are required to be the same type, but creating an
1854     // extra ptrtoint would be kind of pointless.
1855     auto HighAddr = B.buildConstant(
1856       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1857     B.buildMerge(Dst, {Src, HighAddr});
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1863     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1864            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1865     unsigned NullVal = TM.getNullPointerValue(DestAS);
1866 
1867     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1868     auto FlatNull = B.buildConstant(SrcTy, 0);
1869 
    // Extract the low 32 bits of the pointer.
1871     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1872 
1873     auto CmpRes =
1874         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1875     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1876 
1877     MI.eraseFromParent();
1878     return true;
1879   }
1880 
1881   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1882     return false;
1883 
1884   if (!ST.hasFlatAddressSpace())
1885     return false;
1886 
1887   auto SegmentNull =
1888       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1889   auto FlatNull =
1890       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1891 
1892   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1893   if (!ApertureReg.isValid())
1894     return false;
1895 
1896   auto CmpRes =
1897       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1898 
1899   // Coerce the type of the low half of the result so we can use merge_values.
1900   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1901 
1902   // TODO: Should we allow mismatched types but matching sizes in merges to
1903   // avoid the ptrtoint?
1904   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1905   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1906 
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrint(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914   Register Src = MI.getOperand(1).getReg();
1915   LLT Ty = MRI.getType(Src);
1916   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1917 
1918   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1919   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
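  // C1 is 2^52, the f64 magnitude at which the ulp reaches 1.0, and C2 is the
  // largest f64 strictly below 2^52. Adding and then subtracting
  // copysign(2^52, src) rounds away the fraction (e.g. 1.3 + 2^52 - 2^52 ==
  // 1.0), while inputs with |src| > C2 are already integral and are passed
  // through unchanged by the final select.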
1920 
1921   auto C1 = B.buildFConstant(Ty, C1Val);
1922   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1923 
1924   // TODO: Should this propagate fast-math-flags?
1925   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1926   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1927 
1928   auto C2 = B.buildFConstant(Ty, C2Val);
1929   auto Fabs = B.buildFAbs(Ty, Src);
1930 
1931   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1932   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1933   MI.eraseFromParent();
1934   return true;
1935 }
1936 
1937 bool AMDGPULegalizerInfo::legalizeFceil(
1938   MachineInstr &MI, MachineRegisterInfo &MRI,
1939   MachineIRBuilder &B) const {
1940 
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S64 = LLT::scalar(64);
1943 
1944   Register Src = MI.getOperand(1).getReg();
1945   assert(MRI.getType(Src) == S64);
1946 
1947   // result = trunc(src)
1948   // if (src > 0.0 && src != result)
1949   //   result += 1.0
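  //
  // e.g. ceil(1.25): trunc = 1.0, 1.25 > 0.0 and 1.25 != 1.0, so add 1.0 -> 2.0
  //      ceil(-1.25): trunc = -1.0, the > 0.0 check fails, so add 0.0 -> -1.0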
1950 
1951   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1952 
1953   const auto Zero = B.buildFConstant(S64, 0.0);
1954   const auto One = B.buildFConstant(S64, 1.0);
1955   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1956   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1957   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1958   auto Add = B.buildSelect(S64, And, One, Zero);
1959 
1960   // TODO: Should this propagate fast-math-flags?
1961   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1963 }
1964 
1965 bool AMDGPULegalizerInfo::legalizeFrem(
1966   MachineInstr &MI, MachineRegisterInfo &MRI,
1967   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1980 }
1981 
1982 static MachineInstrBuilder extractF64Exponent(Register Hi,
1983                                               MachineIRBuilder &B) {
1984   const unsigned FractBits = 52;
1985   const unsigned ExpBits = 11;
1986   LLT S32 = LLT::scalar(32);
1987 
1988   auto Const0 = B.buildConstant(S32, FractBits - 32);
1989   auto Const1 = B.buildConstant(S32, ExpBits);
1990 
1991   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1992     .addUse(Hi)
1993     .addUse(Const0.getReg(0))
1994     .addUse(Const1.getReg(0));
1995 
1996   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002   const LLT S1 = LLT::scalar(1);
2003   const LLT S32 = LLT::scalar(32);
2004   const LLT S64 = LLT::scalar(64);
2005 
2006   Register Src = MI.getOperand(1).getReg();
2007   assert(MRI.getType(Src) == S64);
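  // The unbiased exponent e says how many fraction bits are integral: for
  // 0 <= e <= 51, clearing the low 52 - e fraction bits
  // (Src & ~(FractMask >> e)) truncates toward zero; e < 0 means |Src| < 1.0
  // and the result is +/-0.0; e > 51 means Src is already an integer.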
2008 
2009   // TODO: Should this use extract since the low half is unused?
2010   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2011   Register Hi = Unmerge.getReg(1);
2012 
2013   // Extract the upper half, since this is where we will find the sign and
2014   // exponent.
2015   auto Exp = extractF64Exponent(Hi, B);
2016 
2017   const unsigned FractBits = 52;
2018 
2019   // Extract the sign bit.
2020   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2021   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2022 
2023   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2024 
2025   const auto Zero32 = B.buildConstant(S32, 0);
2026 
2027   // Extend back to 64-bits.
2028   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2029 
2030   auto Shr = B.buildAShr(S64, FractMask, Exp);
2031   auto Not = B.buildNot(S64, Shr);
2032   auto Tmp0 = B.buildAnd(S64, Src, Not);
2033   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2034 
2035   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2036   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2037 
2038   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2039   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2040   MI.eraseFromParent();
2041   return true;
2042 }
2043 
2044 bool AMDGPULegalizerInfo::legalizeITOFP(
2045   MachineInstr &MI, MachineRegisterInfo &MRI,
2046   MachineIRBuilder &B, bool Signed) const {
2047 
2048   Register Dst = MI.getOperand(0).getReg();
2049   Register Src = MI.getOperand(1).getReg();
2050 
2051   const LLT S64 = LLT::scalar(64);
2052   const LLT S32 = LLT::scalar(32);
2053 
2054   assert(MRI.getType(Src) == S64);
2055 
2056   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2057   auto ThirtyTwo = B.buildConstant(S32, 32);
2058 
2059   if (MRI.getType(Dst) == S64) {
2060     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2061                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2062 
2063     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2064     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2065                      .addUse(CvtHi.getReg(0))
2066                      .addUse(ThirtyTwo.getReg(0));
2067 
2068     // TODO: Should this propagate fast-math-flags?
2069     B.buildFAdd(Dst, LdExp, CvtLo);
2070     MI.eraseFromParent();
2071     return true;
2072   }
2073 
2074   assert(MRI.getType(Dst) == S32);
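  // Shift the s64 source up so its leading bits occupy the high half, convert
  // that high half to f32, then undo the shift with ldexp(fval, 32 - shamt).
  // The low half is folded in as a sticky bit (umin(1, lo)) so rounding still
  // accounts for any bits shifted out of the converted half.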
2075 
2076   auto One = B.buildConstant(S32, 1);
2077 
2078   MachineInstrBuilder ShAmt;
2079   if (Signed) {
2080     auto ThirtyOne = B.buildConstant(S32, 31);
2081     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2082     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2083     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2084     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2085                                /*HasSideEffects=*/false)
2086                   .addUse(Unmerge.getReg(1));
2087     auto LS2 = B.buildSub(S32, LS, One);
2088     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2089   } else
2090     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2091   auto Norm = B.buildShl(S64, Src, ShAmt);
2092   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2093   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2094   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2095   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2096   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2097   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2098                    /*HasSideEffects=*/false)
2099       .addUse(FVal.getReg(0))
2100       .addUse(Scale.getReg(0));
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 // TODO: Copied from DAG implementation. Verify logic and document how this
2106 // actually works.
2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2108                                         MachineRegisterInfo &MRI,
2109                                         MachineIRBuilder &B,
2110                                         bool Signed) const {
2111 
2112   Register Dst = MI.getOperand(0).getReg();
2113   Register Src = MI.getOperand(1).getReg();
2114 
2115   const LLT S64 = LLT::scalar(64);
2116   const LLT S32 = LLT::scalar(32);
2117 
2118   const LLT SrcLT = MRI.getType(Src);
2119   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2120 
2121   unsigned Flags = MI.getFlags();
2122 
2123   // The basic idea of converting a floating point number into a pair of 32-bit
2124   // integers is illustrated as follows:
2125   //
2126   //     tf := trunc(val);
2127   //    hif := floor(tf * 2^-32);
2128   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2129   //     hi := fptoi(hif);
2130   //     lo := fptoi(lof);
2131   //
2132   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2133   MachineInstrBuilder Sign;
2134   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2140     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2141     Trunc = B.buildFAbs(S32, Trunc, Flags);
2142   }
2143   MachineInstrBuilder K0, K1;
2144   if (SrcLT == S64) {
2145     K0 = B.buildFConstant(S64,
2146                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2147     K1 = B.buildFConstant(S64,
2148                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2149   } else {
2150     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2151     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2152   }
2153 
2154   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2155   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2156   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2157 
2158   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2159                                      : B.buildFPTOUI(S32, FloorMul);
2160   auto Lo = B.buildFPTOUI(S32, Fma);
2161 
2162   if (Signed && SrcLT == S32) {
2163     // Flip the result based on the signedness, which is either all 0s or 1s.
2164     Sign = B.buildMerge(S64, {Sign, Sign});
2165     // r := xor({lo, hi}, sign) - sign;
2166     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2167   } else
2168     B.buildMerge(Dst, {Lo, Hi});
2169   MI.eraseFromParent();
2170 
2171   return true;
2172 }
2173 
2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2175                                                MachineInstr &MI) const {
2176   MachineFunction &MF = Helper.MIRBuilder.getMF();
2177   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2178 
2179   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2180                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2181 
2182   // With ieee_mode disabled, the instructions have the correct behavior
2183   // already for G_FMINNUM/G_FMAXNUM
2184   if (!MFI->getMode().IEEE)
2185     return !IsIEEEOp;
2186 
2187   if (IsIEEEOp)
2188     return true;
2189 
2190   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2194   MachineInstr &MI, MachineRegisterInfo &MRI,
2195   MachineIRBuilder &B) const {
2196   // TODO: Should move some of this into LegalizerHelper.
2197 
2198   // TODO: Promote dynamic indexing of s16 to s32
2199 
2200   // FIXME: Artifact combiner probably should have replaced the truncated
2201   // constant before this, so we shouldn't need
2202   // getIConstantVRegValWithLookThrough.
2203   Optional<ValueAndVReg> MaybeIdxVal =
2204       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2205   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2206     return true;
2207   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2208 
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Vec = MI.getOperand(1).getReg();
2211 
2212   LLT VecTy = MRI.getType(Vec);
2213   LLT EltTy = VecTy.getElementType();
2214   assert(EltTy == MRI.getType(Dst));
2215 
2216   if (IdxVal < VecTy.getNumElements())
2217     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2218   else
2219     B.buildUndef(Dst);
2220 
2221   MI.eraseFromParent();
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2226   MachineInstr &MI, MachineRegisterInfo &MRI,
2227   MachineIRBuilder &B) const {
2228   // TODO: Should move some of this into LegalizerHelper.
2229 
2230   // TODO: Promote dynamic indexing of s16 to s32
2231 
2232   // FIXME: Artifact combiner probably should have replaced the truncated
2233   // constant before this, so we shouldn't need
2234   // getIConstantVRegValWithLookThrough.
2235   Optional<ValueAndVReg> MaybeIdxVal =
2236       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2237   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2238     return true;
2239 
2240   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2241   Register Dst = MI.getOperand(0).getReg();
2242   Register Vec = MI.getOperand(1).getReg();
2243   Register Ins = MI.getOperand(2).getReg();
2244 
2245   LLT VecTy = MRI.getType(Vec);
2246   LLT EltTy = VecTy.getElementType();
2247   assert(EltTy == MRI.getType(Ins));
2248 
2249   if (IdxVal < VecTy.getNumElements())
2250     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2251   else
2252     B.buildUndef(Dst);
2253 
2254   MI.eraseFromParent();
2255   return true;
2256 }
2257 
2258 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2259   MachineInstr &MI, MachineRegisterInfo &MRI,
2260   MachineIRBuilder &B) const {
2261   const LLT V2S16 = LLT::fixed_vector(2, 16);
2262 
2263   Register Dst = MI.getOperand(0).getReg();
2264   Register Src0 = MI.getOperand(1).getReg();
2265   LLT DstTy = MRI.getType(Dst);
2266   LLT SrcTy = MRI.getType(Src0);
2267 
2268   if (SrcTy == V2S16 && DstTy == V2S16 &&
2269       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2270     return true;
2271 
2272   MachineIRBuilder HelperBuilder(MI);
2273   GISelObserverWrapper DummyObserver;
2274   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2275   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2276 }
2277 
2278 bool AMDGPULegalizerInfo::legalizeSinCos(
2279   MachineInstr &MI, MachineRegisterInfo &MRI,
2280   MachineIRBuilder &B) const {
2281 
2282   Register DstReg = MI.getOperand(0).getReg();
2283   Register SrcReg = MI.getOperand(1).getReg();
2284   LLT Ty = MRI.getType(DstReg);
2285   unsigned Flags = MI.getFlags();
2286 
2287   Register TrigVal;
2288   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2289   if (ST.hasTrigReducedRange()) {
2290     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2291     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2292       .addUse(MulVal.getReg(0))
2293       .setMIFlags(Flags).getReg(0);
2294   } else
2295     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2296 
2297   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2298     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2299   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2300     .addUse(TrigVal)
2301     .setMIFlags(Flags);
2302   MI.eraseFromParent();
2303   return true;
2304 }
2305 
2306 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2307                                                   MachineIRBuilder &B,
2308                                                   const GlobalValue *GV,
2309                                                   int64_t Offset,
2310                                                   unsigned GAFlags) const {
2311   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2312   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2313   // to the following code sequence:
2314   //
2315   // For constant address space:
2316   //   s_getpc_b64 s[0:1]
2317   //   s_add_u32 s0, s0, $symbol
2318   //   s_addc_u32 s1, s1, 0
2319   //
2320   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2321   //   a fixup or relocation is emitted to replace $symbol with a literal
2322   //   constant, which is a pc-relative offset from the encoding of the $symbol
2323   //   operand to the global variable.
2324   //
2325   // For global address space:
2326   //   s_getpc_b64 s[0:1]
2327   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2328   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2329   //
2330   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2331   //   fixups or relocations are emitted to replace $symbol@*@lo and
2332   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2333   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2334   //   operand to the global variable.
2335   //
2336   // What we want here is an offset from the value returned by s_getpc
2337   // (which is the address of the s_add_u32 instruction) to the global
2338   // variable, but since the encoding of $symbol starts 4 bytes after the start
2339   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2340   // small. This requires us to add 4 to the global variable offset in order to
2341   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2342   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2343   // instruction.
2344 
2345   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2346 
2347   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2348     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2349 
2350   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2351     .addDef(PCReg);
2352 
2353   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2354   if (GAFlags == SIInstrInfo::MO_NONE)
2355     MIB.addImm(0);
2356   else
2357     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2358 
2359   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2360 
2361   if (PtrTy.getSizeInBits() == 32)
2362     B.buildExtract(DstReg, PCReg, 0);
2363   return true;
2364  }
2365 
2366 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2367   MachineInstr &MI, MachineRegisterInfo &MRI,
2368   MachineIRBuilder &B) const {
2369   Register DstReg = MI.getOperand(0).getReg();
2370   LLT Ty = MRI.getType(DstReg);
2371   unsigned AS = Ty.getAddressSpace();
2372 
2373   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2374   MachineFunction &MF = B.getMF();
2375   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2376 
2377   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2378     if (!MFI->isModuleEntryFunction() &&
2379         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2380       const Function &Fn = MF.getFunction();
2381       DiagnosticInfoUnsupported BadLDSDecl(
2382         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2383         DS_Warning);
2384       Fn.getContext().diagnose(BadLDSDecl);
2385 
2386       // We currently don't have a way to correctly allocate LDS objects that
2387       // aren't directly associated with a kernel. We do force inlining of
2388       // functions that use local objects. However, if these dead functions are
2389       // not eliminated, we don't want a compile time error. Just emit a warning
2390       // and a trap, since there should be no callable path here.
2391       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2392       B.buildUndef(DstReg);
2393       MI.eraseFromParent();
2394       return true;
2395     }
2396 
2397     // TODO: We could emit code to handle the initialization somewhere.
2398     // We ignore the initializer for now and legalize it to allow selection.
    // Any initializer will be diagnosed as an error during assembly emission
    // anyway.
2400     const SITargetLowering *TLI = ST.getTargetLowering();
2401     if (!TLI->shouldUseLDSConstAddress(GV)) {
2402       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2404     }
2405 
2406     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2407       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
      // zero-sized type in other languages, to declare dynamic shared memory
      // whose size is not known at compile time. These arrays are allocated
      // by the runtime and placed directly after the statically allocated
      // ones, so they all share the same offset.
2413       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2414         // Adjust alignment for that dynamic shared memory array.
2415         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2416         LLT S32 = LLT::scalar(32);
2417         auto Sz =
2418             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2419         B.buildIntToPtr(DstReg, Sz);
2420         MI.eraseFromParent();
2421         return true;
2422       }
2423     }
2424 
2425     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2426                                                    *cast<GlobalVariable>(GV)));
2427     MI.eraseFromParent();
2428     return true;
2429   }
2430 
2431   const SITargetLowering *TLI = ST.getTargetLowering();
2432 
2433   if (TLI->shouldEmitFixup(GV)) {
2434     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2435     MI.eraseFromParent();
2436     return true;
2437   }
2438 
2439   if (TLI->shouldEmitPCReloc(GV)) {
2440     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2441     MI.eraseFromParent();
2442     return true;
2443   }
2444 
2445   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2446   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2447 
2448   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2449   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2450       MachinePointerInfo::getGOT(MF),
2451       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2452           MachineMemOperand::MOInvariant,
2453       LoadTy, Align(8));
2454 
2455   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2456 
2457   if (Ty.getSizeInBits() == 32) {
2458     // Truncate if this is a 32-bit constant address.
2459     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2460     B.buildExtract(DstReg, Load, 0);
2461   } else
2462     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2463 
2464   MI.eraseFromParent();
2465   return true;
2466 }
2467 
2468 static LLT widenToNextPowerOf2(LLT Ty) {
2469   if (Ty.isVector())
2470     return Ty.changeElementCount(
2471         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2472   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2473 }
2474 
2475 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2476                                        MachineInstr &MI) const {
2477   MachineIRBuilder &B = Helper.MIRBuilder;
2478   MachineRegisterInfo &MRI = *B.getMRI();
2479   GISelChangeObserver &Observer = Helper.Observer;
2480 
2481   Register PtrReg = MI.getOperand(1).getReg();
2482   LLT PtrTy = MRI.getType(PtrReg);
2483   unsigned AddrSpace = PtrTy.getAddressSpace();
2484 
2485   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2486     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2487     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2488     Observer.changingInstr(MI);
2489     MI.getOperand(1).setReg(Cast.getReg(0));
2490     Observer.changedInstr(MI);
2491     return true;
2492   }
2493 
2494   if (MI.getOpcode() != AMDGPU::G_LOAD)
2495     return false;
2496 
2497   Register ValReg = MI.getOperand(0).getReg();
2498   LLT ValTy = MRI.getType(ValReg);
2499 
2500   MachineMemOperand *MMO = *MI.memoperands_begin();
2501   const unsigned ValSize = ValTy.getSizeInBits();
2502   const LLT MemTy = MMO->getMemoryType();
2503   const Align MemAlign = MMO->getAlign();
2504   const unsigned MemSize = MemTy.getSizeInBits();
2505   const unsigned AlignInBits = 8 * MemAlign.value();
2506 
2507   // Widen non-power-of-2 loads to the alignment if needed
2508   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2509     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2510 
2511     // This was already the correct extending load result type, so just adjust
2512     // the memory type.
2513     if (WideMemSize == ValSize) {
2514       MachineFunction &MF = B.getMF();
2515 
2516       MachineMemOperand *WideMMO =
2517           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2518       Observer.changingInstr(MI);
2519       MI.setMemRefs(MF, {WideMMO});
2520       Observer.changedInstr(MI);
2521       return true;
2522     }
2523 
    // Don't bother handling an edge case that should probably never be
    // produced.
2525     if (ValSize > WideMemSize)
2526       return false;
2527 
2528     LLT WideTy = widenToNextPowerOf2(ValTy);
2529 
2530     Register WideLoad;
2531     if (!WideTy.isVector()) {
2532       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2533       B.buildTrunc(ValReg, WideLoad).getReg(0);
2534     } else {
2535       // Extract the subvector.
2536 
2537       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2539         // (e.g. <3 x s32> -> <4 x s32>)
2540         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2541         B.buildExtract(ValReg, WideLoad, 0);
2542       } else {
2543         // For cases where the widened type isn't a nice register value, unmerge
2544         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2545         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2546         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2547       }
2548     }
2549 
2550     MI.eraseFromParent();
2551     return true;
2552   }
2553 
2554   return false;
2555 }
2556 
2557 bool AMDGPULegalizerInfo::legalizeFMad(
2558   MachineInstr &MI, MachineRegisterInfo &MRI,
2559   MachineIRBuilder &B) const {
2560   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2561   assert(Ty.isScalar());
2562 
2563   MachineFunction &MF = B.getMF();
2564   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2565 
2566   // TODO: Always legal with future ftz flag.
2567   // FIXME: Do we need just output?
2568   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2569     return true;
2570   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2571     return true;
2572 
2573   MachineIRBuilder HelperBuilder(MI);
2574   GISelObserverWrapper DummyObserver;
2575   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2576   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2577 }
2578 
2579 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2580   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2581   Register DstReg = MI.getOperand(0).getReg();
2582   Register PtrReg = MI.getOperand(1).getReg();
2583   Register CmpVal = MI.getOperand(2).getReg();
2584   Register NewVal = MI.getOperand(3).getReg();
2585 
2586   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2587          "this should not have been custom lowered");
2588 
2589   LLT ValTy = MRI.getType(CmpVal);
2590   LLT VecTy = LLT::fixed_vector(2, ValTy);
2591 
2592   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2593 
2594   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2595     .addDef(DstReg)
2596     .addUse(PtrReg)
2597     .addUse(PackedVal)
2598     .setMemRefs(MI.memoperands());
2599 
2600   MI.eraseFromParent();
2601   return true;
2602 }
2603 
2604 bool AMDGPULegalizerInfo::legalizeFlog(
2605   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2606   Register Dst = MI.getOperand(0).getReg();
2607   Register Src = MI.getOperand(1).getReg();
2608   LLT Ty = B.getMRI()->getType(Dst);
2609   unsigned Flags = MI.getFlags();
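  // log_b(x) = log2(x) * log_b(2); Log2BaseInverted is log_b(2), i.e. ln(2)
  // for G_FLOG and ln(2)/ln(10) for G_FLOG10.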
2610 
2611   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2612   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2613 
2614   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2615   MI.eraseFromParent();
2616   return true;
2617 }
2618 
2619 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2620                                        MachineIRBuilder &B) const {
2621   Register Dst = MI.getOperand(0).getReg();
2622   Register Src = MI.getOperand(1).getReg();
2623   unsigned Flags = MI.getFlags();
2624   LLT Ty = B.getMRI()->getType(Dst);
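  // exp(x) = exp2(x * log2(e)), so scale by log2(e) and emit G_FEXP2.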
2625 
2626   auto K = B.buildFConstant(Ty, numbers::log2e);
2627   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2628   B.buildFExp2(Dst, Mul, Flags);
2629   MI.eraseFromParent();
2630   return true;
2631 }
2632 
2633 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2634                                        MachineIRBuilder &B) const {
2635   Register Dst = MI.getOperand(0).getReg();
2636   Register Src0 = MI.getOperand(1).getReg();
2637   Register Src1 = MI.getOperand(2).getReg();
2638   unsigned Flags = MI.getFlags();
2639   LLT Ty = B.getMRI()->getType(Dst);
2640   const LLT S16 = LLT::scalar(16);
2641   const LLT S32 = LLT::scalar(32);
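  // pow(x, y) = exp2(y * log2(x)). The multiply is fmul_legacy (0 * anything,
  // including inf/nan, is 0) so that cases like pow(1.0, inf) still fold to
  // exp2(0.0) = 1.0.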
2642 
2643   if (Ty == S32) {
2644     auto Log = B.buildFLog2(S32, Src0, Flags);
2645     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2646       .addUse(Log.getReg(0))
2647       .addUse(Src1)
2648       .setMIFlags(Flags);
2649     B.buildFExp2(Dst, Mul, Flags);
2650   } else if (Ty == S16) {
2651     // There's no f16 fmul_legacy, so we need to convert for it.
2652     auto Log = B.buildFLog2(S16, Src0, Flags);
2653     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2654     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2655     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2656       .addUse(Ext0.getReg(0))
2657       .addUse(Ext1.getReg(0))
2658       .setMIFlags(Flags);
2659 
2660     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2661   } else
2662     return false;
2663 
2664   MI.eraseFromParent();
2665   return true;
2666 }
2667 
2668 // Find a source register, ignoring any possible source modifiers.
2669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2670   Register ModSrc = OrigSrc;
2671   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2672     ModSrc = SrcFNeg->getOperand(1).getReg();
2673     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2674       ModSrc = SrcFAbs->getOperand(1).getReg();
2675   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2676     ModSrc = SrcFAbs->getOperand(1).getReg();
2677   return ModSrc;
2678 }
2679 
2680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2681                                          MachineRegisterInfo &MRI,
2682                                          MachineIRBuilder &B) const {
2683 
2684   const LLT S1 = LLT::scalar(1);
2685   const LLT S64 = LLT::scalar(64);
2686   Register Dst = MI.getOperand(0).getReg();
2687   Register OrigSrc = MI.getOperand(1).getReg();
2688   unsigned Flags = MI.getFlags();
2689   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2690          "this should not have been custom lowered");
2691 
2692   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2693   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2694   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2695   // V_FRACT bug is:
2696   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2697   //
2698   // Convert floor(x) to (x - fract(x))
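  //
  // e.g. floor(-0.5): fract(-0.5) = 0.5, so the result is -0.5 - 0.5 = -1.0.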
2699 
2700   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2701     .addUse(OrigSrc)
2702     .setMIFlags(Flags);
2703 
2704   // Give source modifier matching some assistance before obscuring a foldable
2705   // pattern.
2706 
2707   // TODO: We can avoid the neg on the fract? The input sign to fract
2708   // shouldn't matter?
2709   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2710 
2711   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2712 
2713   Register Min = MRI.createGenericVirtualRegister(S64);
2714 
2715   // We don't need to concern ourselves with the snan handling difference, so
2716   // use the one which will directly select.
2717   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2718   if (MFI->getMode().IEEE)
2719     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2720   else
2721     B.buildFMinNum(Min, Fract, Const, Flags);
2722 
2723   Register CorrectedFract = Min;
2724   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2726     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2727   }
2728 
2729   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2730   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2731 
2732   MI.eraseFromParent();
2733   return true;
2734 }
2735 
2736 // Turn an illegal packed v2s16 build vector into bit operations.
2737 // TODO: This should probably be a bitcast action in LegalizerHelper.
2738 bool AMDGPULegalizerInfo::legalizeBuildVector(
2739   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2740   Register Dst = MI.getOperand(0).getReg();
2741   const LLT S32 = LLT::scalar(32);
2742   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2743 
2744   Register Src0 = MI.getOperand(1).getReg();
2745   Register Src1 = MI.getOperand(2).getReg();
2746   assert(MRI.getType(Src0) == LLT::scalar(16));
2747 
2748   auto Merge = B.buildMerge(S32, {Src0, Src1});
2749   B.buildBitcast(Dst, Merge);
2750 
2751   MI.eraseFromParent();
2752   return true;
2753 }
2754 
2755 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2756 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2757 // case with a single min instruction instead of a compare+select.
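// For a zero input the ffbh/ffbl pseudos return ~0u, so e.g. s32 ctlz(0)
// becomes umin(0xffffffff, 32) = 32, the result G_CTLZ requires.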
2758 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2759                                             MachineRegisterInfo &MRI,
2760                                             MachineIRBuilder &B) const {
2761   Register Dst = MI.getOperand(0).getReg();
2762   Register Src = MI.getOperand(1).getReg();
2763   LLT DstTy = MRI.getType(Dst);
2764   LLT SrcTy = MRI.getType(Src);
2765 
2766   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2767                         ? AMDGPU::G_AMDGPU_FFBH_U32
2768                         : AMDGPU::G_AMDGPU_FFBL_B32;
2769   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2770   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2771 
2772   MI.eraseFromParent();
2773   return true;
2774 }
2775 
2776 // Check that this is a G_XOR x, -1
2777 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2778   if (MI.getOpcode() != TargetOpcode::G_XOR)
2779     return false;
2780   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2781   return ConstVal && *ConstVal == -1;
2782 }
2783 
2784 // Return the use branch instruction, otherwise null if the usage is invalid.
2785 static MachineInstr *
2786 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2787                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2788   Register CondDef = MI.getOperand(0).getReg();
2789   if (!MRI.hasOneNonDBGUse(CondDef))
2790     return nullptr;
2791 
2792   MachineBasicBlock *Parent = MI.getParent();
2793   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2794 
2795   if (isNot(MRI, *UseMI)) {
2796     Register NegatedCond = UseMI->getOperand(0).getReg();
2797     if (!MRI.hasOneNonDBGUse(NegatedCond))
2798       return nullptr;
2799 
2800     // We're deleting the def of this value, so we need to remove it.
2801     eraseInstr(*UseMI, MRI);
2802 
2803     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2804     Negated = true;
2805   }
2806 
2807   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2808     return nullptr;
2809 
2810   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2811   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2812   if (Next == Parent->end()) {
2813     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2814     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2815       return nullptr;
2816     UncondBrTarget = &*NextMBB;
2817   } else {
2818     if (Next->getOpcode() != AMDGPU::G_BR)
2819       return nullptr;
2820     Br = &*Next;
2821     UncondBrTarget = Br->getOperand(0).getMBB();
2822   }
2823 
2824   return UseMI;
2825 }
2826 
2827 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2828                                          const ArgDescriptor *Arg,
2829                                          const TargetRegisterClass *ArgRC,
2830                                          LLT ArgTy) const {
2831   MCRegister SrcReg = Arg->getRegister();
2832   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2833   assert(DstReg.isVirtual() && "Virtual register expected");
2834 
2835   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2836                                              ArgTy);
2837   if (Arg->isMasked()) {
2838     // TODO: Should we try to emit this once in the entry block?
2839     const LLT S32 = LLT::scalar(32);
2840     const unsigned Mask = Arg->getMask();
2841     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2842 
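     // Extract the field as (LiveIn >> Shift) & (Mask >> Shift); e.g. a packed
     // workitem ID in bits [19:10] is shifted right by 10 and masked with 0x3ff.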
2843     Register AndMaskSrc = LiveIn;
2844 
2845     if (Shift != 0) {
2846       auto ShiftAmt = B.buildConstant(S32, Shift);
2847       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2848     }
2849 
2850     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2851   } else {
2852     B.buildCopy(DstReg, LiveIn);
2853   }
2854 
2855   return true;
2856 }
2857 
2858 bool AMDGPULegalizerInfo::loadInputValue(
2859     Register DstReg, MachineIRBuilder &B,
2860     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2861   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2862   const ArgDescriptor *Arg;
2863   const TargetRegisterClass *ArgRC;
2864   LLT ArgTy;
2865   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2866 
2867   if (!Arg) {
2868     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2869       // The intrinsic may appear when we have a zero-sized kernarg segment, in
2870       // which case the pointer argument may be missing and we use null.
2871       B.buildConstant(DstReg, 0);
2872       return true;
2873     }
2874 
2875     // It's undefined behavior if a function marked with the amdgpu-no-*
2876     // attributes uses the corresponding intrinsic.
2877     B.buildUndef(DstReg);
2878     return true;
2879   }
2880 
2881   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2882     return false; // TODO: Handle these
2883   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2884 }
2885 
2886 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2887     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2888     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2889   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2890     return false;
2891 
2892   MI.eraseFromParent();
2893   return true;
2894 }
2895 
2896 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2897                                        MachineRegisterInfo &MRI,
2898                                        MachineIRBuilder &B) const {
2899   Register Dst = MI.getOperand(0).getReg();
2900   LLT DstTy = MRI.getType(Dst);
2901   LLT S16 = LLT::scalar(16);
2902   LLT S32 = LLT::scalar(32);
2903   LLT S64 = LLT::scalar(64);
2904 
2905   if (DstTy == S16)
2906     return legalizeFDIV16(MI, MRI, B);
2907   if (DstTy == S32)
2908     return legalizeFDIV32(MI, MRI, B);
2909   if (DstTy == S64)
2910     return legalizeFDIV64(MI, MRI, B);
2911 
2912   return false;
2913 }
2914 
2915 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2916                                                         Register DstDivReg,
2917                                                         Register DstRemReg,
2918                                                         Register X,
2919                                                         Register Y) const {
2920   const LLT S1 = LLT::scalar(1);
2921   const LLT S32 = LLT::scalar(32);
2922 
2923   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2924   // algorithm used here.
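     // Roughly (a sketch, not the exact instruction order):
     //   z  = fptoui(rcp(uitofp(y)) * 0x4f7ffffe)   // ~2**32 / y
     //   z += umulh(z, z * -y)                       // one Newton-Raphson step
     //   q  = umulh(x, z); r = x - q * y             // initial estimates
     //   twice: if (r >= y) { q += 1; r -= y; }      // refinement via selects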
2925 
2926   // Initial estimate of inv(y).
2927   auto FloatY = B.buildUITOFP(S32, Y);
2928   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2929   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2930   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2931   auto Z = B.buildFPTOUI(S32, ScaledY);
2932 
2933   // One round of UNR (unsigned Newton-Raphson refinement).
2934   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2935   auto NegYZ = B.buildMul(S32, NegY, Z);
2936   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2937 
2938   // Quotient/remainder estimate.
2939   auto Q = B.buildUMulH(S32, X, Z);
2940   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2941 
2942   // First quotient/remainder refinement.
2943   auto One = B.buildConstant(S32, 1);
2944   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2945   if (DstDivReg)
2946     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2947   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2948 
2949   // Second quotient/remainder refinement.
2950   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2951   if (DstDivReg)
2952     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2953 
2954   if (DstRemReg)
2955     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2956 }
2957 
2958 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2959 //
2960 // Return lo, hi of result
2961 //
2962 // %cvt.lo = G_UITOFP Val.lo
2963 // %cvt.hi = G_UITOFP Val.hi
2964 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2965 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2966 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2967 // %mul2 = G_FMUL %mul1, 2**(-32)
2968 // %trunc = G_INTRINSIC_TRUNC %mul2
2969 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2970 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
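     // The {lo, hi} pair approximates 2**64 / Val and seeds the Newton-Raphson
     // refinement in legalizeUnsignedDIV_REM64Impl() below (0x5f7ffffc is a scale
     // of roughly 2**64).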
2971 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2972                                                        Register Val) {
2973   const LLT S32 = LLT::scalar(32);
2974   auto Unmerge = B.buildUnmerge(S32, Val);
2975 
2976   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2977   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2978 
2979   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2980                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2981 
2982   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2983   auto Mul1 =
2984       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2985 
2986   // 2**(-32)
2987   auto Mul2 =
2988       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2989   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2990 
2991   // -(2**32)
2992   auto Mad2 = B.buildFMAD(S32, Trunc,
2993                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2994 
2995   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2996   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2997 
2998   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2999 }
3000 
3001 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3002                                                         Register DstDivReg,
3003                                                         Register DstRemReg,
3004                                                         Register Numer,
3005                                                         Register Denom) const {
3006   const LLT S32 = LLT::scalar(32);
3007   const LLT S64 = LLT::scalar(64);
3008   const LLT S1 = LLT::scalar(1);
3009   Register RcpLo, RcpHi;
3010 
3011   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3012 
3013   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3014 
3015   auto Zero64 = B.buildConstant(S64, 0);
3016   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3017 
3018   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3019   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3020 
3021   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3022   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3023   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3024 
3025   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3026   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3027   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3028 
3029   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3030   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3031   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3032   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3033   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3034 
3035   auto Zero32 = B.buildConstant(S32, 0);
3036   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3037   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3038   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3039 
3040   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3041   Register NumerLo = UnmergeNumer.getReg(0);
3042   Register NumerHi = UnmergeNumer.getReg(1);
3043 
3044   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3045   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3046   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3047   Register Mul3_Lo = UnmergeMul3.getReg(0);
3048   Register Mul3_Hi = UnmergeMul3.getReg(1);
3049   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3050   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3051   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3052   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3053 
3054   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3055   Register DenomLo = UnmergeDenom.getReg(0);
3056   Register DenomHi = UnmergeDenom.getReg(1);
3057 
3058   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3059   auto C1 = B.buildSExt(S32, CmpHi);
3060 
3061   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3062   auto C2 = B.buildSExt(S32, CmpLo);
3063 
3064   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3065   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3066 
3067   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3068   // Currently the control flow is unconditional, and we use 4 selects after the
3069   // potential endif in place of the PHIs.
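     // C1..C6 below are i1 compare results sign-extended to all-zeros/all-ones s32
     // masks; the ICMP_NE-against-zero selects at the end stand in for the PHIs
     // that an if/endif structure would otherwise produce.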
3070 
3071   // if C3 != 0 ...
3072   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3073   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3074   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3075   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3076 
3077   auto One64 = B.buildConstant(S64, 1);
3078   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3079 
3080   auto C4 =
3081       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3082   auto C5 =
3083       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3084   auto C6 = B.buildSelect(
3085       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3086 
3087   // if (C6 != 0)
3088   auto Add4 = B.buildAdd(S64, Add3, One64);
3089   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3090 
3091   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3092   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3093   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3094 
3095   // endif C6
3096   // endif C3
3097 
3098   if (DstDivReg) {
3099     auto Sel1 = B.buildSelect(
3100         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3101     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3102                   Sel1, MulHi3);
3103   }
3104 
3105   if (DstRemReg) {
3106     auto Sel2 = B.buildSelect(
3107         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3108     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3109                   Sel2, Sub1);
3110   }
3111 }
3112 
3113 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3114                                                   MachineRegisterInfo &MRI,
3115                                                   MachineIRBuilder &B) const {
3116   Register DstDivReg, DstRemReg;
3117   switch (MI.getOpcode()) {
3118   default:
3119     llvm_unreachable("Unexpected opcode!");
3120   case AMDGPU::G_UDIV: {
3121     DstDivReg = MI.getOperand(0).getReg();
3122     break;
3123   }
3124   case AMDGPU::G_UREM: {
3125     DstRemReg = MI.getOperand(0).getReg();
3126     break;
3127   }
3128   case AMDGPU::G_UDIVREM: {
3129     DstDivReg = MI.getOperand(0).getReg();
3130     DstRemReg = MI.getOperand(1).getReg();
3131     break;
3132   }
3133   }
3134 
3135   const LLT S64 = LLT::scalar(64);
3136   const LLT S32 = LLT::scalar(32);
3137   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3138   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3139   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3140   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3141 
3142   if (Ty == S32)
3143     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3144   else if (Ty == S64)
3145     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3146   else
3147     return false;
3148 
3149   MI.eraseFromParent();
3150   return true;
3151 }
3152 
3153 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3154                                                 MachineRegisterInfo &MRI,
3155                                                 MachineIRBuilder &B) const {
3156   const LLT S64 = LLT::scalar(64);
3157   const LLT S32 = LLT::scalar(32);
3158 
3159   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3160   if (Ty != S32 && Ty != S64)
3161     return false;
3162 
3163   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3164   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3165   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3166 
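     // Compute absolute values as abs(v) = (v + sign) ^ sign, where
     // sign = v >> (bits - 1) is all-ones for negative v and zero otherwise.
     // The quotient/remainder signs are restored after the unsigned expansion.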
3167   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3168   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3169   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3170 
3171   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3172   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3173 
3174   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3175   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3176 
3177   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3178   switch (MI.getOpcode()) {
3179   default:
3180     llvm_unreachable("Unexpected opcode!");
3181   case AMDGPU::G_SDIV: {
3182     DstDivReg = MI.getOperand(0).getReg();
3183     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3184     break;
3185   }
3186   case AMDGPU::G_SREM: {
3187     DstRemReg = MI.getOperand(0).getReg();
3188     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3189     break;
3190   }
3191   case AMDGPU::G_SDIVREM: {
3192     DstDivReg = MI.getOperand(0).getReg();
3193     DstRemReg = MI.getOperand(1).getReg();
3194     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3195     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3196     break;
3197   }
3198   }
3199 
3200   if (Ty == S32)
3201     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3202   else
3203     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3204 
3205   if (DstDivReg) {
3206     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3207     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3208     B.buildSub(DstDivReg, SignXor, Sign);
3209   }
3210 
3211   if (DstRemReg) {
3212     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3213     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3214     B.buildSub(DstRemReg, SignXor, Sign);
3215   }
3216 
3217   MI.eraseFromParent();
3218   return true;
3219 }
3220 
3221 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3222                                                  MachineRegisterInfo &MRI,
3223                                                  MachineIRBuilder &B) const {
3224   Register Res = MI.getOperand(0).getReg();
3225   Register LHS = MI.getOperand(1).getReg();
3226   Register RHS = MI.getOperand(2).getReg();
3227   uint16_t Flags = MI.getFlags();
3228   LLT ResTy = MRI.getType(Res);
3229 
3230   const MachineFunction &MF = B.getMF();
3231   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3232                             MI.getFlag(MachineInstr::FmAfn);
3233 
3234   if (!AllowInaccurateRcp)
3235     return false;
3236 
3237   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3238     // 1 / x -> RCP(x)
3239     if (CLHS->isExactlyValue(1.0)) {
3240       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3241         .addUse(RHS)
3242         .setMIFlags(Flags);
3243 
3244       MI.eraseFromParent();
3245       return true;
3246     }
3247 
3248     // -1 / x -> RCP( FNEG(x) )
3249     if (CLHS->isExactlyValue(-1.0)) {
3250       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3251       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3252         .addUse(FNeg.getReg(0))
3253         .setMIFlags(Flags);
3254 
3255       MI.eraseFromParent();
3256       return true;
3257     }
3258   }
3259 
3260   // x / y -> x * (1.0 / y)
3261   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3262     .addUse(RHS)
3263     .setMIFlags(Flags);
3264   B.buildFMul(Res, LHS, RCP, Flags);
3265 
3266   MI.eraseFromParent();
3267   return true;
3268 }
3269 
3270 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3271                                                    MachineRegisterInfo &MRI,
3272                                                    MachineIRBuilder &B) const {
3273   Register Res = MI.getOperand(0).getReg();
3274   Register X = MI.getOperand(1).getReg();
3275   Register Y = MI.getOperand(2).getReg();
3276   uint16_t Flags = MI.getFlags();
3277   LLT ResTy = MRI.getType(Res);
3278 
3279   const MachineFunction &MF = B.getMF();
3280   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3281                             MI.getFlag(MachineInstr::FmAfn);
3282 
3283   if (!AllowInaccurateRcp)
3284     return false;
3285 
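     // A sketch of the sequence below: start from r = rcp(y), apply two
     // Newton-Raphson steps r = r + r*(1 - y*r) via FMA, form q = x*r, then
     // apply one final FMA correction of q against the residual x - y*q.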
3286   auto NegY = B.buildFNeg(ResTy, Y);
3287   auto One = B.buildFConstant(ResTy, 1.0);
3288 
3289   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3290     .addUse(Y)
3291     .setMIFlags(Flags);
3292 
3293   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3294   R = B.buildFMA(ResTy, Tmp0, R, R);
3295 
3296   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3297   R = B.buildFMA(ResTy, Tmp1, R, R);
3298 
3299   auto Ret = B.buildFMul(ResTy, X, R);
3300   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3301 
3302   B.buildFMA(Res, Tmp2, R, Ret);
3303   MI.eraseFromParent();
3304   return true;
3305 }
3306 
3307 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3308                                          MachineRegisterInfo &MRI,
3309                                          MachineIRBuilder &B) const {
3310   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3311     return true;
3312 
3313   Register Res = MI.getOperand(0).getReg();
3314   Register LHS = MI.getOperand(1).getReg();
3315   Register RHS = MI.getOperand(2).getReg();
3316 
3317   uint16_t Flags = MI.getFlags();
3318 
3319   LLT S16 = LLT::scalar(16);
3320   LLT S32 = LLT::scalar(32);
3321 
3322   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3323   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3324 
3325   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3326     .addUse(RHSExt.getReg(0))
3327     .setMIFlags(Flags);
3328 
3329   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3330   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3331 
3332   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3333     .addUse(RDst.getReg(0))
3334     .addUse(RHS)
3335     .addUse(LHS)
3336     .setMIFlags(Flags);
3337 
3338   MI.eraseFromParent();
3339   return true;
3340 }
3341 
3342 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3343 // to enable denorm mode; when false, restore the function's default denorm mode.
3344 static void toggleSPDenormMode(bool Enable,
3345                                MachineIRBuilder &B,
3346                                const GCNSubtarget &ST,
3347                                AMDGPU::SIModeRegisterDefaults Mode) {
3348   // Set SP denorm mode to this value.
3349   unsigned SPDenormMode =
3350     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3351 
3352   if (ST.hasDenormModeInst()) {
3353     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3354     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3355 
3356     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3357     B.buildInstr(AMDGPU::S_DENORM_MODE)
3358       .addImm(NewDenormModeValue);
3359 
3360   } else {
3361     // Select FP32 bit field in mode register.
3362     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3363                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3364                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
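     // i.e. hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 selects the FP32 denorm
     // bits of the MODE register (the width field is encoded as width minus 1).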
3365 
3366     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3367       .addImm(SPDenormMode)
3368       .addImm(SPDenormModeBitField);
3369   }
3370 }
3371 
3372 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3373                                          MachineRegisterInfo &MRI,
3374                                          MachineIRBuilder &B) const {
3375   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3376     return true;
3377 
3378   Register Res = MI.getOperand(0).getReg();
3379   Register LHS = MI.getOperand(1).getReg();
3380   Register RHS = MI.getOperand(2).getReg();
3381   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3382   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3383 
3384   uint16_t Flags = MI.getFlags();
3385 
3386   LLT S32 = LLT::scalar(32);
3387   LLT S1 = LLT::scalar(1);
3388 
3389   auto One = B.buildFConstant(S32, 1.0f);
3390 
3391   auto DenominatorScaled =
3392     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3393       .addUse(LHS)
3394       .addUse(RHS)
3395       .addImm(0)
3396       .setMIFlags(Flags);
3397   auto NumeratorScaled =
3398     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3399       .addUse(LHS)
3400       .addUse(RHS)
3401       .addImm(1)
3402       .setMIFlags(Flags);
3403 
3404   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3405     .addUse(DenominatorScaled.getReg(0))
3406     .setMIFlags(Flags);
3407   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3408 
3409   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3410   // aren't modeled as reading it.
3411   if (!Mode.allFP32Denormals())
3412     toggleSPDenormMode(true, B, ST, Mode);
3413 
3414   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3415   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3416   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3417   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3418   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3419   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3420 
3421   if (!Mode.allFP32Denormals())
3422     toggleSPDenormMode(false, B, ST, Mode);
3423 
3424   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3425     .addUse(Fma4.getReg(0))
3426     .addUse(Fma1.getReg(0))
3427     .addUse(Fma3.getReg(0))
3428     .addUse(NumeratorScaled.getReg(1))
3429     .setMIFlags(Flags);
3430 
3431   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3432     .addUse(Fmas.getReg(0))
3433     .addUse(RHS)
3434     .addUse(LHS)
3435     .setMIFlags(Flags);
3436 
3437   MI.eraseFromParent();
3438   return true;
3439 }
3440 
3441 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3442                                          MachineRegisterInfo &MRI,
3443                                          MachineIRBuilder &B) const {
3444   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3445     return true;
3446 
3447   Register Res = MI.getOperand(0).getReg();
3448   Register LHS = MI.getOperand(1).getReg();
3449   Register RHS = MI.getOperand(2).getReg();
3450 
3451   uint16_t Flags = MI.getFlags();
3452 
3453   LLT S64 = LLT::scalar(64);
3454   LLT S1 = LLT::scalar(1);
3455 
3456   auto One = B.buildFConstant(S64, 1.0);
3457 
3458   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3459     .addUse(LHS)
3460     .addUse(RHS)
3461     .addImm(0)
3462     .setMIFlags(Flags);
3463 
3464   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3465 
3466   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3467     .addUse(DivScale0.getReg(0))
3468     .setMIFlags(Flags);
3469 
3470   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3471   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3472   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3473 
3474   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3475     .addUse(LHS)
3476     .addUse(RHS)
3477     .addImm(1)
3478     .setMIFlags(Flags);
3479 
3480   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3481   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3482   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3483 
3484   Register Scale;
3485   if (!ST.hasUsableDivScaleConditionOutput()) {
3486     // Workaround a hardware bug on SI where the condition output from div_scale
3487     // is not usable.
3488 
3489     LLT S32 = LLT::scalar(32);
3490 
3491     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3492     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3493     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3494     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3495 
3496     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3497                               Scale1Unmerge.getReg(1));
3498     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3499                               Scale0Unmerge.getReg(1));
3500     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3501   } else {
3502     Scale = DivScale1.getReg(1);
3503   }
3504 
3505   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3506     .addUse(Fma4.getReg(0))
3507     .addUse(Fma3.getReg(0))
3508     .addUse(Mul.getReg(0))
3509     .addUse(Scale)
3510     .setMIFlags(Flags);
3511 
3512   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3513     .addUse(Fmas.getReg(0))
3514     .addUse(RHS)
3515     .addUse(LHS)
3516     .setMIFlags(Flags);
3517 
3518   MI.eraseFromParent();
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3523                                                  MachineRegisterInfo &MRI,
3524                                                  MachineIRBuilder &B) const {
3525   Register Res = MI.getOperand(0).getReg();
3526   Register LHS = MI.getOperand(2).getReg();
3527   Register RHS = MI.getOperand(3).getReg();
3528   uint16_t Flags = MI.getFlags();
3529 
3530   LLT S32 = LLT::scalar(32);
3531   LLT S1 = LLT::scalar(1);
3532 
3533   auto Abs = B.buildFAbs(S32, RHS, Flags);
3534   const APFloat C0Val(1.0f);
3535 
3536   auto C0 = B.buildConstant(S32, 0x6f800000);
3537   auto C1 = B.buildConstant(S32, 0x2f800000);
3538   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3539 
3540   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3541   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3542 
3543   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3544 
3545   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3546     .addUse(Mul0.getReg(0))
3547     .setMIFlags(Flags);
3548 
3549   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3550 
3551   B.buildFMul(Res, Sel, Mul1, Flags);
3552 
3553   MI.eraseFromParent();
3554   return true;
3555 }
3556 
3557 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3558 // FIXME: Why do we handle this one but not other removed instructions?
3559 //
3560 // Reciprocal square root.  The clamp prevents infinite results, clamping
3561 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3562 // +-max_float.
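     // The expansion below is, roughly:
     //   fmax(fmin(rsq(x), +max_float), -max_float)
     // using the IEEE min/max variants when the function runs in IEEE mode.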
3563 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3564                                                     MachineRegisterInfo &MRI,
3565                                                     MachineIRBuilder &B) const {
3566   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3567     return true;
3568 
3569   Register Dst = MI.getOperand(0).getReg();
3570   Register Src = MI.getOperand(2).getReg();
3571   auto Flags = MI.getFlags();
3572 
3573   LLT Ty = MRI.getType(Dst);
3574 
3575   const fltSemantics *FltSemantics;
3576   if (Ty == LLT::scalar(32))
3577     FltSemantics = &APFloat::IEEEsingle();
3578   else if (Ty == LLT::scalar(64))
3579     FltSemantics = &APFloat::IEEEdouble();
3580   else
3581     return false;
3582 
3583   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3584     .addUse(Src)
3585     .setMIFlags(Flags);
3586 
3587   // We don't need to concern ourselves with the snan handling difference: the
3588   // rsq has already quieted (or not), so use whichever variant selects directly.
3589   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3590   const bool UseIEEE = MFI->getMode().IEEE;
3591 
3592   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3593   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3594                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3595 
3596   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3597 
3598   if (UseIEEE)
3599     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3600   else
3601     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3602   MI.eraseFromParent();
3603   return true;
3604 }
3605 
3606 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3607   switch (IID) {
3608   case Intrinsic::amdgcn_ds_fadd:
3609     return AMDGPU::G_ATOMICRMW_FADD;
3610   case Intrinsic::amdgcn_ds_fmin:
3611     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3612   case Intrinsic::amdgcn_ds_fmax:
3613     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3614   default:
3615     llvm_unreachable("not a DS FP intrinsic");
3616   }
3617 }
3618 
3619 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3620                                                       MachineInstr &MI,
3621                                                       Intrinsic::ID IID) const {
3622   GISelChangeObserver &Observer = Helper.Observer;
3623   Observer.changingInstr(MI);
3624 
3625   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3626 
3627   // The remaining operands were used to set fields in the MemOperand on
3628   // construction.
3629   for (int I = 6; I > 3; --I)
3630     MI.RemoveOperand(I);
3631 
3632   MI.RemoveOperand(1); // Remove the intrinsic ID.
3633   Observer.changedInstr(MI);
3634   return true;
3635 }
3636 
3637 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3638                                             MachineRegisterInfo &MRI,
3639                                             MachineIRBuilder &B) const {
3640   uint64_t Offset =
3641     ST.getTargetLowering()->getImplicitParameterOffset(
3642       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3643   LLT DstTy = MRI.getType(DstReg);
3644   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3645 
3646   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3647   if (!loadInputValue(KernargPtrReg, B,
3648                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3649     return false;
3650 
3651   // FIXME: This should be nuw
3652   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3653   return true;
3654 }
3655 
3656 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3657                                                  MachineRegisterInfo &MRI,
3658                                                  MachineIRBuilder &B) const {
3659   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3660   if (!MFI->isEntryFunction()) {
3661     return legalizePreloadedArgIntrin(MI, MRI, B,
3662                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3663   }
3664 
3665   Register DstReg = MI.getOperand(0).getReg();
3666   if (!getImplicitArgPtr(DstReg, MRI, B))
3667     return false;
3668 
3669   MI.eraseFromParent();
3670   return true;
3671 }
3672 
3673 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3674                                               MachineRegisterInfo &MRI,
3675                                               MachineIRBuilder &B,
3676                                               unsigned AddrSpace) const {
3677   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3678   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3679   Register Hi32 = Unmerge.getReg(1);
3680 
3681   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3682   MI.eraseFromParent();
3683   return true;
3684 }
3685 
3686 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3687 // offset (the offset that is included in bounds checking and swizzling, to be
3688 // split between the instruction's voffset and immoffset fields) and soffset
3689 // (the offset that is excluded from bounds checking and swizzling, to go in
3690 // the instruction's soffset field).  This function takes the first kind of
3691 // offset and figures out how to split it between voffset and immoffset.
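     // For example, a constant original offset of 8212 (2 * 4096 + 20) is split
     // into a voffset register holding 8192 and an immoffset of 20, while a
     // constant 4095 becomes a zero voffset with immoffset 4095.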
3692 std::pair<Register, unsigned>
3693 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3694                                         Register OrigOffset) const {
3695   const unsigned MaxImm = 4095;
3696   Register BaseReg;
3697   unsigned ImmOffset;
3698   const LLT S32 = LLT::scalar(32);
3699   MachineRegisterInfo &MRI = *B.getMRI();
3700 
3701   std::tie(BaseReg, ImmOffset) =
3702       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3703 
3704   // If BaseReg is a pointer, convert it to int.
3705   if (MRI.getType(BaseReg).isPointer())
3706     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3707 
3708   // If the immediate value is too big for the immoffset field, keep only its
3709   // low 12 bits in the immoffset field so that the value that is copied/added
3710   // for the voffset field is a multiple of 4096, and it stands a better chance
3711   // of being CSEd with the copy/add for another similar load/store.
3712   // However, do not do that rounding down to a multiple of 4096 if that is a
3713   // negative number, as it appears to be illegal to have a negative offset
3714   // in the vgpr, even if adding the immediate offset makes it positive.
3715   unsigned Overflow = ImmOffset & ~MaxImm;
3716   ImmOffset -= Overflow;
3717   if ((int32_t)Overflow < 0) {
3718     Overflow += ImmOffset;
3719     ImmOffset = 0;
3720   }
3721 
3722   if (Overflow != 0) {
3723     if (!BaseReg) {
3724       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3725     } else {
3726       auto OverflowVal = B.buildConstant(S32, Overflow);
3727       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3728     }
3729   }
3730 
3731   if (!BaseReg)
3732     BaseReg = B.buildConstant(S32, 0).getReg(0);
3733 
3734   return std::make_pair(BaseReg, ImmOffset);
3735 }
3736 
3737 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3738 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3739                                           Register VOffset, Register SOffset,
3740                                           unsigned ImmOffset, Register VIndex,
3741                                           MachineRegisterInfo &MRI) const {
3742   Optional<ValueAndVReg> MaybeVOffsetVal =
3743       getIConstantVRegValWithLookThrough(VOffset, MRI);
3744   Optional<ValueAndVReg> MaybeSOffsetVal =
3745       getIConstantVRegValWithLookThrough(SOffset, MRI);
3746   Optional<ValueAndVReg> MaybeVIndexVal =
3747       getIConstantVRegValWithLookThrough(VIndex, MRI);
3748   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3749   // update the MMO with that offset. The stride is unknown so we can only do
3750   // this if VIndex is constant 0.
3751   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3752       MaybeVIndexVal->Value == 0) {
3753     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3754                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3755     MMO->setOffset(TotalOffset);
3756   } else {
3757     // We don't have a constant combined offset to use in the MMO. Give up.
3758     MMO->setValue((Value *)nullptr);
3759   }
3760 }
3761 
3762 /// Handle register layout difference for f16 images for some subtargets.
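     /// On subtargets with unpacked d16 VMEM each s16 element is widened into its
     /// own dword; on subtargets with the image-store d16 bug the data is
     /// padded/bitcast into a whole number of dwords; otherwise only <3 x s16> is
     /// padded out to <4 x s16>.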
3763 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3764                                              MachineRegisterInfo &MRI,
3765                                              Register Reg,
3766                                              bool ImageStore) const {
3767   const LLT S16 = LLT::scalar(16);
3768   const LLT S32 = LLT::scalar(32);
3769   LLT StoreVT = MRI.getType(Reg);
3770   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3771 
3772   if (ST.hasUnpackedD16VMem()) {
3773     auto Unmerge = B.buildUnmerge(S16, Reg);
3774 
3775     SmallVector<Register, 4> WideRegs;
3776     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3777       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3778 
3779     int NumElts = StoreVT.getNumElements();
3780 
3781     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3782         .getReg(0);
3783   }
3784 
3785   if (ImageStore && ST.hasImageStoreD16Bug()) {
3786     if (StoreVT.getNumElements() == 2) {
3787       SmallVector<Register, 4> PackedRegs;
3788       Reg = B.buildBitcast(S32, Reg).getReg(0);
3789       PackedRegs.push_back(Reg);
3790       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3791       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3792           .getReg(0);
3793     }
3794 
3795     if (StoreVT.getNumElements() == 3) {
3796       SmallVector<Register, 4> PackedRegs;
3797       auto Unmerge = B.buildUnmerge(S16, Reg);
3798       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3799         PackedRegs.push_back(Unmerge.getReg(I));
3800       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3801       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3802       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3803     }
3804 
3805     if (StoreVT.getNumElements() == 4) {
3806       SmallVector<Register, 4> PackedRegs;
3807       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3808       auto Unmerge = B.buildUnmerge(S32, Reg);
3809       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3810         PackedRegs.push_back(Unmerge.getReg(I));
3811       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3812       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3813           .getReg(0);
3814     }
3815 
3816     llvm_unreachable("invalid data type");
3817   }
3818 
3819   if (StoreVT == LLT::fixed_vector(3, S16)) {
3820     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3821               .getReg(0);
3822   }
3823   return Reg;
3824 }
3825 
3826 Register AMDGPULegalizerInfo::fixStoreSourceType(
3827   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3828   MachineRegisterInfo *MRI = B.getMRI();
3829   LLT Ty = MRI->getType(VData);
3830 
3831   const LLT S16 = LLT::scalar(16);
3832 
3833   // Fixup illegal register types for i8 stores.
3834   if (Ty == LLT::scalar(8) || Ty == S16) {
3835     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3836     return AnyExt;
3837   }
3838 
3839   if (Ty.isVector()) {
3840     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3841       if (IsFormat)
3842         return handleD16VData(B, *MRI, VData);
3843     }
3844   }
3845 
3846   return VData;
3847 }
3848 
3849 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3850                                               MachineRegisterInfo &MRI,
3851                                               MachineIRBuilder &B,
3852                                               bool IsTyped,
3853                                               bool IsFormat) const {
3854   Register VData = MI.getOperand(1).getReg();
3855   LLT Ty = MRI.getType(VData);
3856   LLT EltTy = Ty.getScalarType();
3857   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3858   const LLT S32 = LLT::scalar(32);
3859 
3860   VData = fixStoreSourceType(B, VData, IsFormat);
3861   Register RSrc = MI.getOperand(2).getReg();
3862 
3863   MachineMemOperand *MMO = *MI.memoperands_begin();
3864   const int MemSize = MMO->getSize();
3865 
3866   unsigned ImmOffset;
3867 
3868   // The typed intrinsics add an immediate after the registers.
3869   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3870 
3871   // The struct intrinsic variants add one additional operand over raw.
3872   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3873   Register VIndex;
3874   int OpOffset = 0;
3875   if (HasVIndex) {
3876     VIndex = MI.getOperand(3).getReg();
3877     OpOffset = 1;
3878   } else {
3879     VIndex = B.buildConstant(S32, 0).getReg(0);
3880   }
3881 
3882   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3883   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3884 
3885   unsigned Format = 0;
3886   if (IsTyped) {
3887     Format = MI.getOperand(5 + OpOffset).getImm();
3888     ++OpOffset;
3889   }
3890 
3891   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3892 
3893   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3894   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3895 
3896   unsigned Opc;
3897   if (IsTyped) {
3898     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3899                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3900   } else if (IsFormat) {
3901     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3902                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3903   } else {
3904     switch (MemSize) {
3905     case 1:
3906       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3907       break;
3908     case 2:
3909       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3910       break;
3911     default:
3912       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3913       break;
3914     }
3915   }
3916 
3917   auto MIB = B.buildInstr(Opc)
3918     .addUse(VData)              // vdata
3919     .addUse(RSrc)               // rsrc
3920     .addUse(VIndex)             // vindex
3921     .addUse(VOffset)            // voffset
3922     .addUse(SOffset)            // soffset
3923     .addImm(ImmOffset);         // offset(imm)
3924 
3925   if (IsTyped)
3926     MIB.addImm(Format);
3927 
3928   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3929      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3930      .addMemOperand(MMO);
3931 
3932   MI.eraseFromParent();
3933   return true;
3934 }
3935 
3936 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3937                                              MachineRegisterInfo &MRI,
3938                                              MachineIRBuilder &B,
3939                                              bool IsFormat,
3940                                              bool IsTyped) const {
3941   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3942   MachineMemOperand *MMO = *MI.memoperands_begin();
3943   const LLT MemTy = MMO->getMemoryType();
3944   const LLT S32 = LLT::scalar(32);
3945 
3946   Register Dst = MI.getOperand(0).getReg();
3947   Register RSrc = MI.getOperand(2).getReg();
3948 
3949   // The typed intrinsics add an immediate after the registers.
3950   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3951 
3952   // The struct intrinsic variants add one additional operand over raw.
3953   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3954   Register VIndex;
3955   int OpOffset = 0;
3956   if (HasVIndex) {
3957     VIndex = MI.getOperand(3).getReg();
3958     OpOffset = 1;
3959   } else {
3960     VIndex = B.buildConstant(S32, 0).getReg(0);
3961   }
3962 
3963   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3964   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3965 
3966   unsigned Format = 0;
3967   if (IsTyped) {
3968     Format = MI.getOperand(5 + OpOffset).getImm();
3969     ++OpOffset;
3970   }
3971 
3972   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3973   unsigned ImmOffset;
3974 
3975   LLT Ty = MRI.getType(Dst);
3976   LLT EltTy = Ty.getScalarType();
3977   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3978   const bool Unpacked = ST.hasUnpackedD16VMem();
3979 
3980   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3981   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3982 
3983   unsigned Opc;
3984 
3985   if (IsTyped) {
3986     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3987                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3988   } else if (IsFormat) {
3989     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3990                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3991   } else {
3992     switch (MemTy.getSizeInBits()) {
3993     case 8:
3994       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3995       break;
3996     case 16:
3997       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3998       break;
3999     default:
4000       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4001       break;
4002     }
4003   }
4004 
4005   Register LoadDstReg;
4006 
4007   bool IsExtLoad =
4008       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4009   LLT UnpackedTy = Ty.changeElementSize(32);
4010 
4011   if (IsExtLoad)
4012     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4013   else if (Unpacked && IsD16 && Ty.isVector())
4014     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4015   else
4016     LoadDstReg = Dst;
4017 
4018   auto MIB = B.buildInstr(Opc)
4019     .addDef(LoadDstReg)         // vdata
4020     .addUse(RSrc)               // rsrc
4021     .addUse(VIndex)             // vindex
4022     .addUse(VOffset)            // voffset
4023     .addUse(SOffset)            // soffset
4024     .addImm(ImmOffset);         // offset(imm)
4025 
4026   if (IsTyped)
4027     MIB.addImm(Format);
4028 
4029   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4030      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4031      .addMemOperand(MMO);
4032 
4033   if (LoadDstReg != Dst) {
4034     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4035 
4036     // The result was widened for an extending load; truncate back to Dst.
4037     if (IsExtLoad)
4038       B.buildTrunc(Dst, LoadDstReg);
4039     else {
4040       // Repack to original 16-bit vector result
4041       // FIXME: G_TRUNC should work, but legalization currently fails
4042       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4043       SmallVector<Register, 4> Repack;
4044       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4045         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4046       B.buildMerge(Dst, Repack);
4047     }
4048   }
4049 
4050   MI.eraseFromParent();
4051   return true;
4052 }
4053 
4054 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4055                                                MachineIRBuilder &B,
4056                                                bool IsInc) const {
4057   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4058                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4059   B.buildInstr(Opc)
4060     .addDef(MI.getOperand(0).getReg())
4061     .addUse(MI.getOperand(2).getReg())
4062     .addUse(MI.getOperand(3).getReg())
4063     .cloneMemRefs(MI);
4064   MI.eraseFromParent();
4065   return true;
4066 }
4067 
4068 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4069   switch (IntrID) {
4070   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4071   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4072     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4073   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4074   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4075     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4076   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4077   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4078     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4079   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4080   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4081     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4082   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4083   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4085   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4100   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4101   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4102     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4103   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4104   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4105     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4106   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4107   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4108     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4109   case Intrinsic::amdgcn_buffer_atomic_fadd:
4110   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4111   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4112     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4113   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4114   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4115     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4116   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4117   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4118     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4119   default:
4120     llvm_unreachable("unhandled atomic opcode");
4121   }
4122 }
4123 
4124 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4125                                                MachineIRBuilder &B,
4126                                                Intrinsic::ID IID) const {
4127   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4128                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4129   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4130 
4131   Register Dst;
4132 
4133   int OpOffset = 0;
4134   if (HasReturn) {
4135     // A few FP atomics do not support return values.
4136     Dst = MI.getOperand(0).getReg();
4137   } else {
4138     OpOffset = -1;
4139   }
4140 
4141   Register VData = MI.getOperand(2 + OpOffset).getReg();
4142   Register CmpVal;
4143 
4144   if (IsCmpSwap) {
4145     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4146     ++OpOffset;
4147   }
4148 
4149   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4150   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4151 
4152   // The struct intrinsic variants add one additional operand over raw.
4153   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4154   Register VIndex;
4155   if (HasVIndex) {
4156     VIndex = MI.getOperand(4 + OpOffset).getReg();
4157     ++OpOffset;
4158   } else {
4159     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4160   }
4161 
4162   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4163   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4164   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4165 
4166   MachineMemOperand *MMO = *MI.memoperands_begin();
4167 
4168   unsigned ImmOffset;
4169   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4170   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4171 
4172   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4173 
4174   if (HasReturn)
4175     MIB.addDef(Dst);
4176 
4177   MIB.addUse(VData); // vdata
4178 
4179   if (IsCmpSwap)
4180     MIB.addReg(CmpVal);
4181 
4182   MIB.addUse(RSrc)               // rsrc
4183      .addUse(VIndex)             // vindex
4184      .addUse(VOffset)            // voffset
4185      .addUse(SOffset)            // soffset
4186      .addImm(ImmOffset)          // offset(imm)
4187      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4188      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4189      .addMemOperand(MMO);
4190 
4191   MI.eraseFromParent();
4192   return true;
4193 }
4194 
4195 /// Pack the 16-bit vaddr operands of \p MI into dword-sized <2 x s16>
4196 /// registers, appending the results to \p PackedAddrs.
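     /// For example, with A16 enabled a 2D coordinate pair of s16 (x, y) becomes a
     /// single <2 x s16> build_vector, and a trailing unpaired component is padded
     /// with an undef s16.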
4197 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4198                                       SmallVectorImpl<Register> &PackedAddrs,
4199                                       unsigned ArgOffset,
4200                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4201                                       bool IsA16, bool IsG16) {
4202   const LLT S16 = LLT::scalar(16);
4203   const LLT V2S16 = LLT::fixed_vector(2, 16);
4204   auto EndIdx = Intr->VAddrEnd;
4205 
4206   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4207     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4208     if (!SrcOp.isReg())
4209       continue; // _L to _LZ may have eliminated this.
4210 
4211     Register AddrReg = SrcOp.getReg();
4212 
4213     if ((I < Intr->GradientStart) ||
4214         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4215         (I >= Intr->CoordStart && !IsA16)) {
4216       // Handle any gradient or coordinate operands that should not be packed
4217       if ((I < Intr->GradientStart) && IsA16 &&
4218           (B.getMRI()->getType(AddrReg) == S16)) {
4219         // Special handling of bias when A16 is on. Bias is of type half but
4220         // occupies full 32-bit.
4221         PackedAddrs.push_back(
4222             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4223                 .getReg(0));
4224       } else {
4225         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4226         PackedAddrs.push_back(AddrReg);
4227       }
4228     } else {
4229       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4230       // derivatives dx/dh and dx/dv are packed with undef.
4231       if (((I + 1) >= EndIdx) ||
4232           ((Intr->NumGradients / 2) % 2 == 1 &&
4233            (I == static_cast<unsigned>(Intr->GradientStart +
4234                                        (Intr->NumGradients / 2) - 1) ||
4235             I == static_cast<unsigned>(Intr->GradientStart +
4236                                        Intr->NumGradients - 1))) ||
4237           // Check for _L to _LZ optimization
4238           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4239         PackedAddrs.push_back(
4240             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4241                 .getReg(0));
4242       } else {
4243         PackedAddrs.push_back(
4244             B.buildBuildVector(
4245                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4246                 .getReg(0));
4247         ++I;
4248       }
4249     }
4250   }
4251 }
4252 
4253 /// Convert from separate vaddr components to a single vector address register,
4254 /// and replace the remaining operands with $noreg.
4255 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4256                                      int DimIdx, int NumVAddrs) {
4257   const LLT S32 = LLT::scalar(32);
4258 
4259   SmallVector<Register, 8> AddrRegs;
4260   for (int I = 0; I != NumVAddrs; ++I) {
4261     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4262     if (SrcOp.isReg()) {
4263       AddrRegs.push_back(SrcOp.getReg());
4264       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4265     }
4266   }
4267 
4268   int NumAddrRegs = AddrRegs.size();
4269   if (NumAddrRegs != 1) {
4270     // Above 8 elements, round up to the next power of 2 (i.e. 16).
4271     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4272       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4273       auto Undef = B.buildUndef(S32);
4274       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4275       NumAddrRegs = RoundedNumRegs;
4276     }
4277 
4278     auto VAddr =
4279         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4280     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4281   }
4282 
4283   for (int I = 1; I != NumVAddrs; ++I) {
4284     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4285     if (SrcOp.isReg())
4286       SrcOp.setReg(AMDGPU::NoRegister);
4287   }
4288 }
4289 
4290 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4291 ///
4292 /// Depending on the subtarget, load/store with 16-bit element data need to be
4293 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4294 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4295 /// registers.
4296 ///
4297 /// We don't want to directly select image instructions just yet, but we do
4298 /// want to expose all register repacking to the legalizer/combiners. We also
4299 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4300 /// defining a multitude of intermediate image instructions, directly hack on
4301 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4302 /// padding now-unnecessary arguments with $noreg.
4303 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4304     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4305     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4306 
4307   const unsigned NumDefs = MI.getNumExplicitDefs();
4308   const unsigned ArgOffset = NumDefs + 1;
4309   bool IsTFE = NumDefs == 2;
4310   // We are only processing the operands of d16 image operations on subtargets
4311   // that use the unpacked register layout, or need to repack the TFE result.
4312 
4313   // TODO: Do we need to guard against already legalized intrinsics?
4314   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4315       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4316 
4317   MachineRegisterInfo *MRI = B.getMRI();
4318   const LLT S32 = LLT::scalar(32);
4319   const LLT S16 = LLT::scalar(16);
4320   const LLT V2S16 = LLT::fixed_vector(2, 16);
4321 
4322   unsigned DMask = 0;
4323 
4324   // Check for 16-bit addresses and gradients, and pack them if present.
4325   LLT GradTy =
4326       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4327   LLT AddrTy =
4328       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4329   const bool IsG16 = GradTy == S16;
4330   const bool IsA16 = AddrTy == S16;
4331 
4332   int DMaskLanes = 0;
4333   if (!BaseOpcode->Atomic) {
4334     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4335     if (BaseOpcode->Gather4) {
4336       DMaskLanes = 4;
4337     } else if (DMask != 0) {
4338       DMaskLanes = countPopulation(DMask);
4339     } else if (!IsTFE && !BaseOpcode->Store) {
4340       // If dmask is 0, this is a no-op load. This can be eliminated.
4341       B.buildUndef(MI.getOperand(0));
4342       MI.eraseFromParent();
4343       return true;
4344     }
4345   }
4346 
4347   Observer.changingInstr(MI);
4348   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4349 
4350   unsigned NewOpcode = NumDefs == 0 ?
4351     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4352 
4353   // Track that we legalized this
4354   MI.setDesc(B.getTII().get(NewOpcode));
4355 
4356   // With TFE enabled and dmask set to 0, we only expect the error flag back.
4357   // Force dmask to be at least 1, otherwise the instruction will fail.
4358   if (IsTFE && DMask == 0) {
4359     DMask = 0x1;
4360     DMaskLanes = 1;
4361     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4362   }
4363 
4364   if (BaseOpcode->Atomic) {
4365     Register VData0 = MI.getOperand(2).getReg();
4366     LLT Ty = MRI->getType(VData0);
4367 
4368     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4369     if (Ty.isVector())
4370       return false;
4371 
4372     if (BaseOpcode->AtomicX2) {
4373       Register VData1 = MI.getOperand(3).getReg();
4374       // The two values are packed in one register.
4375       LLT PackedTy = LLT::fixed_vector(2, Ty);
4376       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4377       MI.getOperand(2).setReg(Concat.getReg(0));
4378       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4379     }
4380   }
4381 
4382   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4383 
4384   // Optimize _L to _LZ when _L is zero
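  // For example, image_sample_l with a constant non-positive lod operand is
  // rewritten to image_sample_lz and the lod operand is removed.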
4385   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4386           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4387     const ConstantFP *ConstantLod;
4388 
4389     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4390                  m_GFCst(ConstantLod))) {
4391       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4392         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4393         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4394             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4395                                                      Intr->Dim);
4396 
4397         // The starting indexes should remain in the same place.
4398         --CorrectedNumVAddrs;
4399 
4400         MI.getOperand(MI.getNumExplicitDefs())
4401             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4402         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4403         Intr = NewImageDimIntr;
4404       }
4405     }
4406   }
4407 
4408   // Optimize _mip away when 'lod' is zero.
4409   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4410     int64_t ConstantLod;
4411     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4412                  m_ICst(ConstantLod))) {
4413       if (ConstantLod == 0) {
4414         // TODO: Change the intrinsic opcode and remove the operand instead
4415         // of replacing it with 0, as is done for the _L to _LZ handling above.
4416         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4417         --CorrectedNumVAddrs;
4418       }
4419     }
4420   }
4421 
4422   // Rewrite the addressing register layout before doing anything else.
4423   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4424     // 16-bit gradients are supported, but they are tied to the A16 control,
4425     // so both gradients and addresses must be 16-bit.
4426     return false;
4427   }
4428 
4429   if (IsA16 && !ST.hasA16()) {
4430     // A16 not supported
4431     return false;
4432   }
4433 
4434   if (IsA16 || IsG16) {
4435     if (Intr->NumVAddrs > 1) {
4436       SmallVector<Register, 4> PackedRegs;
4437 
4438       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4439                                 IsG16);
4440 
4441       // See also below in the non-a16 branch
4442       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4443                           PackedRegs.size() <= ST.getNSAMaxSize();
4444 
4445       if (!UseNSA && PackedRegs.size() > 1) {
4446         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4447         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4448         PackedRegs[0] = Concat.getReg(0);
4449         PackedRegs.resize(1);
4450       }
4451 
4452       const unsigned NumPacked = PackedRegs.size();
4453       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4454         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4455         if (!SrcOp.isReg()) {
4456           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4457           continue;
4458         }
4459 
4460         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4461 
4462         if (I - Intr->VAddrStart < NumPacked)
4463           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4464         else
4465           SrcOp.setReg(AMDGPU::NoRegister);
4466       }
4467     }
4468   } else {
4469     // If the register allocator cannot place the address registers contiguously
4470     // without introducing moves, then using the non-sequential address encoding
4471     // is always preferable, since it saves VALU instructions and is usually a
4472     // wash in terms of code size or even better.
4473     //
4474     // However, we currently have no way of hinting to the register allocator
4475     // that MIMG addresses should be placed contiguously when it is possible to
4476     // do so, so force non-NSA for the common 2-address case as a heuristic.
4477     //
4478     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4479     // allocation when possible.
4480     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4481                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4482 
4483     if (!UseNSA && Intr->NumVAddrs > 1)
4484       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4485                                Intr->NumVAddrs);
4486   }
4487 
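  // Record whether the addresses and gradients were 16-bit as a trailing
  // immediate: bit 0 = A16, bit 1 = G16.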
4488   int Flags = 0;
4489   if (IsA16)
4490     Flags |= 1;
4491   if (IsG16)
4492     Flags |= 2;
4493   MI.addOperand(MachineOperand::CreateImm(Flags));
4494 
4495   if (BaseOpcode->Store) { // No TFE for stores?
4496     // TODO: Handle dmask trim
4497     Register VData = MI.getOperand(1).getReg();
4498     LLT Ty = MRI->getType(VData);
4499     if (!Ty.isVector() || Ty.getElementType() != S16)
4500       return true;
4501 
4502     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4503     if (RepackedReg != VData) {
4504       MI.getOperand(1).setReg(RepackedReg);
4505     }
4506 
4507     return true;
4508   }
4509 
4510   Register DstReg = MI.getOperand(0).getReg();
4511   LLT Ty = MRI->getType(DstReg);
4512   const LLT EltTy = Ty.getScalarType();
4513   const bool IsD16 = Ty.getScalarType() == S16;
4514   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4515 
4516   // Confirm that the return type is large enough for the dmask specified
4517   if (NumElts < DMaskLanes)
4518     return false;
4519 
4520   if (NumElts > 4 || DMaskLanes > 4)
4521     return false;
4522 
4523   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4524   const LLT AdjustedTy =
4525       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4526 
4527   // The raw dword-aligned data component of the load. The only legal cases
4528   // where this matters should be when using the packed D16 format, for
4529   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4530   LLT RoundedTy;
4531 
4532   // S32 vector to cover all data, plus the TFE result element.
4533   LLT TFETy;
4534 
4535   // Register type to use for each loaded component. Will be S32 or V2S16.
4536   LLT RegTy;
4537 
4538   if (IsD16 && ST.hasUnpackedD16VMem()) {
4539     RoundedTy =
4540         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4541     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4542     RegTy = S32;
4543   } else {
4544     unsigned EltSize = EltTy.getSizeInBits();
4545     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4546     unsigned RoundedSize = 32 * RoundedElts;
4547     RoundedTy = LLT::scalarOrVector(
4548         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4549     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4550     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4551   }
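  // For example, a packed d16 load with a <3 x s16> result and three dmask
  // lanes gives RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = <2 x s16>
  // (or s32 when TFE is enabled).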
4552 
4553   // The return type does not need adjustment.
4554   // TODO: Should we change s16 case to s32 or <2 x s16>?
4555   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4556     return true;
4557 
4558   Register Dst1Reg;
4559 
4560   // Insert after the instruction.
4561   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4562 
4563   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4564   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4565   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4566   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4567 
4568   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4569 
4570   MI.getOperand(0).setReg(NewResultReg);
4571 
4572   // In the IR, TFE is supposed to be used with a 2 element struct return
4573   // type. The instruction really returns these two values in one contiguous
4574   // register, with one additional dword beyond the loaded data. Rewrite the
4575   // return type to use a single register result.
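  // For example, a TFE load with a <4 x s32> data result and an s32 status
  // result is given a single <5 x s32> result register, which is unmerged
  // below.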
4576 
4577   if (IsTFE) {
4578     Dst1Reg = MI.getOperand(1).getReg();
4579     if (MRI->getType(Dst1Reg) != S32)
4580       return false;
4581 
4582     // TODO: Make sure the TFE operand bit is set.
4583     MI.RemoveOperand(1);
4584 
4585     // Handle the easy case that requires no repack instructions.
4586     if (Ty == S32) {
4587       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4588       return true;
4589     }
4590   }
4591 
4592   // Now figure out how to copy the new result register back into the old
4593   // result.
4594   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4595 
4596   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4597 
4598   if (ResultNumRegs == 1) {
4599     assert(!IsTFE);
4600     ResultRegs[0] = NewResultReg;
4601   } else {
4602     // We have to repack into a new vector of some kind.
4603     for (int I = 0; I != NumDataRegs; ++I)
4604       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4605     B.buildUnmerge(ResultRegs, NewResultReg);
4606 
4607     // Drop the final TFE element to get the data part. The TFE result is
4608     // directly written to the right place already.
4609     if (IsTFE)
4610       ResultRegs.resize(NumDataRegs);
4611   }
4612 
4613   // For an s16 scalar result, we form an s32 result with a truncate regardless
4614   // of packed vs. unpacked.
4615   if (IsD16 && !Ty.isVector()) {
4616     B.buildTrunc(DstReg, ResultRegs[0]);
4617     return true;
4618   }
4619 
4620   // Avoid a build/concat_vector of 1 entry.
4621   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4622     B.buildBitcast(DstReg, ResultRegs[0]);
4623     return true;
4624   }
4625 
4626   assert(Ty.isVector());
4627 
4628   if (IsD16) {
4629     // For packed D16 results with TFE enabled, all the data components are
4630     // S32. Cast back to the expected type.
4631     //
4632     // TODO: We don't really need to load s32 elements. We would only need one
4633     // cast for the TFE result if a multiple of v2s16 was used.
4634     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4635       for (Register &Reg : ResultRegs)
4636         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4637     } else if (ST.hasUnpackedD16VMem()) {
4638       for (Register &Reg : ResultRegs)
4639         Reg = B.buildTrunc(S16, Reg).getReg(0);
4640     }
4641   }
4642 
4643   auto padWithUndef = [&](LLT Ty, int NumElts) {
4644     if (NumElts == 0)
4645       return;
4646     Register Undef = B.buildUndef(Ty).getReg(0);
4647     for (int I = 0; I != NumElts; ++I)
4648       ResultRegs.push_back(Undef);
4649   };
4650 
4651   // Pad out any elements eliminated due to the dmask.
4652   LLT ResTy = MRI->getType(ResultRegs[0]);
4653   if (!ResTy.isVector()) {
4654     padWithUndef(ResTy, NumElts - ResultRegs.size());
4655     B.buildBuildVector(DstReg, ResultRegs);
4656     return true;
4657   }
4658 
4659   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4660   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4661 
4662   // Deal with the one annoying legal case.
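  // For a <3 x s16> result, the loaded <4 x s16> value (reassembled from
  // <2 x s16> pieces in the TFE case) is trimmed or padded to match the
  // destination's element count.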
4663   const LLT V3S16 = LLT::fixed_vector(3, 16);
4664   if (Ty == V3S16) {
4665     if (IsTFE) {
4666       if (ResultRegs.size() == 1) {
4667         NewResultReg = ResultRegs[0];
4668       } else if (ResultRegs.size() == 2) {
4669         LLT V4S16 = LLT::fixed_vector(4, 16);
4670         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4671       } else {
4672         return false;
4673       }
4674     }
4675 
4676     if (MRI->getType(DstReg).getNumElements() <
4677         MRI->getType(NewResultReg).getNumElements()) {
4678       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4679     } else {
4680       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4681     }
4682     return true;
4683   }
4684 
4685   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4686   B.buildConcatVectors(DstReg, ResultRegs);
4687   return true;
4688 }
4689 
4690 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4691   LegalizerHelper &Helper, MachineInstr &MI) const {
4692   MachineIRBuilder &B = Helper.MIRBuilder;
4693   GISelChangeObserver &Observer = Helper.Observer;
4694 
4695   Register Dst = MI.getOperand(0).getReg();
4696   LLT Ty = B.getMRI()->getType(Dst);
4697   unsigned Size = Ty.getSizeInBits();
4698   MachineFunction &MF = B.getMF();
4699 
4700   Observer.changingInstr(MI);
4701 
4702   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4703     Ty = getBitcastRegisterType(Ty);
4704     Helper.bitcastDst(MI, Ty, 0);
4705     Dst = MI.getOperand(0).getReg();
4706     B.setInsertPt(B.getMBB(), MI);
4707   }
4708 
4709   // FIXME: We don't really need this intermediate instruction. The intrinsic
4710   // should be fixed to have a memory operand. Since it's readnone, we're not
4711   // allowed to add one.
4712   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4713   MI.RemoveOperand(1); // Remove intrinsic ID
4714 
4715   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4716   // TODO: Should this use datalayout alignment?
4717   const unsigned MemSize = (Size + 7) / 8;
4718   const Align MemAlign(4);
4719   MachineMemOperand *MMO = MF.getMachineMemOperand(
4720       MachinePointerInfo(),
4721       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4722           MachineMemOperand::MOInvariant,
4723       MemSize, MemAlign);
4724   MI.addMemOperand(MF, MMO);
4725 
4726   // There are no 96-bit result scalar loads, but widening to 128-bit should
4727   // always be legal. We may need to restore this to a 96-bit result if it turns
4728   // out this needs to be converted to a vector load during RegBankSelect.
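  // For example, a <3 x s32> (96-bit) result is widened to <4 x s32> here.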
4729   if (!isPowerOf2_32(Size)) {
4730     if (Ty.isVector())
4731       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4732     else
4733       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4734   }
4735 
4736   Observer.changedInstr(MI);
4737   return true;
4738 }
4739 
4740 // TODO: Move to selection
4741 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4742                                                 MachineRegisterInfo &MRI,
4743                                                 MachineIRBuilder &B) const {
4744   if (!ST.isTrapHandlerEnabled() ||
4745       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4746     return legalizeTrapEndpgm(MI, MRI, B);
4747 
4748   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4749     switch (*HsaAbiVer) {
4750     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4751     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4752       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4753     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4754       return ST.supportsGetDoorbellID() ?
4755           legalizeTrapHsa(MI, MRI, B) :
4756           legalizeTrapHsaQueuePtr(MI, MRI, B);
4757     }
4758   }
4759 
4760   llvm_unreachable("Unknown trap handler");
4761 }
4762 
4763 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4764     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4765   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4766   MI.eraseFromParent();
4767   return true;
4768 }
4769 
4770 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4771     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4772   // Pass queue pointer to trap handler as input, and insert trap instruction
4773   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
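  // The queue pointer is passed to the trap handler in SGPR0_SGPR1 as an
  // implicit use of the s_trap instruction.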
4774   Register LiveIn =
4775     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4776   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4777     return false;
4778 
4779   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4780   B.buildCopy(SGPR01, LiveIn);
4781   B.buildInstr(AMDGPU::S_TRAP)
4782       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4783       .addReg(SGPR01, RegState::Implicit);
4784 
4785   MI.eraseFromParent();
4786   return true;
4787 }
4788 
4789 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4790     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4791   B.buildInstr(AMDGPU::S_TRAP)
4792       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4793   MI.eraseFromParent();
4794   return true;
4795 }
4796 
4797 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4798     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4799   // If this is a non-HSA path or the trap handler is disabled, report a
4800   // warning instead of emitting a trap.
4801   if (!ST.isTrapHandlerEnabled() ||
4802       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4803     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4804                                      "debugtrap handler not supported",
4805                                      MI.getDebugLoc(), DS_Warning);
4806     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4807     Ctx.diagnose(NoTrap);
4808   } else {
4809     // Insert debug-trap instruction
4810     B.buildInstr(AMDGPU::S_TRAP)
4811         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4812   }
4813 
4814   MI.eraseFromParent();
4815   return true;
4816 }
4817 
4818 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4819                                                MachineIRBuilder &B) const {
4820   MachineRegisterInfo &MRI = *B.getMRI();
4821   const LLT S16 = LLT::scalar(16);
4822   const LLT S32 = LLT::scalar(32);
4823 
4824   Register DstReg = MI.getOperand(0).getReg();
4825   Register NodePtr = MI.getOperand(2).getReg();
4826   Register RayExtent = MI.getOperand(3).getReg();
4827   Register RayOrigin = MI.getOperand(4).getReg();
4828   Register RayDir = MI.getOperand(5).getReg();
4829   Register RayInvDir = MI.getOperand(6).getReg();
4830   Register TDescr = MI.getOperand(7).getReg();
4831 
4832   if (!ST.hasGFX10_AEncoding()) {
4833     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4834                                         "intrinsic not supported on subtarget",
4835                                         MI.getDebugLoc());
4836     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4837     return false;
4838   }
4839 
4840   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4841   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4842   const unsigned NumVDataDwords = 4;
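  // Address dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir and ray_inv_dir (6, or 3 when packed as a16 halves).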
4843   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4844   const bool UseNSA =
4845       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4846   const unsigned BaseOpcodes[2][2] = {
4847       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4848       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4849        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4850   int Opcode;
4851   if (UseNSA) {
4852     Opcode =
4853         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4854                               NumVDataDwords, NumVAddrDwords);
4855   } else {
4856     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4857                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4858                                    PowerOf2Ceil(NumVAddrDwords));
4859   }
4860   assert(Opcode != -1);
4861 
4862   SmallVector<Register, 12> Ops;
4863   if (Is64) {
4864     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4865     Ops.push_back(Unmerge.getReg(0));
4866     Ops.push_back(Unmerge.getReg(1));
4867   } else {
4868     Ops.push_back(NodePtr);
4869   }
4870   Ops.push_back(RayExtent);
4871 
4872   auto packLanes = [&Ops, &S32, &B](Register Src) {
4873     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4874     Ops.push_back(Unmerge.getReg(0));
4875     Ops.push_back(Unmerge.getReg(1));
4876     Ops.push_back(Unmerge.getReg(2));
4877   };
4878 
4879   packLanes(RayOrigin);
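  // With a16 ray directions, the six half components of ray_dir and
  // ray_inv_dir are packed into three dwords: (dir.x, dir.y), (dir.z, inv.x)
  // and (inv.y, inv.z).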
4880   if (IsA16) {
4881     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4882     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4883     Register R1 = MRI.createGenericVirtualRegister(S32);
4884     Register R2 = MRI.createGenericVirtualRegister(S32);
4885     Register R3 = MRI.createGenericVirtualRegister(S32);
4886     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4887     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4888     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4889     Ops.push_back(R1);
4890     Ops.push_back(R2);
4891     Ops.push_back(R3);
4892   } else {
4893     packLanes(RayDir);
4894     packLanes(RayInvDir);
4895   }
4896 
4897   if (!UseNSA) {
4898     // Build a single vector containing all the operands prepared so far.
4899     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4900     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4901     Ops.clear();
4902     Ops.push_back(MergedOps);
4903   }
4904 
4905   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4906     .addDef(DstReg)
4907     .addImm(Opcode);
4908 
4909   for (Register R : Ops) {
4910     MIB.addUse(R);
4911   }
4912 
4913   MIB.addUse(TDescr)
4914      .addImm(IsA16 ? 1 : 0)
4915      .cloneMemRefs(MI);
4916 
4917   MI.eraseFromParent();
4918   return true;
4919 }
4920 
4921 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4922                                             MachineInstr &MI) const {
4923   MachineIRBuilder &B = Helper.MIRBuilder;
4924   MachineRegisterInfo &MRI = *B.getMRI();
4925 
4926   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4927   auto IntrID = MI.getIntrinsicID();
4928   switch (IntrID) {
4929   case Intrinsic::amdgcn_if:
4930   case Intrinsic::amdgcn_else: {
4931     MachineInstr *Br = nullptr;
4932     MachineBasicBlock *UncondBrTarget = nullptr;
4933     bool Negated = false;
4934     if (MachineInstr *BrCond =
4935             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4936       const SIRegisterInfo *TRI
4937         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4938 
4939       Register Def = MI.getOperand(1).getReg();
4940       Register Use = MI.getOperand(3).getReg();
4941 
4942       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4943 
4944       if (Negated)
4945         std::swap(CondBrTarget, UncondBrTarget);
4946 
4947       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4948       if (IntrID == Intrinsic::amdgcn_if) {
4949         B.buildInstr(AMDGPU::SI_IF)
4950           .addDef(Def)
4951           .addUse(Use)
4952           .addMBB(UncondBrTarget);
4953       } else {
4954         B.buildInstr(AMDGPU::SI_ELSE)
4955             .addDef(Def)
4956             .addUse(Use)
4957             .addMBB(UncondBrTarget);
4958       }
4959 
4960       if (Br) {
4961         Br->getOperand(0).setMBB(CondBrTarget);
4962       } else {
4963         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4964         // since we're swapping branch targets it needs to be reinserted.
4965         // FIXME: IRTranslator should probably not do this
4966         B.buildBr(*CondBrTarget);
4967       }
4968 
4969       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4970       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4971       MI.eraseFromParent();
4972       BrCond->eraseFromParent();
4973       return true;
4974     }
4975 
4976     return false;
4977   }
4978   case Intrinsic::amdgcn_loop: {
4979     MachineInstr *Br = nullptr;
4980     MachineBasicBlock *UncondBrTarget = nullptr;
4981     bool Negated = false;
4982     if (MachineInstr *BrCond =
4983             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4984       const SIRegisterInfo *TRI
4985         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4986 
4987       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4988       Register Reg = MI.getOperand(2).getReg();
4989 
4990       if (Negated)
4991         std::swap(CondBrTarget, UncondBrTarget);
4992 
4993       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4994       B.buildInstr(AMDGPU::SI_LOOP)
4995         .addUse(Reg)
4996         .addMBB(UncondBrTarget);
4997 
4998       if (Br)
4999         Br->getOperand(0).setMBB(CondBrTarget);
5000       else
5001         B.buildBr(*CondBrTarget);
5002 
5003       MI.eraseFromParent();
5004       BrCond->eraseFromParent();
5005       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5006       return true;
5007     }
5008 
5009     return false;
5010   }
5011   case Intrinsic::amdgcn_kernarg_segment_ptr:
5012     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5013       // This only makes sense to call in a kernel, so just lower to null.
5014       B.buildConstant(MI.getOperand(0).getReg(), 0);
5015       MI.eraseFromParent();
5016       return true;
5017     }
5018 
5019     return legalizePreloadedArgIntrin(
5020       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5021   case Intrinsic::amdgcn_implicitarg_ptr:
5022     return legalizeImplicitArgPtr(MI, MRI, B);
5023   case Intrinsic::amdgcn_workitem_id_x:
5024     return legalizePreloadedArgIntrin(MI, MRI, B,
5025                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5026   case Intrinsic::amdgcn_workitem_id_y:
5027     return legalizePreloadedArgIntrin(MI, MRI, B,
5028                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5029   case Intrinsic::amdgcn_workitem_id_z:
5030     return legalizePreloadedArgIntrin(MI, MRI, B,
5031                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5032   case Intrinsic::amdgcn_workgroup_id_x:
5033     return legalizePreloadedArgIntrin(MI, MRI, B,
5034                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5035   case Intrinsic::amdgcn_workgroup_id_y:
5036     return legalizePreloadedArgIntrin(MI, MRI, B,
5037                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5038   case Intrinsic::amdgcn_workgroup_id_z:
5039     return legalizePreloadedArgIntrin(MI, MRI, B,
5040                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5041   case Intrinsic::amdgcn_dispatch_ptr:
5042     return legalizePreloadedArgIntrin(MI, MRI, B,
5043                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5044   case Intrinsic::amdgcn_queue_ptr:
5045     return legalizePreloadedArgIntrin(MI, MRI, B,
5046                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5047   case Intrinsic::amdgcn_implicit_buffer_ptr:
5048     return legalizePreloadedArgIntrin(
5049       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5050   case Intrinsic::amdgcn_dispatch_id:
5051     return legalizePreloadedArgIntrin(MI, MRI, B,
5052                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5053   case Intrinsic::amdgcn_fdiv_fast:
5054     return legalizeFDIVFastIntrin(MI, MRI, B);
5055   case Intrinsic::amdgcn_is_shared:
5056     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5057   case Intrinsic::amdgcn_is_private:
5058     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5059   case Intrinsic::amdgcn_wavefrontsize: {
5060     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5061     MI.eraseFromParent();
5062     return true;
5063   }
5064   case Intrinsic::amdgcn_s_buffer_load:
5065     return legalizeSBufferLoad(Helper, MI);
5066   case Intrinsic::amdgcn_raw_buffer_store:
5067   case Intrinsic::amdgcn_struct_buffer_store:
5068     return legalizeBufferStore(MI, MRI, B, false, false);
5069   case Intrinsic::amdgcn_raw_buffer_store_format:
5070   case Intrinsic::amdgcn_struct_buffer_store_format:
5071     return legalizeBufferStore(MI, MRI, B, false, true);
5072   case Intrinsic::amdgcn_raw_tbuffer_store:
5073   case Intrinsic::amdgcn_struct_tbuffer_store:
5074     return legalizeBufferStore(MI, MRI, B, true, true);
5075   case Intrinsic::amdgcn_raw_buffer_load:
5076   case Intrinsic::amdgcn_struct_buffer_load:
5077     return legalizeBufferLoad(MI, MRI, B, false, false);
5078   case Intrinsic::amdgcn_raw_buffer_load_format:
5079   case Intrinsic::amdgcn_struct_buffer_load_format:
5080     return legalizeBufferLoad(MI, MRI, B, true, false);
5081   case Intrinsic::amdgcn_raw_tbuffer_load:
5082   case Intrinsic::amdgcn_struct_tbuffer_load:
5083     return legalizeBufferLoad(MI, MRI, B, true, true);
5084   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5085   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5086   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5087   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5088   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5089   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5090   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5091   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5092   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5093   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5094   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5098   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5099   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5100   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5101   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5102   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5103   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5104   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5105   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5106   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5107   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5108   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5109   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5110   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5111   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5112   case Intrinsic::amdgcn_buffer_atomic_fadd:
5113   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5114   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5115   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5116   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5117     return legalizeBufferAtomic(MI, B, IntrID);
5118   case Intrinsic::amdgcn_atomic_inc:
5119     return legalizeAtomicIncDec(MI, B, true);
5120   case Intrinsic::amdgcn_atomic_dec:
5121     return legalizeAtomicIncDec(MI, B, false);
5122   case Intrinsic::trap:
5123     return legalizeTrapIntrinsic(MI, MRI, B);
5124   case Intrinsic::debugtrap:
5125     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5126   case Intrinsic::amdgcn_rsq_clamp:
5127     return legalizeRsqClampIntrinsic(MI, MRI, B);
5128   case Intrinsic::amdgcn_ds_fadd:
5129   case Intrinsic::amdgcn_ds_fmin:
5130   case Intrinsic::amdgcn_ds_fmax:
5131     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5132   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5133     return legalizeBVHIntrinsic(MI, B);
5134   default: {
5135     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5136             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5137       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5138     return true;
5139   }
5140   }
5141 
5142   return true;
5143 }
5144