1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
137   LLT CoercedTy;
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // workaround this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this case when the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElements(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElements(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElements(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElements(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElements(0, S16, 2)
969       // FIXME: Fixing non-power-of-2 before clamp is workaround for
970       // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
997       // FIXME: Fixing non-power-of-2 before clamp is workaround for
998       // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117                                       {S64, ConstantPtr, S64, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175               unsigned Align = Query.MMODescrs[0].AlignInBits;
1176               return std::make_pair(0, LLT::scalar(Align));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
1298   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299   // demarshalling
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
1386       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387                  bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       // FIXME: Multiples of 16 should not be legal.
1429       .legalIf([=](const LegalityQuery &Query) {
1430           const LLT BigTy = Query.Types[BigTyIdx];
1431           const LLT LitTy = Query.Types[LitTyIdx];
1432           return (BigTy.getSizeInBits() % 32 == 0) &&
1433                  (LitTy.getSizeInBits() % 16 == 0);
1434         })
1435       .widenScalarIf(
1436         [=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           return (BigTy.getScalarSizeInBits() < 16);
1439         },
1440         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1441       .widenScalarIf(
1442         [=](const LegalityQuery &Query) {
1443           const LLT LitTy = Query.Types[LitTyIdx];
1444           return (LitTy.getScalarSizeInBits() < 16);
1445         },
1446         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1447       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1448       .widenScalarToNextPow2(BigTyIdx, 32);
1449 
1450   }
1451 
1452   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1453     .legalForCartesianProduct(AllS32Vectors, {S32})
1454     .legalForCartesianProduct(AllS64Vectors, {S64})
1455     .clampNumElements(0, V16S32, V32S32)
1456     .clampNumElements(0, V2S64, V16S64)
1457     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1458 
1459   if (ST.hasScalarPackInsts()) {
1460     BuildVector
1461       // FIXME: Should probably widen s1 vectors straight to s32
1462       .minScalarOrElt(0, S16)
1463       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1464       .minScalar(1, S32);
1465 
1466     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1467       .legalFor({V2S16, S32})
1468       .lower();
1469     BuildVector.minScalarOrElt(0, S32);
1470   } else {
1471     BuildVector.customFor({V2S16, S16});
1472     BuildVector.minScalarOrElt(0, S32);
1473 
1474     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1475       .customFor({V2S16, S32})
1476       .lower();
1477   }
1478 
1479   BuildVector.legalIf(isRegisterType(0));
1480 
1481   // FIXME: Clamp maximum size
1482   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1483     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1484     .clampMaxNumElements(0, S32, 32)
1485     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1486     .clampMaxNumElements(0, S16, 64);
1487 
1488   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1489   // pre-legalize.
1490   if (ST.hasVOP3PInsts()) {
1491     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1492       .customFor({V2S16, V2S16})
1493       .lower();
1494   } else
1495     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1496 
1497   // Merge/Unmerge
1498   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1499     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1500     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1501 
1502     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1503       const LLT Ty = Query.Types[TypeIdx];
1504       if (Ty.isVector()) {
1505         const LLT &EltTy = Ty.getElementType();
1506         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1507           return true;
1508         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1509           return true;
1510       }
1511       return false;
1512     };
1513 
1514     auto &Builder = getActionDefinitionsBuilder(Op)
1515       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516       .lowerFor({{S16, V2S16}})
1517       .lowerIf([=](const LegalityQuery &Query) {
1518           const LLT BigTy = Query.Types[BigTyIdx];
1519           return BigTy.getSizeInBits() == 32;
1520         })
1521       // Try to widen to s16 first for small types.
1522       // TODO: Only do this on targets with legal s16 shifts
1523       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1524       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1525       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1526       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1527                            elementTypeIs(1, S16)),
1528                        changeTo(1, V2S16))
1529       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1530       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1531       // valid.
1532       .clampScalar(LitTyIdx, S32, S512)
1533       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1534       // Break up vectors with weird elements into scalars
1535       .fewerElementsIf(
1536         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1537         scalarize(0))
1538       .fewerElementsIf(
1539         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1540         scalarize(1))
1541       .clampScalar(BigTyIdx, S32, MaxScalar);
1542 
1543     if (Op == G_MERGE_VALUES) {
1544       Builder.widenScalarIf(
1545         // TODO: Use 16-bit shifts if legal for 8-bit values?
1546         [=](const LegalityQuery &Query) {
1547           const LLT Ty = Query.Types[LitTyIdx];
1548           return Ty.getSizeInBits() < 32;
1549         },
1550         changeTo(LitTyIdx, S32));
1551     }
1552 
1553     Builder.widenScalarIf(
1554       [=](const LegalityQuery &Query) {
1555         const LLT Ty = Query.Types[BigTyIdx];
1556         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1557           Ty.getSizeInBits() % 16 != 0;
1558       },
1559       [=](const LegalityQuery &Query) {
1560         // Pick the next power of 2, or a multiple of 64 over 128.
1561         // Whichever is smaller.
1562         const LLT &Ty = Query.Types[BigTyIdx];
1563         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1564         if (NewSizeInBits >= 256) {
1565           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1566           if (RoundedTo < NewSizeInBits)
1567             NewSizeInBits = RoundedTo;
1568         }
1569         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1570       })
1571       // Any vectors left are the wrong size. Scalarize them.
1572       .scalarize(0)
1573       .scalarize(1);
1574   }
1575 
1576   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1577   // RegBankSelect.
1578   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1579     .legalFor({{S32}, {S64}});
1580 
1581   if (ST.hasVOP3PInsts()) {
1582     SextInReg.lowerFor({{V2S16}})
1583       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1584       // get more vector shift opportunities, since we'll get those when
1585       // expanded.
1586       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1587   } else if (ST.has16BitInsts()) {
1588     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1589   } else {
1590     // Prefer to promote to s32 before lowering if we don't have 16-bit
1591     // shifts. This avoid a lot of intermediate truncate and extend operations.
1592     SextInReg.lowerFor({{S32}, {S64}});
1593   }
1594 
1595   SextInReg
1596     .scalarize(0)
1597     .clampScalar(0, S32, S64)
1598     .lower();
1599 
1600   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1601     .scalarize(0)
1602     .lower();
1603 
1604   // TODO: Only Try to form v2s16 with legal packed instructions.
1605   getActionDefinitionsBuilder(G_FSHR)
1606     .legalFor({{S32, S32}})
1607     .lowerFor({{V2S16, V2S16}})
1608     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1609     .scalarize(0)
1610     .lower();
1611 
1612   if (ST.hasVOP3PInsts()) {
1613     getActionDefinitionsBuilder(G_FSHL)
1614       .lowerFor({{V2S16, V2S16}})
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1616       .scalarize(0)
1617       .lower();
1618   } else {
1619     getActionDefinitionsBuilder(G_FSHL)
1620       .scalarize(0)
1621       .lower();
1622   }
1623 
1624   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1625     .legalFor({S64});
1626 
1627   getActionDefinitionsBuilder(G_FENCE)
1628     .alwaysLegal();
1629 
1630   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1631       .scalarize(0)
1632       .minScalar(0, S32)
1633       .lower();
1634 
1635   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1636       .legalFor({{S32, S32}, {S64, S32}})
1637       .clampScalar(1, S32, S32)
1638       .clampScalar(0, S32, S64)
1639       .widenScalarToNextPow2(0)
1640       .scalarize(0);
1641 
1642   getActionDefinitionsBuilder({
1643       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1644       G_FCOPYSIGN,
1645 
1646       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1647       G_ATOMICRMW_NAND,
1648       G_ATOMICRMW_FSUB,
1649       G_READ_REGISTER,
1650       G_WRITE_REGISTER,
1651 
1652       G_SADDO, G_SSUBO,
1653 
1654        // TODO: Implement
1655       G_FMINIMUM, G_FMAXIMUM}).lower();
1656 
1657   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1658       .lower();
1659 
1660   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1661         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1662         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1663     .unsupported();
1664 
1665   getLegacyLegalizerInfo().computeTables();
1666   verify(*ST.getInstrInfo());
1667 }
1668 
1669 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1670                                          MachineInstr &MI) const {
1671   MachineIRBuilder &B = Helper.MIRBuilder;
1672   MachineRegisterInfo &MRI = *B.getMRI();
1673 
1674   switch (MI.getOpcode()) {
1675   case TargetOpcode::G_ADDRSPACE_CAST:
1676     return legalizeAddrSpaceCast(MI, MRI, B);
1677   case TargetOpcode::G_FRINT:
1678     return legalizeFrint(MI, MRI, B);
1679   case TargetOpcode::G_FCEIL:
1680     return legalizeFceil(MI, MRI, B);
1681   case TargetOpcode::G_FREM:
1682     return legalizeFrem(MI, MRI, B);
1683   case TargetOpcode::G_INTRINSIC_TRUNC:
1684     return legalizeIntrinsicTrunc(MI, MRI, B);
1685   case TargetOpcode::G_SITOFP:
1686     return legalizeITOFP(MI, MRI, B, true);
1687   case TargetOpcode::G_UITOFP:
1688     return legalizeITOFP(MI, MRI, B, false);
1689   case TargetOpcode::G_FPTOSI:
1690     return legalizeFPTOI(MI, MRI, B, true);
1691   case TargetOpcode::G_FPTOUI:
1692     return legalizeFPTOI(MI, MRI, B, false);
1693   case TargetOpcode::G_FMINNUM:
1694   case TargetOpcode::G_FMAXNUM:
1695   case TargetOpcode::G_FMINNUM_IEEE:
1696   case TargetOpcode::G_FMAXNUM_IEEE:
1697     return legalizeMinNumMaxNum(Helper, MI);
1698   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1699     return legalizeExtractVectorElt(MI, MRI, B);
1700   case TargetOpcode::G_INSERT_VECTOR_ELT:
1701     return legalizeInsertVectorElt(MI, MRI, B);
1702   case TargetOpcode::G_SHUFFLE_VECTOR:
1703     return legalizeShuffleVector(MI, MRI, B);
1704   case TargetOpcode::G_FSIN:
1705   case TargetOpcode::G_FCOS:
1706     return legalizeSinCos(MI, MRI, B);
1707   case TargetOpcode::G_GLOBAL_VALUE:
1708     return legalizeGlobalValue(MI, MRI, B);
1709   case TargetOpcode::G_LOAD:
1710   case TargetOpcode::G_SEXTLOAD:
1711   case TargetOpcode::G_ZEXTLOAD:
1712     return legalizeLoad(Helper, MI);
1713   case TargetOpcode::G_FMAD:
1714     return legalizeFMad(MI, MRI, B);
1715   case TargetOpcode::G_FDIV:
1716     return legalizeFDIV(MI, MRI, B);
1717   case TargetOpcode::G_UDIV:
1718   case TargetOpcode::G_UREM:
1719   case TargetOpcode::G_UDIVREM:
1720     return legalizeUnsignedDIV_REM(MI, MRI, B);
1721   case TargetOpcode::G_SDIV:
1722   case TargetOpcode::G_SREM:
1723   case TargetOpcode::G_SDIVREM:
1724     return legalizeSignedDIV_REM(MI, MRI, B);
1725   case TargetOpcode::G_ATOMIC_CMPXCHG:
1726     return legalizeAtomicCmpXChg(MI, MRI, B);
1727   case TargetOpcode::G_FLOG:
1728     return legalizeFlog(MI, B, numbers::ln2f);
1729   case TargetOpcode::G_FLOG10:
1730     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1731   case TargetOpcode::G_FEXP:
1732     return legalizeFExp(MI, B);
1733   case TargetOpcode::G_FPOW:
1734     return legalizeFPow(MI, B);
1735   case TargetOpcode::G_FFLOOR:
1736     return legalizeFFloor(MI, MRI, B);
1737   case TargetOpcode::G_BUILD_VECTOR:
1738     return legalizeBuildVector(MI, MRI, B);
1739   case TargetOpcode::G_CTLZ:
1740   case TargetOpcode::G_CTTZ:
1741     return legalizeCTLZ_CTTZ(MI, MRI, B);
1742   default:
1743     return false;
1744   }
1745 
1746   llvm_unreachable("expected switch to return");
1747 }
1748 
1749 Register AMDGPULegalizerInfo::getSegmentAperture(
1750   unsigned AS,
1751   MachineRegisterInfo &MRI,
1752   MachineIRBuilder &B) const {
1753   MachineFunction &MF = B.getMF();
1754   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1755   const LLT S32 = LLT::scalar(32);
1756 
1757   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1758 
1759   if (ST.hasApertureRegs()) {
1760     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1761     // getreg.
1762     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1763         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1764         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1765     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1767         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1768     unsigned Encoding =
1769         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1770         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1771         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1772 
1773     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1774 
1775     B.buildInstr(AMDGPU::S_GETREG_B32)
1776       .addDef(GetReg)
1777       .addImm(Encoding);
1778     MRI.setType(GetReg, S32);
1779 
1780     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1781     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1782   }
1783 
1784   Register QueuePtr = MRI.createGenericVirtualRegister(
1785     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1786 
1787   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1788     return Register();
1789 
1790   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1791   // private_segment_aperture_base_hi.
1792   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1793 
1794   // TODO: can we be smarter about machine pointer info?
1795   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1796   MachineMemOperand *MMO = MF.getMachineMemOperand(
1797       PtrInfo,
1798       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1799           MachineMemOperand::MOInvariant,
1800       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1801 
1802   Register LoadAddr;
1803 
1804   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1805   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1806 }
1807 
1808 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1809   MachineInstr &MI, MachineRegisterInfo &MRI,
1810   MachineIRBuilder &B) const {
1811   MachineFunction &MF = B.getMF();
1812 
1813   const LLT S32 = LLT::scalar(32);
1814   Register Dst = MI.getOperand(0).getReg();
1815   Register Src = MI.getOperand(1).getReg();
1816 
1817   LLT DstTy = MRI.getType(Dst);
1818   LLT SrcTy = MRI.getType(Src);
1819   unsigned DestAS = DstTy.getAddressSpace();
1820   unsigned SrcAS = SrcTy.getAddressSpace();
1821 
1822   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1823   // vector element.
1824   assert(!DstTy.isVector());
1825 
1826   const AMDGPUTargetMachine &TM
1827     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1828 
1829   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1830     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1831     return true;
1832   }
1833 
1834   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1835     // Truncate.
1836     B.buildExtract(Dst, Src, 0);
1837     MI.eraseFromParent();
1838     return true;
1839   }
1840 
1841   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1842     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1843     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1844 
1845     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1846     // another. Merge operands are required to be the same type, but creating an
1847     // extra ptrtoint would be kind of pointless.
1848     auto HighAddr = B.buildConstant(
1849       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1850     B.buildMerge(Dst, {Src, HighAddr});
1851     MI.eraseFromParent();
1852     return true;
1853   }
1854 
1855   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1856     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1857            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1858     unsigned NullVal = TM.getNullPointerValue(DestAS);
1859 
1860     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1861     auto FlatNull = B.buildConstant(SrcTy, 0);
1862 
1863     // Extract low 32-bits of the pointer.
1864     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1865 
1866     auto CmpRes =
1867         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1868     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1869 
1870     MI.eraseFromParent();
1871     return true;
1872   }
1873 
1874   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1875     return false;
1876 
1877   if (!ST.hasFlatAddressSpace())
1878     return false;
1879 
1880   auto SegmentNull =
1881       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1882   auto FlatNull =
1883       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1884 
1885   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1886   if (!ApertureReg.isValid())
1887     return false;
1888 
1889   auto CmpRes =
1890       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1891 
1892   // Coerce the type of the low half of the result so we can use merge_values.
1893   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1894 
1895   // TODO: Should we allow mismatched types but matching sizes in merges to
1896   // avoid the ptrtoint?
1897   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1898   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1899 
1900   MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::legalizeFrint(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B) const {
1907   Register Src = MI.getOperand(1).getReg();
1908   LLT Ty = MRI.getType(Src);
1909   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1910 
1911   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1912   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1913 
1914   auto C1 = B.buildFConstant(Ty, C1Val);
1915   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1916 
1917   // TODO: Should this propagate fast-math-flags?
1918   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1919   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1920 
1921   auto C2 = B.buildFConstant(Ty, C2Val);
1922   auto Fabs = B.buildFAbs(Ty, Src);
1923 
1924   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1925   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1926   MI.eraseFromParent();
1927   return true;
1928 }
1929 
1930 bool AMDGPULegalizerInfo::legalizeFceil(
1931   MachineInstr &MI, MachineRegisterInfo &MRI,
1932   MachineIRBuilder &B) const {
1933 
1934   const LLT S1 = LLT::scalar(1);
1935   const LLT S64 = LLT::scalar(64);
1936 
1937   Register Src = MI.getOperand(1).getReg();
1938   assert(MRI.getType(Src) == S64);
1939 
1940   // result = trunc(src)
1941   // if (src > 0.0 && src != result)
1942   //   result += 1.0
1943 
1944   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1945 
1946   const auto Zero = B.buildFConstant(S64, 0.0);
1947   const auto One = B.buildFConstant(S64, 1.0);
1948   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1949   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1950   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1951   auto Add = B.buildSelect(S64, And, One, Zero);
1952 
1953   // TODO: Should this propagate fast-math-flags?
1954   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1955   return true;
1956 }
1957 
1958 bool AMDGPULegalizerInfo::legalizeFrem(
1959   MachineInstr &MI, MachineRegisterInfo &MRI,
1960   MachineIRBuilder &B) const {
1961     Register DstReg = MI.getOperand(0).getReg();
1962     Register Src0Reg = MI.getOperand(1).getReg();
1963     Register Src1Reg = MI.getOperand(2).getReg();
1964     auto Flags = MI.getFlags();
1965     LLT Ty = MRI.getType(DstReg);
1966 
1967     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1968     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1969     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1970     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1971     MI.eraseFromParent();
1972     return true;
1973 }
1974 
1975 static MachineInstrBuilder extractF64Exponent(Register Hi,
1976                                               MachineIRBuilder &B) {
1977   const unsigned FractBits = 52;
1978   const unsigned ExpBits = 11;
1979   LLT S32 = LLT::scalar(32);
1980 
1981   auto Const0 = B.buildConstant(S32, FractBits - 32);
1982   auto Const1 = B.buildConstant(S32, ExpBits);
1983 
1984   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1985     .addUse(Hi)
1986     .addUse(Const0.getReg(0))
1987     .addUse(Const1.getReg(0));
1988 
1989   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1990 }
1991 
1992 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1993   MachineInstr &MI, MachineRegisterInfo &MRI,
1994   MachineIRBuilder &B) const {
1995   const LLT S1 = LLT::scalar(1);
1996   const LLT S32 = LLT::scalar(32);
1997   const LLT S64 = LLT::scalar(64);
1998 
1999   Register Src = MI.getOperand(1).getReg();
2000   assert(MRI.getType(Src) == S64);
2001 
2002   // TODO: Should this use extract since the low half is unused?
2003   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004   Register Hi = Unmerge.getReg(1);
2005 
2006   // Extract the upper half, since this is where we will find the sign and
2007   // exponent.
2008   auto Exp = extractF64Exponent(Hi, B);
2009 
2010   const unsigned FractBits = 52;
2011 
2012   // Extract the sign bit.
2013   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2014   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2015 
2016   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2017 
2018   const auto Zero32 = B.buildConstant(S32, 0);
2019 
2020   // Extend back to 64-bits.
2021   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2022 
2023   auto Shr = B.buildAShr(S64, FractMask, Exp);
2024   auto Not = B.buildNot(S64, Shr);
2025   auto Tmp0 = B.buildAnd(S64, Src, Not);
2026   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2027 
2028   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2029   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2030 
2031   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2032   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2033   MI.eraseFromParent();
2034   return true;
2035 }
2036 
2037 bool AMDGPULegalizerInfo::legalizeITOFP(
2038   MachineInstr &MI, MachineRegisterInfo &MRI,
2039   MachineIRBuilder &B, bool Signed) const {
2040 
2041   Register Dst = MI.getOperand(0).getReg();
2042   Register Src = MI.getOperand(1).getReg();
2043 
2044   const LLT S64 = LLT::scalar(64);
2045   const LLT S32 = LLT::scalar(32);
2046 
2047   assert(MRI.getType(Src) == S64);
2048 
2049   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2050   auto ThirtyTwo = B.buildConstant(S32, 32);
2051 
2052   if (MRI.getType(Dst) == S64) {
2053     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2054                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2055 
2056     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2057     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2058                      .addUse(CvtHi.getReg(0))
2059                      .addUse(ThirtyTwo.getReg(0));
2060 
2061     // TODO: Should this propagate fast-math-flags?
2062     B.buildFAdd(Dst, LdExp, CvtLo);
2063     MI.eraseFromParent();
2064     return true;
2065   }
2066 
2067   assert(MRI.getType(Dst) == S32);
2068 
2069   auto One = B.buildConstant(S32, 1);
2070 
2071   MachineInstrBuilder ShAmt;
2072   if (Signed) {
2073     auto ThirtyOne = B.buildConstant(S32, 31);
2074     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2075     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2076     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2077     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2078                                /*HasSideEffects=*/false)
2079                   .addUse(Unmerge.getReg(1));
2080     auto LS2 = B.buildSub(S32, LS, One);
2081     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2082   } else
2083     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2084   auto Norm = B.buildShl(S64, Src, ShAmt);
2085   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2086   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2087   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2088   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2089   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2090   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2091                    /*HasSideEffects=*/false)
2092       .addUse(FVal.getReg(0))
2093       .addUse(Scale.getReg(0));
2094   MI.eraseFromParent();
2095   return true;
2096 }
2097 
2098 // TODO: Copied from DAG implementation. Verify logic and document how this
2099 // actually works.
2100 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2101                                         MachineRegisterInfo &MRI,
2102                                         MachineIRBuilder &B,
2103                                         bool Signed) const {
2104 
2105   Register Dst = MI.getOperand(0).getReg();
2106   Register Src = MI.getOperand(1).getReg();
2107 
2108   const LLT S64 = LLT::scalar(64);
2109   const LLT S32 = LLT::scalar(32);
2110 
2111   const LLT SrcLT = MRI.getType(Src);
2112   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2113 
2114   unsigned Flags = MI.getFlags();
2115 
2116   // The basic idea of converting a floating point number into a pair of 32-bit
2117   // integers is illustrated as follows:
2118   //
2119   //     tf := trunc(val);
2120   //    hif := floor(tf * 2^-32);
2121   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2122   //     hi := fptoi(hif);
2123   //     lo := fptoi(lof);
2124   //
2125   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2126   MachineInstrBuilder Sign;
2127   if (Signed && SrcLT == S32) {
2128     // However, a 32-bit floating point number has only 23 bits mantissa and
2129     // it's not enough to hold all the significant bits of `lof` if val is
2130     // negative. To avoid the loss of precision, We need to take the absolute
2131     // value after truncating and flip the result back based on the original
2132     // signedness.
2133     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2134     Trunc = B.buildFAbs(S32, Trunc, Flags);
2135   }
2136   MachineInstrBuilder K0, K1;
2137   if (SrcLT == S64) {
2138     K0 = B.buildFConstant(S64,
2139                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2140     K1 = B.buildFConstant(S64,
2141                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2142   } else {
2143     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2144     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2145   }
2146 
2147   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2148   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2149   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2150 
2151   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2152                                      : B.buildFPTOUI(S32, FloorMul);
2153   auto Lo = B.buildFPTOUI(S32, Fma);
2154 
2155   if (Signed && SrcLT == S32) {
2156     // Flip the result based on the signedness, which is either all 0s or 1s.
2157     Sign = B.buildMerge(S64, {Sign, Sign});
2158     // r := xor({lo, hi}, sign) - sign;
2159     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2160   } else
2161     B.buildMerge(Dst, {Lo, Hi});
2162   MI.eraseFromParent();
2163 
2164   return true;
2165 }
2166 
2167 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2168                                                MachineInstr &MI) const {
2169   MachineFunction &MF = Helper.MIRBuilder.getMF();
2170   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2171 
2172   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2173                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2174 
2175   // With ieee_mode disabled, the instructions have the correct behavior
2176   // already for G_FMINNUM/G_FMAXNUM
2177   if (!MFI->getMode().IEEE)
2178     return !IsIEEEOp;
2179 
2180   if (IsIEEEOp)
2181     return true;
2182 
2183   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2187   MachineInstr &MI, MachineRegisterInfo &MRI,
2188   MachineIRBuilder &B) const {
2189   // TODO: Should move some of this into LegalizerHelper.
2190 
2191   // TODO: Promote dynamic indexing of s16 to s32
2192 
2193   // FIXME: Artifact combiner probably should have replaced the truncated
2194   // constant before this, so we shouldn't need
2195   // getIConstantVRegValWithLookThrough.
2196   Optional<ValueAndVReg> MaybeIdxVal =
2197       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2198   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2199     return true;
2200   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2201 
2202   Register Dst = MI.getOperand(0).getReg();
2203   Register Vec = MI.getOperand(1).getReg();
2204 
2205   LLT VecTy = MRI.getType(Vec);
2206   LLT EltTy = VecTy.getElementType();
2207   assert(EltTy == MRI.getType(Dst));
2208 
2209   if (IdxVal < VecTy.getNumElements())
2210     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2211   else
2212     B.buildUndef(Dst);
2213 
2214   MI.eraseFromParent();
2215   return true;
2216 }
2217 
2218 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2219   MachineInstr &MI, MachineRegisterInfo &MRI,
2220   MachineIRBuilder &B) const {
2221   // TODO: Should move some of this into LegalizerHelper.
2222 
2223   // TODO: Promote dynamic indexing of s16 to s32
2224 
2225   // FIXME: Artifact combiner probably should have replaced the truncated
2226   // constant before this, so we shouldn't need
2227   // getIConstantVRegValWithLookThrough.
2228   Optional<ValueAndVReg> MaybeIdxVal =
2229       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2230   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231     return true;
2232 
2233   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2234   Register Dst = MI.getOperand(0).getReg();
2235   Register Vec = MI.getOperand(1).getReg();
2236   Register Ins = MI.getOperand(2).getReg();
2237 
2238   LLT VecTy = MRI.getType(Vec);
2239   LLT EltTy = VecTy.getElementType();
2240   assert(EltTy == MRI.getType(Ins));
2241 
2242   if (IdxVal < VecTy.getNumElements())
2243     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2244   else
2245     B.buildUndef(Dst);
2246 
2247   MI.eraseFromParent();
2248   return true;
2249 }
2250 
2251 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2252   MachineInstr &MI, MachineRegisterInfo &MRI,
2253   MachineIRBuilder &B) const {
2254   const LLT V2S16 = LLT::fixed_vector(2, 16);
2255 
2256   Register Dst = MI.getOperand(0).getReg();
2257   Register Src0 = MI.getOperand(1).getReg();
2258   LLT DstTy = MRI.getType(Dst);
2259   LLT SrcTy = MRI.getType(Src0);
2260 
2261   if (SrcTy == V2S16 && DstTy == V2S16 &&
2262       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2263     return true;
2264 
2265   MachineIRBuilder HelperBuilder(MI);
2266   GISelObserverWrapper DummyObserver;
2267   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2268   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2269 }
2270 
2271 bool AMDGPULegalizerInfo::legalizeSinCos(
2272   MachineInstr &MI, MachineRegisterInfo &MRI,
2273   MachineIRBuilder &B) const {
2274 
2275   Register DstReg = MI.getOperand(0).getReg();
2276   Register SrcReg = MI.getOperand(1).getReg();
2277   LLT Ty = MRI.getType(DstReg);
2278   unsigned Flags = MI.getFlags();
2279 
2280   Register TrigVal;
2281   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2282   if (ST.hasTrigReducedRange()) {
2283     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2284     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2285       .addUse(MulVal.getReg(0))
2286       .setMIFlags(Flags).getReg(0);
2287   } else
2288     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2289 
2290   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2291     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2292   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2293     .addUse(TrigVal)
2294     .setMIFlags(Flags);
2295   MI.eraseFromParent();
2296   return true;
2297 }
2298 
2299 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2300                                                   MachineIRBuilder &B,
2301                                                   const GlobalValue *GV,
2302                                                   int64_t Offset,
2303                                                   unsigned GAFlags) const {
2304   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2305   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2306   // to the following code sequence:
2307   //
2308   // For constant address space:
2309   //   s_getpc_b64 s[0:1]
2310   //   s_add_u32 s0, s0, $symbol
2311   //   s_addc_u32 s1, s1, 0
2312   //
2313   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2314   //   a fixup or relocation is emitted to replace $symbol with a literal
2315   //   constant, which is a pc-relative offset from the encoding of the $symbol
2316   //   operand to the global variable.
2317   //
2318   // For global address space:
2319   //   s_getpc_b64 s[0:1]
2320   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2321   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2322   //
2323   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2324   //   fixups or relocations are emitted to replace $symbol@*@lo and
2325   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2326   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2327   //   operand to the global variable.
2328   //
2329   // What we want here is an offset from the value returned by s_getpc
2330   // (which is the address of the s_add_u32 instruction) to the global
2331   // variable, but since the encoding of $symbol starts 4 bytes after the start
2332   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2333   // small. This requires us to add 4 to the global variable offset in order to
2334   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2335   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2336   // instruction.
2337 
2338   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2339 
2340   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2341     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2342 
2343   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2344     .addDef(PCReg);
2345 
2346   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2347   if (GAFlags == SIInstrInfo::MO_NONE)
2348     MIB.addImm(0);
2349   else
2350     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2351 
2352   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2353 
2354   if (PtrTy.getSizeInBits() == 32)
2355     B.buildExtract(DstReg, PCReg, 0);
2356   return true;
2357  }
2358 
2359 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2360   MachineInstr &MI, MachineRegisterInfo &MRI,
2361   MachineIRBuilder &B) const {
2362   Register DstReg = MI.getOperand(0).getReg();
2363   LLT Ty = MRI.getType(DstReg);
2364   unsigned AS = Ty.getAddressSpace();
2365 
2366   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2367   MachineFunction &MF = B.getMF();
2368   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2369 
2370   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2371     if (!MFI->isModuleEntryFunction() &&
2372         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2373       const Function &Fn = MF.getFunction();
2374       DiagnosticInfoUnsupported BadLDSDecl(
2375         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2376         DS_Warning);
2377       Fn.getContext().diagnose(BadLDSDecl);
2378 
2379       // We currently don't have a way to correctly allocate LDS objects that
2380       // aren't directly associated with a kernel. We do force inlining of
2381       // functions that use local objects. However, if these dead functions are
2382       // not eliminated, we don't want a compile time error. Just emit a warning
2383       // and a trap, since there should be no callable path here.
2384       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2385       B.buildUndef(DstReg);
2386       MI.eraseFromParent();
2387       return true;
2388     }
2389 
2390     // TODO: We could emit code to handle the initialization somewhere.
2391     // We ignore the initializer for now and legalize it to allow selection.
2392     // The initializer will anyway get errored out during assembly emission.
2393     const SITargetLowering *TLI = ST.getTargetLowering();
2394     if (!TLI->shouldUseLDSConstAddress(GV)) {
2395       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2396       return true; // Leave in place;
2397     }
2398 
2399     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2400       Type *Ty = GV->getValueType();
2401       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2402       // zero-sized type in other languages to declare the dynamic shared
2403       // memory which size is not known at the compile time. They will be
2404       // allocated by the runtime and placed directly after the static
2405       // allocated ones. They all share the same offset.
2406       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2407         // Adjust alignment for that dynamic shared memory array.
2408         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2409         LLT S32 = LLT::scalar(32);
2410         auto Sz =
2411             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2412         B.buildIntToPtr(DstReg, Sz);
2413         MI.eraseFromParent();
2414         return true;
2415       }
2416     }
2417 
2418     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2419                                                    *cast<GlobalVariable>(GV)));
2420     MI.eraseFromParent();
2421     return true;
2422   }
2423 
2424   const SITargetLowering *TLI = ST.getTargetLowering();
2425 
2426   if (TLI->shouldEmitFixup(GV)) {
2427     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2428     MI.eraseFromParent();
2429     return true;
2430   }
2431 
2432   if (TLI->shouldEmitPCReloc(GV)) {
2433     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2434     MI.eraseFromParent();
2435     return true;
2436   }
2437 
2438   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2439   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2440 
2441   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2442   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2443       MachinePointerInfo::getGOT(MF),
2444       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2445           MachineMemOperand::MOInvariant,
2446       LoadTy, Align(8));
2447 
2448   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2449 
2450   if (Ty.getSizeInBits() == 32) {
2451     // Truncate if this is a 32-bit constant address.
2452     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2453     B.buildExtract(DstReg, Load, 0);
2454   } else
2455     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2456 
2457   MI.eraseFromParent();
2458   return true;
2459 }
2460 
2461 static LLT widenToNextPowerOf2(LLT Ty) {
2462   if (Ty.isVector())
2463     return Ty.changeElementCount(
2464         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2465   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2466 }
2467 
2468 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2469                                        MachineInstr &MI) const {
2470   MachineIRBuilder &B = Helper.MIRBuilder;
2471   MachineRegisterInfo &MRI = *B.getMRI();
2472   GISelChangeObserver &Observer = Helper.Observer;
2473 
2474   Register PtrReg = MI.getOperand(1).getReg();
2475   LLT PtrTy = MRI.getType(PtrReg);
2476   unsigned AddrSpace = PtrTy.getAddressSpace();
2477 
2478   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2479     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2480     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2481     Observer.changingInstr(MI);
2482     MI.getOperand(1).setReg(Cast.getReg(0));
2483     Observer.changedInstr(MI);
2484     return true;
2485   }
2486 
2487   if (MI.getOpcode() != AMDGPU::G_LOAD)
2488     return false;
2489 
2490   Register ValReg = MI.getOperand(0).getReg();
2491   LLT ValTy = MRI.getType(ValReg);
2492 
2493   MachineMemOperand *MMO = *MI.memoperands_begin();
2494   const unsigned ValSize = ValTy.getSizeInBits();
2495   const LLT MemTy = MMO->getMemoryType();
2496   const Align MemAlign = MMO->getAlign();
2497   const unsigned MemSize = MemTy.getSizeInBits();
2498   const unsigned AlignInBits = 8 * MemAlign.value();
2499 
2500   // Widen non-power-of-2 loads to the alignment if needed
2501   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2502     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2503 
2504     // This was already the correct extending load result type, so just adjust
2505     // the memory type.
2506     if (WideMemSize == ValSize) {
2507       MachineFunction &MF = B.getMF();
2508 
2509       MachineMemOperand *WideMMO =
2510           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2511       Observer.changingInstr(MI);
2512       MI.setMemRefs(MF, {WideMMO});
2513       Observer.changedInstr(MI);
2514       return true;
2515     }
2516 
2517     // Don't bother handling edge case that should probably never be produced.
2518     if (ValSize > WideMemSize)
2519       return false;
2520 
2521     LLT WideTy = widenToNextPowerOf2(ValTy);
2522 
2523     Register WideLoad;
2524     if (!WideTy.isVector()) {
2525       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2526       B.buildTrunc(ValReg, WideLoad).getReg(0);
2527     } else {
2528       // Extract the subvector.
2529 
2530       if (isRegisterType(ValTy)) {
2531         // If this a case where G_EXTRACT is legal, use it.
2532         // (e.g. <3 x s32> -> <4 x s32>)
2533         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2534         B.buildExtract(ValReg, WideLoad, 0);
2535       } else {
2536         // For cases where the widened type isn't a nice register value, unmerge
2537         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2538         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2539         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2540         B.setInsertPt(B.getMBB(), MI.getIterator());
2541         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2542       }
2543     }
2544 
2545     MI.eraseFromParent();
2546     return true;
2547   }
2548 
2549   return false;
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizeFMad(
2553   MachineInstr &MI, MachineRegisterInfo &MRI,
2554   MachineIRBuilder &B) const {
2555   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2556   assert(Ty.isScalar());
2557 
2558   MachineFunction &MF = B.getMF();
2559   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2560 
2561   // TODO: Always legal with future ftz flag.
2562   // FIXME: Do we need just output?
2563   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2564     return true;
2565   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2566     return true;
2567 
2568   MachineIRBuilder HelperBuilder(MI);
2569   GISelObserverWrapper DummyObserver;
2570   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2571   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2572 }
2573 
2574 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2575   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2576   Register DstReg = MI.getOperand(0).getReg();
2577   Register PtrReg = MI.getOperand(1).getReg();
2578   Register CmpVal = MI.getOperand(2).getReg();
2579   Register NewVal = MI.getOperand(3).getReg();
2580 
2581   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2582          "this should not have been custom lowered");
2583 
2584   LLT ValTy = MRI.getType(CmpVal);
2585   LLT VecTy = LLT::fixed_vector(2, ValTy);
2586 
2587   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2588 
2589   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2590     .addDef(DstReg)
2591     .addUse(PtrReg)
2592     .addUse(PackedVal)
2593     .setMemRefs(MI.memoperands());
2594 
2595   MI.eraseFromParent();
2596   return true;
2597 }
2598 
2599 bool AMDGPULegalizerInfo::legalizeFlog(
2600   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2601   Register Dst = MI.getOperand(0).getReg();
2602   Register Src = MI.getOperand(1).getReg();
2603   LLT Ty = B.getMRI()->getType(Dst);
2604   unsigned Flags = MI.getFlags();
2605 
2606   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2607   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2608 
2609   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2610   MI.eraseFromParent();
2611   return true;
2612 }
2613 
2614 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2615                                        MachineIRBuilder &B) const {
2616   Register Dst = MI.getOperand(0).getReg();
2617   Register Src = MI.getOperand(1).getReg();
2618   unsigned Flags = MI.getFlags();
2619   LLT Ty = B.getMRI()->getType(Dst);
2620 
2621   auto K = B.buildFConstant(Ty, numbers::log2e);
2622   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2623   B.buildFExp2(Dst, Mul, Flags);
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2629                                        MachineIRBuilder &B) const {
2630   Register Dst = MI.getOperand(0).getReg();
2631   Register Src0 = MI.getOperand(1).getReg();
2632   Register Src1 = MI.getOperand(2).getReg();
2633   unsigned Flags = MI.getFlags();
2634   LLT Ty = B.getMRI()->getType(Dst);
2635   const LLT S16 = LLT::scalar(16);
2636   const LLT S32 = LLT::scalar(32);
2637 
2638   if (Ty == S32) {
2639     auto Log = B.buildFLog2(S32, Src0, Flags);
2640     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2641       .addUse(Log.getReg(0))
2642       .addUse(Src1)
2643       .setMIFlags(Flags);
2644     B.buildFExp2(Dst, Mul, Flags);
2645   } else if (Ty == S16) {
2646     // There's no f16 fmul_legacy, so we need to convert for it.
2647     auto Log = B.buildFLog2(S16, Src0, Flags);
2648     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2649     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2650     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2651       .addUse(Ext0.getReg(0))
2652       .addUse(Ext1.getReg(0))
2653       .setMIFlags(Flags);
2654 
2655     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2656   } else
2657     return false;
2658 
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 // Find a source register, ignoring any possible source modifiers.
2664 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2665   Register ModSrc = OrigSrc;
2666   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2667     ModSrc = SrcFNeg->getOperand(1).getReg();
2668     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2669       ModSrc = SrcFAbs->getOperand(1).getReg();
2670   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2671     ModSrc = SrcFAbs->getOperand(1).getReg();
2672   return ModSrc;
2673 }
2674 
2675 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2676                                          MachineRegisterInfo &MRI,
2677                                          MachineIRBuilder &B) const {
2678 
2679   const LLT S1 = LLT::scalar(1);
2680   const LLT S64 = LLT::scalar(64);
2681   Register Dst = MI.getOperand(0).getReg();
2682   Register OrigSrc = MI.getOperand(1).getReg();
2683   unsigned Flags = MI.getFlags();
2684   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2685          "this should not have been custom lowered");
2686 
2687   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2688   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2689   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2690   // V_FRACT bug is:
2691   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2692   //
2693   // Convert floor(x) to (x - fract(x))
2694 
2695   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2696     .addUse(OrigSrc)
2697     .setMIFlags(Flags);
2698 
2699   // Give source modifier matching some assistance before obscuring a foldable
2700   // pattern.
2701 
2702   // TODO: We can avoid the neg on the fract? The input sign to fract
2703   // shouldn't matter?
2704   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2705 
2706   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2707 
2708   Register Min = MRI.createGenericVirtualRegister(S64);
2709 
2710   // We don't need to concern ourselves with the snan handling difference, so
2711   // use the one which will directly select.
2712   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2713   if (MFI->getMode().IEEE)
2714     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2715   else
2716     B.buildFMinNum(Min, Fract, Const, Flags);
2717 
2718   Register CorrectedFract = Min;
2719   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2720     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2721     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2722   }
2723 
2724   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2725   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2726 
2727   MI.eraseFromParent();
2728   return true;
2729 }
2730 
2731 // Turn an illegal packed v2s16 build vector into bit operations.
2732 // TODO: This should probably be a bitcast action in LegalizerHelper.
2733 bool AMDGPULegalizerInfo::legalizeBuildVector(
2734   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2735   Register Dst = MI.getOperand(0).getReg();
2736   const LLT S32 = LLT::scalar(32);
2737   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2738 
2739   Register Src0 = MI.getOperand(1).getReg();
2740   Register Src1 = MI.getOperand(2).getReg();
2741   assert(MRI.getType(Src0) == LLT::scalar(16));
2742 
2743   auto Merge = B.buildMerge(S32, {Src0, Src1});
2744   B.buildBitcast(Dst, Merge);
2745 
2746   MI.eraseFromParent();
2747   return true;
2748 }
2749 
2750 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2751 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2752 // case with a single min instruction instead of a compare+select.
2753 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2754                                             MachineRegisterInfo &MRI,
2755                                             MachineIRBuilder &B) const {
2756   Register Dst = MI.getOperand(0).getReg();
2757   Register Src = MI.getOperand(1).getReg();
2758   LLT DstTy = MRI.getType(Dst);
2759   LLT SrcTy = MRI.getType(Src);
2760 
2761   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2762                         ? AMDGPU::G_AMDGPU_FFBH_U32
2763                         : AMDGPU::G_AMDGPU_FFBL_B32;
2764   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2765   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2766 
2767   MI.eraseFromParent();
2768   return true;
2769 }
2770 
2771 // Check that this is a G_XOR x, -1
2772 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2773   if (MI.getOpcode() != TargetOpcode::G_XOR)
2774     return false;
2775   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2776   return ConstVal && *ConstVal == -1;
2777 }
2778 
2779 // Return the use branch instruction, otherwise null if the usage is invalid.
2780 static MachineInstr *
2781 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2782                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2783   Register CondDef = MI.getOperand(0).getReg();
2784   if (!MRI.hasOneNonDBGUse(CondDef))
2785     return nullptr;
2786 
2787   MachineBasicBlock *Parent = MI.getParent();
2788   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2789 
2790   if (isNot(MRI, *UseMI)) {
2791     Register NegatedCond = UseMI->getOperand(0).getReg();
2792     if (!MRI.hasOneNonDBGUse(NegatedCond))
2793       return nullptr;
2794 
2795     // We're deleting the def of this value, so we need to remove it.
2796     eraseInstr(*UseMI, MRI);
2797 
2798     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2799     Negated = true;
2800   }
2801 
2802   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2803     return nullptr;
2804 
2805   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2806   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2807   if (Next == Parent->end()) {
2808     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2809     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2810       return nullptr;
2811     UncondBrTarget = &*NextMBB;
2812   } else {
2813     if (Next->getOpcode() != AMDGPU::G_BR)
2814       return nullptr;
2815     Br = &*Next;
2816     UncondBrTarget = Br->getOperand(0).getMBB();
2817   }
2818 
2819   return UseMI;
2820 }
2821 
2822 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2823                                          const ArgDescriptor *Arg,
2824                                          const TargetRegisterClass *ArgRC,
2825                                          LLT ArgTy) const {
2826   MCRegister SrcReg = Arg->getRegister();
2827   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2828   assert(DstReg.isVirtual() && "Virtual register expected");
2829 
2830   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2831                                              ArgTy);
2832   if (Arg->isMasked()) {
2833     // TODO: Should we try to emit this once in the entry block?
2834     const LLT S32 = LLT::scalar(32);
2835     const unsigned Mask = Arg->getMask();
2836     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2837 
2838     Register AndMaskSrc = LiveIn;
2839 
2840     if (Shift != 0) {
2841       auto ShiftAmt = B.buildConstant(S32, Shift);
2842       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2843     }
2844 
2845     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2846   } else {
2847     B.buildCopy(DstReg, LiveIn);
2848   }
2849 
2850   return true;
2851 }
2852 
2853 bool AMDGPULegalizerInfo::loadInputValue(
2854     Register DstReg, MachineIRBuilder &B,
2855     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2856   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2857   const ArgDescriptor *Arg;
2858   const TargetRegisterClass *ArgRC;
2859   LLT ArgTy;
2860   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2861 
2862   if (!Arg) {
2863     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2864       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2865       // case the pointer argument may be missing and we use null.
2866       B.buildConstant(DstReg, 0);
2867       return true;
2868     }
2869 
2870     // It's undefined behavior if a function marked with the amdgpu-no-*
2871     // attributes uses the corresponding intrinsic.
2872     B.buildUndef(DstReg);
2873     return true;
2874   }
2875 
2876   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2877     return false; // TODO: Handle these
2878   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2879 }
2880 
2881 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2882     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2883     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2884   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2885     return false;
2886 
2887   MI.eraseFromParent();
2888   return true;
2889 }
2890 
2891 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2892                                        MachineRegisterInfo &MRI,
2893                                        MachineIRBuilder &B) const {
2894   Register Dst = MI.getOperand(0).getReg();
2895   LLT DstTy = MRI.getType(Dst);
2896   LLT S16 = LLT::scalar(16);
2897   LLT S32 = LLT::scalar(32);
2898   LLT S64 = LLT::scalar(64);
2899 
2900   if (DstTy == S16)
2901     return legalizeFDIV16(MI, MRI, B);
2902   if (DstTy == S32)
2903     return legalizeFDIV32(MI, MRI, B);
2904   if (DstTy == S64)
2905     return legalizeFDIV64(MI, MRI, B);
2906 
2907   return false;
2908 }
2909 
2910 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2911                                                         Register DstDivReg,
2912                                                         Register DstRemReg,
2913                                                         Register X,
2914                                                         Register Y) const {
2915   const LLT S1 = LLT::scalar(1);
2916   const LLT S32 = LLT::scalar(32);
2917 
2918   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2919   // algorithm used here.
2920 
2921   // Initial estimate of inv(y).
2922   auto FloatY = B.buildUITOFP(S32, Y);
2923   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2924   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2925   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2926   auto Z = B.buildFPTOUI(S32, ScaledY);
2927 
2928   // One round of UNR.
2929   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2930   auto NegYZ = B.buildMul(S32, NegY, Z);
2931   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2932 
2933   // Quotient/remainder estimate.
2934   auto Q = B.buildUMulH(S32, X, Z);
2935   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2936 
2937   // First quotient/remainder refinement.
2938   auto One = B.buildConstant(S32, 1);
2939   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2940   if (DstDivReg)
2941     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2942   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2943 
2944   // Second quotient/remainder refinement.
2945   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2946   if (DstDivReg)
2947     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2948 
2949   if (DstRemReg)
2950     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2951 }
2952 
2953 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2954 //
2955 // Return lo, hi of result
2956 //
2957 // %cvt.lo = G_UITOFP Val.lo
2958 // %cvt.hi = G_UITOFP Val.hi
2959 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2960 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2961 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2962 // %mul2 = G_FMUL %mul1, 2**(-32)
2963 // %trunc = G_INTRINSIC_TRUNC %mul2
2964 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2965 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2966 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2967                                                        Register Val) {
2968   const LLT S32 = LLT::scalar(32);
2969   auto Unmerge = B.buildUnmerge(S32, Val);
2970 
2971   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2972   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2973 
2974   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2975                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2976 
2977   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2978   auto Mul1 =
2979       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2980 
2981   // 2**(-32)
2982   auto Mul2 =
2983       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2984   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2985 
2986   // -(2**32)
2987   auto Mad2 = B.buildFMAD(S32, Trunc,
2988                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2989 
2990   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2991   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2992 
2993   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2994 }
2995 
2996 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2997                                                         Register DstDivReg,
2998                                                         Register DstRemReg,
2999                                                         Register Numer,
3000                                                         Register Denom) const {
3001   const LLT S32 = LLT::scalar(32);
3002   const LLT S64 = LLT::scalar(64);
3003   const LLT S1 = LLT::scalar(1);
3004   Register RcpLo, RcpHi;
3005 
3006   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3007 
3008   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3009 
3010   auto Zero64 = B.buildConstant(S64, 0);
3011   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3012 
3013   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3014   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3015 
3016   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3017   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3018   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3019 
3020   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3021   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3022   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3023 
3024   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3025   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3026   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3027   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3028   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3029 
3030   auto Zero32 = B.buildConstant(S32, 0);
3031   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3032   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3033   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3034 
3035   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3036   Register NumerLo = UnmergeNumer.getReg(0);
3037   Register NumerHi = UnmergeNumer.getReg(1);
3038 
3039   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3040   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3041   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3042   Register Mul3_Lo = UnmergeMul3.getReg(0);
3043   Register Mul3_Hi = UnmergeMul3.getReg(1);
3044   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3045   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3046   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3047   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3048 
3049   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3050   Register DenomLo = UnmergeDenom.getReg(0);
3051   Register DenomHi = UnmergeDenom.getReg(1);
3052 
3053   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3054   auto C1 = B.buildSExt(S32, CmpHi);
3055 
3056   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3057   auto C2 = B.buildSExt(S32, CmpLo);
3058 
3059   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3060   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3061 
3062   // TODO: Here and below portions of the code can be enclosed into if/endif.
3063   // Currently control flow is unconditional and we have 4 selects after
3064   // potential endif to substitute PHIs.
3065 
3066   // if C3 != 0 ...
3067   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3068   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3069   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3070   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3071 
3072   auto One64 = B.buildConstant(S64, 1);
3073   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3074 
3075   auto C4 =
3076       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3077   auto C5 =
3078       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3079   auto C6 = B.buildSelect(
3080       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3081 
3082   // if (C6 != 0)
3083   auto Add4 = B.buildAdd(S64, Add3, One64);
3084   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3085 
3086   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3087   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3088   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3089 
3090   // endif C6
3091   // endif C3
3092 
3093   if (DstDivReg) {
3094     auto Sel1 = B.buildSelect(
3095         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3096     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3097                   Sel1, MulHi3);
3098   }
3099 
3100   if (DstRemReg) {
3101     auto Sel2 = B.buildSelect(
3102         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3103     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3104                   Sel2, Sub1);
3105   }
3106 }
3107 
3108 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3109                                                   MachineRegisterInfo &MRI,
3110                                                   MachineIRBuilder &B) const {
3111   Register DstDivReg, DstRemReg;
3112   switch (MI.getOpcode()) {
3113   default:
3114     llvm_unreachable("Unexpected opcode!");
3115   case AMDGPU::G_UDIV: {
3116     DstDivReg = MI.getOperand(0).getReg();
3117     break;
3118   }
3119   case AMDGPU::G_UREM: {
3120     DstRemReg = MI.getOperand(0).getReg();
3121     break;
3122   }
3123   case AMDGPU::G_UDIVREM: {
3124     DstDivReg = MI.getOperand(0).getReg();
3125     DstRemReg = MI.getOperand(1).getReg();
3126     break;
3127   }
3128   }
3129 
3130   const LLT S64 = LLT::scalar(64);
3131   const LLT S32 = LLT::scalar(32);
3132   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3133   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3134   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3135   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3136 
3137   if (Ty == S32)
3138     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3139   else if (Ty == S64)
3140     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3141   else
3142     return false;
3143 
3144   MI.eraseFromParent();
3145   return true;
3146 }
3147 
3148 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3149                                                 MachineRegisterInfo &MRI,
3150                                                 MachineIRBuilder &B) const {
3151   const LLT S64 = LLT::scalar(64);
3152   const LLT S32 = LLT::scalar(32);
3153 
3154   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3155   if (Ty != S32 && Ty != S64)
3156     return false;
3157 
3158   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3159   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3160   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3161 
3162   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3163   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3164   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3165 
3166   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3167   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3168 
3169   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3170   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3171 
3172   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3173   switch (MI.getOpcode()) {
3174   default:
3175     llvm_unreachable("Unexpected opcode!");
3176   case AMDGPU::G_SDIV: {
3177     DstDivReg = MI.getOperand(0).getReg();
3178     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3179     break;
3180   }
3181   case AMDGPU::G_SREM: {
3182     DstRemReg = MI.getOperand(0).getReg();
3183     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3184     break;
3185   }
3186   case AMDGPU::G_SDIVREM: {
3187     DstDivReg = MI.getOperand(0).getReg();
3188     DstRemReg = MI.getOperand(1).getReg();
3189     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3190     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3191     break;
3192   }
3193   }
3194 
3195   if (Ty == S32)
3196     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3197   else
3198     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3199 
3200   if (DstDivReg) {
3201     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3202     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3203     B.buildSub(DstDivReg, SignXor, Sign);
3204   }
3205 
3206   if (DstRemReg) {
3207     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3208     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3209     B.buildSub(DstRemReg, SignXor, Sign);
3210   }
3211 
3212   MI.eraseFromParent();
3213   return true;
3214 }
3215 
3216 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3217                                                  MachineRegisterInfo &MRI,
3218                                                  MachineIRBuilder &B) const {
3219   Register Res = MI.getOperand(0).getReg();
3220   Register LHS = MI.getOperand(1).getReg();
3221   Register RHS = MI.getOperand(2).getReg();
3222   uint16_t Flags = MI.getFlags();
3223   LLT ResTy = MRI.getType(Res);
3224 
3225   const MachineFunction &MF = B.getMF();
3226   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3227                             MI.getFlag(MachineInstr::FmAfn);
3228 
3229   if (!AllowInaccurateRcp)
3230     return false;
3231 
3232   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3233     // 1 / x -> RCP(x)
3234     if (CLHS->isExactlyValue(1.0)) {
3235       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3236         .addUse(RHS)
3237         .setMIFlags(Flags);
3238 
3239       MI.eraseFromParent();
3240       return true;
3241     }
3242 
3243     // -1 / x -> RCP( FNEG(x) )
3244     if (CLHS->isExactlyValue(-1.0)) {
3245       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3246       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3247         .addUse(FNeg.getReg(0))
3248         .setMIFlags(Flags);
3249 
3250       MI.eraseFromParent();
3251       return true;
3252     }
3253   }
3254 
3255   // x / y -> x * (1.0 / y)
3256   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3257     .addUse(RHS)
3258     .setMIFlags(Flags);
3259   B.buildFMul(Res, LHS, RCP, Flags);
3260 
3261   MI.eraseFromParent();
3262   return true;
3263 }
3264 
3265 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3266                                                    MachineRegisterInfo &MRI,
3267                                                    MachineIRBuilder &B) const {
3268   Register Res = MI.getOperand(0).getReg();
3269   Register X = MI.getOperand(1).getReg();
3270   Register Y = MI.getOperand(2).getReg();
3271   uint16_t Flags = MI.getFlags();
3272   LLT ResTy = MRI.getType(Res);
3273 
3274   const MachineFunction &MF = B.getMF();
3275   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3276                             MI.getFlag(MachineInstr::FmAfn);
3277 
3278   if (!AllowInaccurateRcp)
3279     return false;
3280 
3281   auto NegY = B.buildFNeg(ResTy, Y);
3282   auto One = B.buildFConstant(ResTy, 1.0);
3283 
3284   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3285     .addUse(Y)
3286     .setMIFlags(Flags);
3287 
3288   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3289   R = B.buildFMA(ResTy, Tmp0, R, R);
3290 
3291   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3292   R = B.buildFMA(ResTy, Tmp1, R, R);
3293 
3294   auto Ret = B.buildFMul(ResTy, X, R);
3295   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3296 
3297   B.buildFMA(Res, Tmp2, R, Ret);
3298   MI.eraseFromParent();
3299   return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3303                                          MachineRegisterInfo &MRI,
3304                                          MachineIRBuilder &B) const {
3305   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3306     return true;
3307 
3308   Register Res = MI.getOperand(0).getReg();
3309   Register LHS = MI.getOperand(1).getReg();
3310   Register RHS = MI.getOperand(2).getReg();
3311 
3312   uint16_t Flags = MI.getFlags();
3313 
3314   LLT S16 = LLT::scalar(16);
3315   LLT S32 = LLT::scalar(32);
3316 
3317   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3318   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3319 
3320   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3321     .addUse(RHSExt.getReg(0))
3322     .setMIFlags(Flags);
3323 
3324   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3325   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3326 
3327   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3328     .addUse(RDst.getReg(0))
3329     .addUse(RHS)
3330     .addUse(LHS)
3331     .setMIFlags(Flags);
3332 
3333   MI.eraseFromParent();
3334   return true;
3335 }
3336 
3337 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3338 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3339 static void toggleSPDenormMode(bool Enable,
3340                                MachineIRBuilder &B,
3341                                const GCNSubtarget &ST,
3342                                AMDGPU::SIModeRegisterDefaults Mode) {
3343   // Set SP denorm mode to this value.
3344   unsigned SPDenormMode =
3345     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3346 
3347   if (ST.hasDenormModeInst()) {
3348     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3349     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3350 
3351     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3352     B.buildInstr(AMDGPU::S_DENORM_MODE)
3353       .addImm(NewDenormModeValue);
3354 
3355   } else {
3356     // Select FP32 bit field in mode register.
3357     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3358                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3359                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3360 
3361     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3362       .addImm(SPDenormMode)
3363       .addImm(SPDenormModeBitField);
3364   }
3365 }
3366 
3367 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3368                                          MachineRegisterInfo &MRI,
3369                                          MachineIRBuilder &B) const {
3370   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3371     return true;
3372 
3373   Register Res = MI.getOperand(0).getReg();
3374   Register LHS = MI.getOperand(1).getReg();
3375   Register RHS = MI.getOperand(2).getReg();
3376   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3377   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3378 
3379   uint16_t Flags = MI.getFlags();
3380 
3381   LLT S32 = LLT::scalar(32);
3382   LLT S1 = LLT::scalar(1);
3383 
3384   auto One = B.buildFConstant(S32, 1.0f);
3385 
3386   auto DenominatorScaled =
3387     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3388       .addUse(LHS)
3389       .addUse(RHS)
3390       .addImm(0)
3391       .setMIFlags(Flags);
3392   auto NumeratorScaled =
3393     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3394       .addUse(LHS)
3395       .addUse(RHS)
3396       .addImm(1)
3397       .setMIFlags(Flags);
3398 
3399   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3400     .addUse(DenominatorScaled.getReg(0))
3401     .setMIFlags(Flags);
3402   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3403 
3404   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3405   // aren't modeled as reading it.
3406   if (!Mode.allFP32Denormals())
3407     toggleSPDenormMode(true, B, ST, Mode);
3408 
3409   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3410   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3411   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3412   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3413   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3414   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3415 
3416   if (!Mode.allFP32Denormals())
3417     toggleSPDenormMode(false, B, ST, Mode);
3418 
3419   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3420     .addUse(Fma4.getReg(0))
3421     .addUse(Fma1.getReg(0))
3422     .addUse(Fma3.getReg(0))
3423     .addUse(NumeratorScaled.getReg(1))
3424     .setMIFlags(Flags);
3425 
3426   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3427     .addUse(Fmas.getReg(0))
3428     .addUse(RHS)
3429     .addUse(LHS)
3430     .setMIFlags(Flags);
3431 
3432   MI.eraseFromParent();
3433   return true;
3434 }
3435 
3436 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3437                                          MachineRegisterInfo &MRI,
3438                                          MachineIRBuilder &B) const {
3439   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3440     return true;
3441 
3442   Register Res = MI.getOperand(0).getReg();
3443   Register LHS = MI.getOperand(1).getReg();
3444   Register RHS = MI.getOperand(2).getReg();
3445 
3446   uint16_t Flags = MI.getFlags();
3447 
3448   LLT S64 = LLT::scalar(64);
3449   LLT S1 = LLT::scalar(1);
3450 
3451   auto One = B.buildFConstant(S64, 1.0);
3452 
3453   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3454     .addUse(LHS)
3455     .addUse(RHS)
3456     .addImm(0)
3457     .setMIFlags(Flags);
3458 
3459   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3460 
3461   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3462     .addUse(DivScale0.getReg(0))
3463     .setMIFlags(Flags);
3464 
3465   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3466   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3467   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3468 
3469   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3470     .addUse(LHS)
3471     .addUse(RHS)
3472     .addImm(1)
3473     .setMIFlags(Flags);
3474 
3475   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3476   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3477   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3478 
3479   Register Scale;
3480   if (!ST.hasUsableDivScaleConditionOutput()) {
3481     // Workaround a hardware bug on SI where the condition output from div_scale
3482     // is not usable.
3483 
3484     LLT S32 = LLT::scalar(32);
3485 
3486     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3487     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3488     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3489     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3490 
3491     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3492                               Scale1Unmerge.getReg(1));
3493     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3494                               Scale0Unmerge.getReg(1));
3495     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3496   } else {
3497     Scale = DivScale1.getReg(1);
3498   }
3499 
3500   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3501     .addUse(Fma4.getReg(0))
3502     .addUse(Fma3.getReg(0))
3503     .addUse(Mul.getReg(0))
3504     .addUse(Scale)
3505     .setMIFlags(Flags);
3506 
3507   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3508     .addUse(Fmas.getReg(0))
3509     .addUse(RHS)
3510     .addUse(LHS)
3511     .setMIFlags(Flags);
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3518                                                  MachineRegisterInfo &MRI,
3519                                                  MachineIRBuilder &B) const {
3520   Register Res = MI.getOperand(0).getReg();
3521   Register LHS = MI.getOperand(2).getReg();
3522   Register RHS = MI.getOperand(3).getReg();
3523   uint16_t Flags = MI.getFlags();
3524 
3525   LLT S32 = LLT::scalar(32);
3526   LLT S1 = LLT::scalar(1);
3527 
3528   auto Abs = B.buildFAbs(S32, RHS, Flags);
3529   const APFloat C0Val(1.0f);
3530 
3531   auto C0 = B.buildConstant(S32, 0x6f800000);
3532   auto C1 = B.buildConstant(S32, 0x2f800000);
3533   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3534 
3535   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3536   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3537 
3538   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3539 
3540   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3541     .addUse(Mul0.getReg(0))
3542     .setMIFlags(Flags);
3543 
3544   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3545 
3546   B.buildFMul(Res, Sel, Mul1, Flags);
3547 
3548   MI.eraseFromParent();
3549   return true;
3550 }
3551 
3552 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3553 // FIXME: Why do we handle this one but not other removed instructions?
3554 //
3555 // Reciprocal square root.  The clamp prevents infinite results, clamping
3556 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3557 // +-max_float.
3558 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3559                                                     MachineRegisterInfo &MRI,
3560                                                     MachineIRBuilder &B) const {
3561   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3562     return true;
3563 
3564   Register Dst = MI.getOperand(0).getReg();
3565   Register Src = MI.getOperand(2).getReg();
3566   auto Flags = MI.getFlags();
3567 
3568   LLT Ty = MRI.getType(Dst);
3569 
3570   const fltSemantics *FltSemantics;
3571   if (Ty == LLT::scalar(32))
3572     FltSemantics = &APFloat::IEEEsingle();
3573   else if (Ty == LLT::scalar(64))
3574     FltSemantics = &APFloat::IEEEdouble();
3575   else
3576     return false;
3577 
3578   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3579     .addUse(Src)
3580     .setMIFlags(Flags);
3581 
3582   // We don't need to concern ourselves with the snan handling difference, since
3583   // the rsq quieted (or not) so use the one which will directly select.
3584   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3585   const bool UseIEEE = MFI->getMode().IEEE;
3586 
3587   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3588   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3589                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3590 
3591   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3592 
3593   if (UseIEEE)
3594     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3595   else
3596     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3597   MI.eraseFromParent();
3598   return true;
3599 }
3600 
3601 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3602   switch (IID) {
3603   case Intrinsic::amdgcn_ds_fadd:
3604     return AMDGPU::G_ATOMICRMW_FADD;
3605   case Intrinsic::amdgcn_ds_fmin:
3606     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3607   case Intrinsic::amdgcn_ds_fmax:
3608     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3609   default:
3610     llvm_unreachable("not a DS FP intrinsic");
3611   }
3612 }
3613 
3614 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3615                                                       MachineInstr &MI,
3616                                                       Intrinsic::ID IID) const {
3617   GISelChangeObserver &Observer = Helper.Observer;
3618   Observer.changingInstr(MI);
3619 
3620   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3621 
3622   // The remaining operands were used to set fields in the MemOperand on
3623   // construction.
3624   for (int I = 6; I > 3; --I)
3625     MI.RemoveOperand(I);
3626 
3627   MI.RemoveOperand(1); // Remove the intrinsic ID.
3628   Observer.changedInstr(MI);
3629   return true;
3630 }
3631 
3632 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3633                                             MachineRegisterInfo &MRI,
3634                                             MachineIRBuilder &B) const {
3635   uint64_t Offset =
3636     ST.getTargetLowering()->getImplicitParameterOffset(
3637       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3638   LLT DstTy = MRI.getType(DstReg);
3639   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3640 
3641   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3642   if (!loadInputValue(KernargPtrReg, B,
3643                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3644     return false;
3645 
3646   // FIXME: This should be nuw
3647   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3648   return true;
3649 }
3650 
3651 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3652                                                  MachineRegisterInfo &MRI,
3653                                                  MachineIRBuilder &B) const {
3654   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3655   if (!MFI->isEntryFunction()) {
3656     return legalizePreloadedArgIntrin(MI, MRI, B,
3657                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3658   }
3659 
3660   Register DstReg = MI.getOperand(0).getReg();
3661   if (!getImplicitArgPtr(DstReg, MRI, B))
3662     return false;
3663 
3664   MI.eraseFromParent();
3665   return true;
3666 }
3667 
3668 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3669                                               MachineRegisterInfo &MRI,
3670                                               MachineIRBuilder &B,
3671                                               unsigned AddrSpace) const {
3672   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3673   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3674   Register Hi32 = Unmerge.getReg(1);
3675 
3676   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3677   MI.eraseFromParent();
3678   return true;
3679 }
3680 
3681 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3682 // offset (the offset that is included in bounds checking and swizzling, to be
3683 // split between the instruction's voffset and immoffset fields) and soffset
3684 // (the offset that is excluded from bounds checking and swizzling, to go in
3685 // the instruction's soffset field).  This function takes the first kind of
3686 // offset and figures out how to split it between voffset and immoffset.
3687 std::pair<Register, unsigned>
3688 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3689                                         Register OrigOffset) const {
3690   const unsigned MaxImm = 4095;
3691   Register BaseReg;
3692   unsigned ImmOffset;
3693   const LLT S32 = LLT::scalar(32);
3694   MachineRegisterInfo &MRI = *B.getMRI();
3695 
3696   std::tie(BaseReg, ImmOffset) =
3697       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3698 
3699   // If BaseReg is a pointer, convert it to int.
3700   if (MRI.getType(BaseReg).isPointer())
3701     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3702 
3703   // If the immediate value is too big for the immoffset field, put the value
3704   // and -4096 into the immoffset field so that the value that is copied/added
3705   // for the voffset field is a multiple of 4096, and it stands more chance
3706   // of being CSEd with the copy/add for another similar load/store.
3707   // However, do not do that rounding down to a multiple of 4096 if that is a
3708   // negative number, as it appears to be illegal to have a negative offset
3709   // in the vgpr, even if adding the immediate offset makes it positive.
3710   unsigned Overflow = ImmOffset & ~MaxImm;
3711   ImmOffset -= Overflow;
3712   if ((int32_t)Overflow < 0) {
3713     Overflow += ImmOffset;
3714     ImmOffset = 0;
3715   }
3716 
3717   if (Overflow != 0) {
3718     if (!BaseReg) {
3719       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3720     } else {
3721       auto OverflowVal = B.buildConstant(S32, Overflow);
3722       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3723     }
3724   }
3725 
3726   if (!BaseReg)
3727     BaseReg = B.buildConstant(S32, 0).getReg(0);
3728 
3729   return std::make_pair(BaseReg, ImmOffset);
3730 }
3731 
3732 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3733 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3734                                           Register VOffset, Register SOffset,
3735                                           unsigned ImmOffset, Register VIndex,
3736                                           MachineRegisterInfo &MRI) const {
3737   Optional<ValueAndVReg> MaybeVOffsetVal =
3738       getIConstantVRegValWithLookThrough(VOffset, MRI);
3739   Optional<ValueAndVReg> MaybeSOffsetVal =
3740       getIConstantVRegValWithLookThrough(SOffset, MRI);
3741   Optional<ValueAndVReg> MaybeVIndexVal =
3742       getIConstantVRegValWithLookThrough(VIndex, MRI);
3743   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3744   // update the MMO with that offset. The stride is unknown so we can only do
3745   // this if VIndex is constant 0.
3746   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3747       MaybeVIndexVal->Value == 0) {
3748     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3749                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3750     MMO->setOffset(TotalOffset);
3751   } else {
3752     // We don't have a constant combined offset to use in the MMO. Give up.
3753     MMO->setValue((Value *)nullptr);
3754   }
3755 }
3756 
3757 /// Handle register layout difference for f16 images for some subtargets.
3758 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3759                                              MachineRegisterInfo &MRI,
3760                                              Register Reg,
3761                                              bool ImageStore) const {
3762   const LLT S16 = LLT::scalar(16);
3763   const LLT S32 = LLT::scalar(32);
3764   LLT StoreVT = MRI.getType(Reg);
3765   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3766 
3767   if (ST.hasUnpackedD16VMem()) {
3768     auto Unmerge = B.buildUnmerge(S16, Reg);
3769 
3770     SmallVector<Register, 4> WideRegs;
3771     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3772       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3773 
3774     int NumElts = StoreVT.getNumElements();
3775 
3776     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3777         .getReg(0);
3778   }
3779 
3780   if (ImageStore && ST.hasImageStoreD16Bug()) {
3781     if (StoreVT.getNumElements() == 2) {
3782       SmallVector<Register, 4> PackedRegs;
3783       Reg = B.buildBitcast(S32, Reg).getReg(0);
3784       PackedRegs.push_back(Reg);
3785       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3786       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3787           .getReg(0);
3788     }
3789 
3790     if (StoreVT.getNumElements() == 3) {
3791       SmallVector<Register, 4> PackedRegs;
3792       auto Unmerge = B.buildUnmerge(S16, Reg);
3793       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3794         PackedRegs.push_back(Unmerge.getReg(I));
3795       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3796       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3797       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3798     }
3799 
3800     if (StoreVT.getNumElements() == 4) {
3801       SmallVector<Register, 4> PackedRegs;
3802       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3803       auto Unmerge = B.buildUnmerge(S32, Reg);
3804       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3805         PackedRegs.push_back(Unmerge.getReg(I));
3806       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3807       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3808           .getReg(0);
3809     }
3810 
3811     llvm_unreachable("invalid data type");
3812   }
3813 
3814   return Reg;
3815 }
3816 
3817 Register AMDGPULegalizerInfo::fixStoreSourceType(
3818   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3819   MachineRegisterInfo *MRI = B.getMRI();
3820   LLT Ty = MRI->getType(VData);
3821 
3822   const LLT S16 = LLT::scalar(16);
3823 
3824   // Fixup illegal register types for i8 stores.
3825   if (Ty == LLT::scalar(8) || Ty == S16) {
3826     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3827     return AnyExt;
3828   }
3829 
3830   if (Ty.isVector()) {
3831     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3832       if (IsFormat)
3833         return handleD16VData(B, *MRI, VData);
3834     }
3835   }
3836 
3837   return VData;
3838 }
3839 
3840 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3841                                               MachineRegisterInfo &MRI,
3842                                               MachineIRBuilder &B,
3843                                               bool IsTyped,
3844                                               bool IsFormat) const {
3845   Register VData = MI.getOperand(1).getReg();
3846   LLT Ty = MRI.getType(VData);
3847   LLT EltTy = Ty.getScalarType();
3848   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3849   const LLT S32 = LLT::scalar(32);
3850 
3851   VData = fixStoreSourceType(B, VData, IsFormat);
3852   Register RSrc = MI.getOperand(2).getReg();
3853 
3854   MachineMemOperand *MMO = *MI.memoperands_begin();
3855   const int MemSize = MMO->getSize();
3856 
3857   unsigned ImmOffset;
3858 
3859   // The typed intrinsics add an immediate after the registers.
3860   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3861 
3862   // The struct intrinsic variants add one additional operand over raw.
3863   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3864   Register VIndex;
3865   int OpOffset = 0;
3866   if (HasVIndex) {
3867     VIndex = MI.getOperand(3).getReg();
3868     OpOffset = 1;
3869   } else {
3870     VIndex = B.buildConstant(S32, 0).getReg(0);
3871   }
3872 
3873   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3874   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3875 
3876   unsigned Format = 0;
3877   if (IsTyped) {
3878     Format = MI.getOperand(5 + OpOffset).getImm();
3879     ++OpOffset;
3880   }
3881 
3882   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3883 
3884   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3885   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3886 
3887   unsigned Opc;
3888   if (IsTyped) {
3889     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3890                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3891   } else if (IsFormat) {
3892     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3893                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3894   } else {
3895     switch (MemSize) {
3896     case 1:
3897       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3898       break;
3899     case 2:
3900       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3901       break;
3902     default:
3903       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3904       break;
3905     }
3906   }
3907 
3908   auto MIB = B.buildInstr(Opc)
3909     .addUse(VData)              // vdata
3910     .addUse(RSrc)               // rsrc
3911     .addUse(VIndex)             // vindex
3912     .addUse(VOffset)            // voffset
3913     .addUse(SOffset)            // soffset
3914     .addImm(ImmOffset);         // offset(imm)
3915 
3916   if (IsTyped)
3917     MIB.addImm(Format);
3918 
3919   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3920      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3921      .addMemOperand(MMO);
3922 
3923   MI.eraseFromParent();
3924   return true;
3925 }
3926 
3927 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3928                                              MachineRegisterInfo &MRI,
3929                                              MachineIRBuilder &B,
3930                                              bool IsFormat,
3931                                              bool IsTyped) const {
3932   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3933   MachineMemOperand *MMO = *MI.memoperands_begin();
3934   const LLT MemTy = MMO->getMemoryType();
3935   const LLT S32 = LLT::scalar(32);
3936 
3937   Register Dst = MI.getOperand(0).getReg();
3938   Register RSrc = MI.getOperand(2).getReg();
3939 
3940   // The typed intrinsics add an immediate after the registers.
3941   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3942 
3943   // The struct intrinsic variants add one additional operand over raw.
3944   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3945   Register VIndex;
3946   int OpOffset = 0;
3947   if (HasVIndex) {
3948     VIndex = MI.getOperand(3).getReg();
3949     OpOffset = 1;
3950   } else {
3951     VIndex = B.buildConstant(S32, 0).getReg(0);
3952   }
3953 
3954   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3955   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3956 
3957   unsigned Format = 0;
3958   if (IsTyped) {
3959     Format = MI.getOperand(5 + OpOffset).getImm();
3960     ++OpOffset;
3961   }
3962 
3963   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3964   unsigned ImmOffset;
3965 
3966   LLT Ty = MRI.getType(Dst);
3967   LLT EltTy = Ty.getScalarType();
3968   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3969   const bool Unpacked = ST.hasUnpackedD16VMem();
3970 
3971   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3972   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3973 
3974   unsigned Opc;
3975 
3976   if (IsTyped) {
3977     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3978                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3979   } else if (IsFormat) {
3980     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3981                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3982   } else {
3983     switch (MemTy.getSizeInBits()) {
3984     case 8:
3985       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3986       break;
3987     case 16:
3988       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3989       break;
3990     default:
3991       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3992       break;
3993     }
3994   }
3995 
3996   Register LoadDstReg;
3997 
3998   bool IsExtLoad =
3999       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4000   LLT UnpackedTy = Ty.changeElementSize(32);
4001 
4002   if (IsExtLoad)
4003     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4004   else if (Unpacked && IsD16 && Ty.isVector())
4005     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4006   else
4007     LoadDstReg = Dst;
4008 
4009   auto MIB = B.buildInstr(Opc)
4010     .addDef(LoadDstReg)         // vdata
4011     .addUse(RSrc)               // rsrc
4012     .addUse(VIndex)             // vindex
4013     .addUse(VOffset)            // voffset
4014     .addUse(SOffset)            // soffset
4015     .addImm(ImmOffset);         // offset(imm)
4016 
4017   if (IsTyped)
4018     MIB.addImm(Format);
4019 
4020   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4021      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4022      .addMemOperand(MMO);
4023 
4024   if (LoadDstReg != Dst) {
4025     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4026 
4027     // Widen result for extending loads was widened.
4028     if (IsExtLoad)
4029       B.buildTrunc(Dst, LoadDstReg);
4030     else {
4031       // Repack to original 16-bit vector result
4032       // FIXME: G_TRUNC should work, but legalization currently fails
4033       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4034       SmallVector<Register, 4> Repack;
4035       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4036         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4037       B.buildMerge(Dst, Repack);
4038     }
4039   }
4040 
4041   MI.eraseFromParent();
4042   return true;
4043 }
4044 
4045 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4046                                                MachineIRBuilder &B,
4047                                                bool IsInc) const {
4048   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4049                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4050   B.buildInstr(Opc)
4051     .addDef(MI.getOperand(0).getReg())
4052     .addUse(MI.getOperand(2).getReg())
4053     .addUse(MI.getOperand(3).getReg())
4054     .cloneMemRefs(MI);
4055   MI.eraseFromParent();
4056   return true;
4057 }
4058 
4059 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4060   switch (IntrID) {
4061   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4062   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4063     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4064   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4065   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4066     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4067   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4068   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4069     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4070   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4071   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4072     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4073   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4074   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4075     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4076   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4077   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4078     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4079   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4080   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4081     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4082   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4083   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4085   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4100   case Intrinsic::amdgcn_buffer_atomic_fadd:
4101   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4102   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4103     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4104   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4105   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4106     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4107   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4108   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4109     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4110   default:
4111     llvm_unreachable("unhandled atomic opcode");
4112   }
4113 }
4114 
4115 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4116                                                MachineIRBuilder &B,
4117                                                Intrinsic::ID IID) const {
4118   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4119                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4120   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4121 
4122   Register Dst;
4123 
4124   int OpOffset = 0;
4125   if (HasReturn) {
4126     // A few FP atomics do not support return values.
4127     Dst = MI.getOperand(0).getReg();
4128   } else {
4129     OpOffset = -1;
4130   }
4131 
4132   Register VData = MI.getOperand(2 + OpOffset).getReg();
4133   Register CmpVal;
4134 
4135   if (IsCmpSwap) {
4136     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4137     ++OpOffset;
4138   }
4139 
4140   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4141   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4142 
4143   // The struct intrinsic variants add one additional operand over raw.
4144   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4145   Register VIndex;
4146   if (HasVIndex) {
4147     VIndex = MI.getOperand(4 + OpOffset).getReg();
4148     ++OpOffset;
4149   } else {
4150     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4151   }
4152 
4153   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4154   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4155   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4156 
4157   MachineMemOperand *MMO = *MI.memoperands_begin();
4158 
4159   unsigned ImmOffset;
4160   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4161   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4162 
4163   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4164 
4165   if (HasReturn)
4166     MIB.addDef(Dst);
4167 
4168   MIB.addUse(VData); // vdata
4169 
4170   if (IsCmpSwap)
4171     MIB.addReg(CmpVal);
4172 
4173   MIB.addUse(RSrc)               // rsrc
4174      .addUse(VIndex)             // vindex
4175      .addUse(VOffset)            // voffset
4176      .addUse(SOffset)            // soffset
4177      .addImm(ImmOffset)          // offset(imm)
4178      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4179      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4180      .addMemOperand(MMO);
4181 
4182   MI.eraseFromParent();
4183   return true;
4184 }
4185 
4186 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4187 /// vector with s16 typed elements.
4188 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4189                                       SmallVectorImpl<Register> &PackedAddrs,
4190                                       unsigned ArgOffset,
4191                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4192                                       bool IsA16, bool IsG16) {
4193   const LLT S16 = LLT::scalar(16);
4194   const LLT V2S16 = LLT::fixed_vector(2, 16);
4195   auto EndIdx = Intr->VAddrEnd;
4196 
4197   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4198     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4199     if (!SrcOp.isReg())
4200       continue; // _L to _LZ may have eliminated this.
4201 
4202     Register AddrReg = SrcOp.getReg();
4203 
4204     if ((I < Intr->GradientStart) ||
4205         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4206         (I >= Intr->CoordStart && !IsA16)) {
4207       // Handle any gradient or coordinate operands that should not be packed
4208       if ((I < Intr->GradientStart) && IsA16 &&
4209           (B.getMRI()->getType(AddrReg) == S16)) {
4210         // Special handling of bias when A16 is on. Bias is of type half but
4211         // occupies full 32-bit.
4212         PackedAddrs.push_back(
4213             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4214                 .getReg(0));
4215       } else {
4216         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4217         PackedAddrs.push_back(AddrReg);
4218       }
4219     } else {
4220       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4221       // derivatives dx/dh and dx/dv are packed with undef.
4222       if (((I + 1) >= EndIdx) ||
4223           ((Intr->NumGradients / 2) % 2 == 1 &&
4224            (I == static_cast<unsigned>(Intr->GradientStart +
4225                                        (Intr->NumGradients / 2) - 1) ||
4226             I == static_cast<unsigned>(Intr->GradientStart +
4227                                        Intr->NumGradients - 1))) ||
4228           // Check for _L to _LZ optimization
4229           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4230         PackedAddrs.push_back(
4231             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4232                 .getReg(0));
4233       } else {
4234         PackedAddrs.push_back(
4235             B.buildBuildVector(
4236                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4237                 .getReg(0));
4238         ++I;
4239       }
4240     }
4241   }
4242 }
4243 
4244 /// Convert from separate vaddr components to a single vector address register,
4245 /// and replace the remaining operands with $noreg.
4246 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4247                                      int DimIdx, int NumVAddrs) {
4248   const LLT S32 = LLT::scalar(32);
4249 
4250   SmallVector<Register, 8> AddrRegs;
4251   for (int I = 0; I != NumVAddrs; ++I) {
4252     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4253     if (SrcOp.isReg()) {
4254       AddrRegs.push_back(SrcOp.getReg());
4255       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4256     }
4257   }
4258 
4259   int NumAddrRegs = AddrRegs.size();
4260   if (NumAddrRegs != 1) {
4261     // Above 8 elements round up to next power of 2 (i.e. 16).
4262     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4263       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4264       auto Undef = B.buildUndef(S32);
4265       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4266       NumAddrRegs = RoundedNumRegs;
4267     }
4268 
4269     auto VAddr =
4270         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4271     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4272   }
4273 
4274   for (int I = 1; I != NumVAddrs; ++I) {
4275     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4276     if (SrcOp.isReg())
4277       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4278   }
4279 }
4280 
4281 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4282 ///
4283 /// Depending on the subtarget, load/store with 16-bit element data need to be
4284 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4285 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4286 /// registers.
4287 ///
4288 /// We don't want to directly select image instructions just yet, but also want
4289 /// to exposes all register repacking to the legalizer/combiners. We also don't
4290 /// want a selected instrution entering RegBankSelect. In order to avoid
4291 /// defining a multitude of intermediate image instructions, directly hack on
4292 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4293 /// padding now unnecessary arguments with $noreg.
4294 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4295     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4296     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4297 
4298   const unsigned NumDefs = MI.getNumExplicitDefs();
4299   const unsigned ArgOffset = NumDefs + 1;
4300   bool IsTFE = NumDefs == 2;
4301   // We are only processing the operands of d16 image operations on subtargets
4302   // that use the unpacked register layout, or need to repack the TFE result.
4303 
4304   // TODO: Do we need to guard against already legalized intrinsics?
4305   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4306       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4307 
4308   MachineRegisterInfo *MRI = B.getMRI();
4309   const LLT S32 = LLT::scalar(32);
4310   const LLT S16 = LLT::scalar(16);
4311   const LLT V2S16 = LLT::fixed_vector(2, 16);
4312 
4313   unsigned DMask = 0;
4314 
4315   // Check for 16 bit addresses and pack if true.
4316   LLT GradTy =
4317       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4318   LLT AddrTy =
4319       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4320   const bool IsG16 = GradTy == S16;
4321   const bool IsA16 = AddrTy == S16;
4322 
4323   int DMaskLanes = 0;
4324   if (!BaseOpcode->Atomic) {
4325     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4326     if (BaseOpcode->Gather4) {
4327       DMaskLanes = 4;
4328     } else if (DMask != 0) {
4329       DMaskLanes = countPopulation(DMask);
4330     } else if (!IsTFE && !BaseOpcode->Store) {
4331       // If dmask is 0, this is a no-op load. This can be eliminated.
4332       B.buildUndef(MI.getOperand(0));
4333       MI.eraseFromParent();
4334       return true;
4335     }
4336   }
4337 
4338   Observer.changingInstr(MI);
4339   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4340 
4341   unsigned NewOpcode = NumDefs == 0 ?
4342     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4343 
4344   // Track that we legalized this
4345   MI.setDesc(B.getTII().get(NewOpcode));
4346 
4347   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
4348   // dmask to be at least 1 otherwise the instruction will fail
4349   if (IsTFE && DMask == 0) {
4350     DMask = 0x1;
4351     DMaskLanes = 1;
4352     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4353   }
4354 
4355   if (BaseOpcode->Atomic) {
4356     Register VData0 = MI.getOperand(2).getReg();
4357     LLT Ty = MRI->getType(VData0);
4358 
4359     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4360     if (Ty.isVector())
4361       return false;
4362 
4363     if (BaseOpcode->AtomicX2) {
4364       Register VData1 = MI.getOperand(3).getReg();
4365       // The two values are packed in one register.
4366       LLT PackedTy = LLT::fixed_vector(2, Ty);
4367       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4368       MI.getOperand(2).setReg(Concat.getReg(0));
4369       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4370     }
4371   }
4372 
4373   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4374 
4375   // Optimize _L to _LZ when _L is zero
4376   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4377           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4378     const ConstantFP *ConstantLod;
4379 
4380     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4381                  m_GFCst(ConstantLod))) {
4382       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4383         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4384         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4385             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4386                                                      Intr->Dim);
4387 
4388         // The starting indexes should remain in the same place.
4389         --CorrectedNumVAddrs;
4390 
4391         MI.getOperand(MI.getNumExplicitDefs())
4392             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4393         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4394         Intr = NewImageDimIntr;
4395       }
4396     }
4397   }
4398 
4399   // Optimize _mip away, when 'lod' is zero
4400   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4401     int64_t ConstantLod;
4402     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4403                  m_ICst(ConstantLod))) {
4404       if (ConstantLod == 0) {
4405         // TODO: Change intrinsic opcode and remove operand instead or replacing
4406         // it with 0, as the _L to _LZ handling is done above.
4407         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4408         --CorrectedNumVAddrs;
4409       }
4410     }
4411   }
4412 
4413   // Rewrite the addressing register layout before doing anything else.
4414   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4415     // 16 bit gradients are supported, but are tied to the A16 control
4416     // so both gradients and addresses must be 16 bit
4417     return false;
4418   }
4419 
4420   if (IsA16 && !ST.hasA16()) {
4421     // A16 not supported
4422     return false;
4423   }
4424 
4425   if (IsA16 || IsG16) {
4426     if (Intr->NumVAddrs > 1) {
4427       SmallVector<Register, 4> PackedRegs;
4428 
4429       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4430                                 IsG16);
4431 
4432       // See also below in the non-a16 branch
4433       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4434                           PackedRegs.size() <= ST.getNSAMaxSize();
4435 
4436       if (!UseNSA && PackedRegs.size() > 1) {
4437         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4438         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4439         PackedRegs[0] = Concat.getReg(0);
4440         PackedRegs.resize(1);
4441       }
4442 
4443       const unsigned NumPacked = PackedRegs.size();
4444       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4445         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4446         if (!SrcOp.isReg()) {
4447           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4448           continue;
4449         }
4450 
4451         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4452 
4453         if (I - Intr->VAddrStart < NumPacked)
4454           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4455         else
4456           SrcOp.setReg(AMDGPU::NoRegister);
4457       }
4458     }
4459   } else {
4460     // If the register allocator cannot place the address registers contiguously
4461     // without introducing moves, then using the non-sequential address encoding
4462     // is always preferable, since it saves VALU instructions and is usually a
4463     // wash in terms of code size or even better.
4464     //
4465     // However, we currently have no way of hinting to the register allocator
4466     // that MIMG addresses should be placed contiguously when it is possible to
4467     // do so, so force non-NSA for the common 2-address case as a heuristic.
4468     //
4469     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4470     // allocation when possible.
4471     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4472                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4473 
4474     if (!UseNSA && Intr->NumVAddrs > 1)
4475       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4476                                Intr->NumVAddrs);
4477   }
4478 
4479   int Flags = 0;
4480   if (IsA16)
4481     Flags |= 1;
4482   if (IsG16)
4483     Flags |= 2;
4484   MI.addOperand(MachineOperand::CreateImm(Flags));
4485 
4486   if (BaseOpcode->Store) { // No TFE for stores?
4487     // TODO: Handle dmask trim
4488     Register VData = MI.getOperand(1).getReg();
4489     LLT Ty = MRI->getType(VData);
4490     if (!Ty.isVector() || Ty.getElementType() != S16)
4491       return true;
4492 
4493     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4494     if (RepackedReg != VData) {
4495       MI.getOperand(1).setReg(RepackedReg);
4496     }
4497 
4498     return true;
4499   }
4500 
4501   Register DstReg = MI.getOperand(0).getReg();
4502   LLT Ty = MRI->getType(DstReg);
4503   const LLT EltTy = Ty.getScalarType();
4504   const bool IsD16 = Ty.getScalarType() == S16;
4505   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4506 
4507   // Confirm that the return type is large enough for the dmask specified
4508   if (NumElts < DMaskLanes)
4509     return false;
4510 
4511   if (NumElts > 4 || DMaskLanes > 4)
4512     return false;
4513 
4514   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4515   const LLT AdjustedTy =
4516       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4517 
4518   // The raw dword aligned data component of the load. The only legal cases
4519   // where this matters should be when using the packed D16 format, for
4520   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4521   LLT RoundedTy;
4522 
4523   // S32 vector to to cover all data, plus TFE result element.
4524   LLT TFETy;
4525 
4526   // Register type to use for each loaded component. Will be S32 or V2S16.
4527   LLT RegTy;
4528 
4529   if (IsD16 && ST.hasUnpackedD16VMem()) {
4530     RoundedTy =
4531         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4532     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4533     RegTy = S32;
4534   } else {
4535     unsigned EltSize = EltTy.getSizeInBits();
4536     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4537     unsigned RoundedSize = 32 * RoundedElts;
4538     RoundedTy = LLT::scalarOrVector(
4539         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4540     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4541     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4542   }
4543 
4544   // The return type does not need adjustment.
4545   // TODO: Should we change s16 case to s32 or <2 x s16>?
4546   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4547     return true;
4548 
4549   Register Dst1Reg;
4550 
4551   // Insert after the instruction.
4552   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4553 
4554   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4555   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4556   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4557   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4558 
4559   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4560 
4561   MI.getOperand(0).setReg(NewResultReg);
4562 
4563   // In the IR, TFE is supposed to be used with a 2 element struct return
4564   // type. The instruction really returns these two values in one contiguous
4565   // register, with one additional dword beyond the loaded data. Rewrite the
4566   // return type to use a single register result.
4567 
4568   if (IsTFE) {
4569     Dst1Reg = MI.getOperand(1).getReg();
4570     if (MRI->getType(Dst1Reg) != S32)
4571       return false;
4572 
4573     // TODO: Make sure the TFE operand bit is set.
4574     MI.RemoveOperand(1);
4575 
4576     // Handle the easy case that requires no repack instructions.
4577     if (Ty == S32) {
4578       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4579       return true;
4580     }
4581   }
4582 
4583   // Now figure out how to copy the new result register back into the old
4584   // result.
4585   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4586 
4587   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4588 
4589   if (ResultNumRegs == 1) {
4590     assert(!IsTFE);
4591     ResultRegs[0] = NewResultReg;
4592   } else {
4593     // We have to repack into a new vector of some kind.
4594     for (int I = 0; I != NumDataRegs; ++I)
4595       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4596     B.buildUnmerge(ResultRegs, NewResultReg);
4597 
4598     // Drop the final TFE element to get the data part. The TFE result is
4599     // directly written to the right place already.
4600     if (IsTFE)
4601       ResultRegs.resize(NumDataRegs);
4602   }
4603 
4604   // For an s16 scalar result, we form an s32 result with a truncate regardless
4605   // of packed vs. unpacked.
4606   if (IsD16 && !Ty.isVector()) {
4607     B.buildTrunc(DstReg, ResultRegs[0]);
4608     return true;
4609   }
4610 
4611   // Avoid a build/concat_vector of 1 entry.
4612   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4613     B.buildBitcast(DstReg, ResultRegs[0]);
4614     return true;
4615   }
4616 
4617   assert(Ty.isVector());
4618 
4619   if (IsD16) {
4620     // For packed D16 results with TFE enabled, all the data components are
4621     // S32. Cast back to the expected type.
4622     //
4623     // TODO: We don't really need to use load s32 elements. We would only need one
4624     // cast for the TFE result if a multiple of v2s16 was used.
4625     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4626       for (Register &Reg : ResultRegs)
4627         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4628     } else if (ST.hasUnpackedD16VMem()) {
4629       for (Register &Reg : ResultRegs)
4630         Reg = B.buildTrunc(S16, Reg).getReg(0);
4631     }
4632   }
4633 
4634   auto padWithUndef = [&](LLT Ty, int NumElts) {
4635     if (NumElts == 0)
4636       return;
4637     Register Undef = B.buildUndef(Ty).getReg(0);
4638     for (int I = 0; I != NumElts; ++I)
4639       ResultRegs.push_back(Undef);
4640   };
4641 
4642   // Pad out any elements eliminated due to the dmask.
4643   LLT ResTy = MRI->getType(ResultRegs[0]);
4644   if (!ResTy.isVector()) {
4645     padWithUndef(ResTy, NumElts - ResultRegs.size());
4646     B.buildBuildVector(DstReg, ResultRegs);
4647     return true;
4648   }
4649 
4650   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4651   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4652 
4653   // Deal with the one annoying legal case.
4654   const LLT V3S16 = LLT::fixed_vector(3, 16);
4655   if (Ty == V3S16) {
4656     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4657     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4658     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4659     return true;
4660   }
4661 
4662   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4663   B.buildConcatVectors(DstReg, ResultRegs);
4664   return true;
4665 }
4666 
4667 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4668   LegalizerHelper &Helper, MachineInstr &MI) const {
4669   MachineIRBuilder &B = Helper.MIRBuilder;
4670   GISelChangeObserver &Observer = Helper.Observer;
4671 
4672   Register Dst = MI.getOperand(0).getReg();
4673   LLT Ty = B.getMRI()->getType(Dst);
4674   unsigned Size = Ty.getSizeInBits();
4675   MachineFunction &MF = B.getMF();
4676 
4677   Observer.changingInstr(MI);
4678 
4679   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4680     Ty = getBitcastRegisterType(Ty);
4681     Helper.bitcastDst(MI, Ty, 0);
4682     Dst = MI.getOperand(0).getReg();
4683     B.setInsertPt(B.getMBB(), MI);
4684   }
4685 
4686   // FIXME: We don't really need this intermediate instruction. The intrinsic
4687   // should be fixed to have a memory operand. Since it's readnone, we're not
4688   // allowed to add one.
4689   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4690   MI.RemoveOperand(1); // Remove intrinsic ID
4691 
4692   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4693   // TODO: Should this use datalayout alignment?
4694   const unsigned MemSize = (Size + 7) / 8;
4695   const Align MemAlign(4);
4696   MachineMemOperand *MMO = MF.getMachineMemOperand(
4697       MachinePointerInfo(),
4698       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4699           MachineMemOperand::MOInvariant,
4700       MemSize, MemAlign);
4701   MI.addMemOperand(MF, MMO);
4702 
4703   // There are no 96-bit result scalar loads, but widening to 128-bit should
4704   // always be legal. We may need to restore this to a 96-bit result if it turns
4705   // out this needs to be converted to a vector load during RegBankSelect.
4706   if (!isPowerOf2_32(Size)) {
4707     if (Ty.isVector())
4708       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4709     else
4710       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4711   }
4712 
4713   Observer.changedInstr(MI);
4714   return true;
4715 }
4716 
4717 // TODO: Move to selection
4718 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4719                                                 MachineRegisterInfo &MRI,
4720                                                 MachineIRBuilder &B) const {
4721   if (!ST.isTrapHandlerEnabled() ||
4722       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4723     return legalizeTrapEndpgm(MI, MRI, B);
4724 
4725   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4726     switch (*HsaAbiVer) {
4727     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4728     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4729       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4730     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4731       return ST.supportsGetDoorbellID() ?
4732           legalizeTrapHsa(MI, MRI, B) :
4733           legalizeTrapHsaQueuePtr(MI, MRI, B);
4734     }
4735   }
4736 
4737   llvm_unreachable("Unknown trap handler");
4738 }
4739 
4740 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4741     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4742   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4743   MI.eraseFromParent();
4744   return true;
4745 }
4746 
4747 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4748     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4749   // Pass queue pointer to trap handler as input, and insert trap instruction
4750   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4751   Register LiveIn =
4752     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4753   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4754     return false;
4755 
4756   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4757   B.buildCopy(SGPR01, LiveIn);
4758   B.buildInstr(AMDGPU::S_TRAP)
4759       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4760       .addReg(SGPR01, RegState::Implicit);
4761 
4762   MI.eraseFromParent();
4763   return true;
4764 }
4765 
4766 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4767     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4768   B.buildInstr(AMDGPU::S_TRAP)
4769       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4770   MI.eraseFromParent();
4771   return true;
4772 }
4773 
4774 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4775     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4776   // Is non-HSA path or trap-handler disabled? Then, report a warning
4777   // accordingly
4778   if (!ST.isTrapHandlerEnabled() ||
4779       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4780     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4781                                      "debugtrap handler not supported",
4782                                      MI.getDebugLoc(), DS_Warning);
4783     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4784     Ctx.diagnose(NoTrap);
4785   } else {
4786     // Insert debug-trap instruction
4787     B.buildInstr(AMDGPU::S_TRAP)
4788         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4789   }
4790 
4791   MI.eraseFromParent();
4792   return true;
4793 }
4794 
4795 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4796                                                MachineIRBuilder &B) const {
4797   MachineRegisterInfo &MRI = *B.getMRI();
4798   const LLT S16 = LLT::scalar(16);
4799   const LLT S32 = LLT::scalar(32);
4800 
4801   Register DstReg = MI.getOperand(0).getReg();
4802   Register NodePtr = MI.getOperand(2).getReg();
4803   Register RayExtent = MI.getOperand(3).getReg();
4804   Register RayOrigin = MI.getOperand(4).getReg();
4805   Register RayDir = MI.getOperand(5).getReg();
4806   Register RayInvDir = MI.getOperand(6).getReg();
4807   Register TDescr = MI.getOperand(7).getReg();
4808 
4809   if (!ST.hasGFX10_AEncoding()) {
4810     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4811                                         "intrinsic not supported on subtarget",
4812                                         MI.getDebugLoc());
4813     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4814     return false;
4815   }
4816 
4817   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4818   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4819   const unsigned NumVDataDwords = 4;
4820   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4821   const bool UseNSA =
4822       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4823   const unsigned BaseOpcodes[2][2] = {
4824       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4825       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4826        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4827   int Opcode;
4828   if (UseNSA) {
4829     Opcode =
4830         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4831                               NumVDataDwords, NumVAddrDwords);
4832   } else {
4833     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4834                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4835                                    PowerOf2Ceil(NumVAddrDwords));
4836   }
4837   assert(Opcode != -1);
4838 
4839   SmallVector<Register, 12> Ops;
4840   if (Is64) {
4841     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4842     Ops.push_back(Unmerge.getReg(0));
4843     Ops.push_back(Unmerge.getReg(1));
4844   } else {
4845     Ops.push_back(NodePtr);
4846   }
4847   Ops.push_back(RayExtent);
4848 
4849   auto packLanes = [&Ops, &S32, &B](Register Src) {
4850     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4851     Ops.push_back(Unmerge.getReg(0));
4852     Ops.push_back(Unmerge.getReg(1));
4853     Ops.push_back(Unmerge.getReg(2));
4854   };
4855 
4856   packLanes(RayOrigin);
4857   if (IsA16) {
4858     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4859     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4860     Register R1 = MRI.createGenericVirtualRegister(S32);
4861     Register R2 = MRI.createGenericVirtualRegister(S32);
4862     Register R3 = MRI.createGenericVirtualRegister(S32);
4863     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4864     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4865     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4866     Ops.push_back(R1);
4867     Ops.push_back(R2);
4868     Ops.push_back(R3);
4869   } else {
4870     packLanes(RayDir);
4871     packLanes(RayInvDir);
4872   }
4873 
4874   if (!UseNSA) {
4875     // Build a single vector containing all the operands so far prepared.
4876     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4877     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4878     Ops.clear();
4879     Ops.push_back(MergedOps);
4880   }
4881 
4882   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4883     .addDef(DstReg)
4884     .addImm(Opcode);
4885 
4886   for (Register R : Ops) {
4887     MIB.addUse(R);
4888   }
4889 
4890   MIB.addUse(TDescr)
4891      .addImm(IsA16 ? 1 : 0)
4892      .cloneMemRefs(MI);
4893 
4894   MI.eraseFromParent();
4895   return true;
4896 }
4897 
4898 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4899                                             MachineInstr &MI) const {
4900   MachineIRBuilder &B = Helper.MIRBuilder;
4901   MachineRegisterInfo &MRI = *B.getMRI();
4902 
4903   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4904   auto IntrID = MI.getIntrinsicID();
4905   switch (IntrID) {
4906   case Intrinsic::amdgcn_if:
4907   case Intrinsic::amdgcn_else: {
4908     MachineInstr *Br = nullptr;
4909     MachineBasicBlock *UncondBrTarget = nullptr;
4910     bool Negated = false;
4911     if (MachineInstr *BrCond =
4912             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4913       const SIRegisterInfo *TRI
4914         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4915 
4916       Register Def = MI.getOperand(1).getReg();
4917       Register Use = MI.getOperand(3).getReg();
4918 
4919       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4920 
4921       if (Negated)
4922         std::swap(CondBrTarget, UncondBrTarget);
4923 
4924       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4925       if (IntrID == Intrinsic::amdgcn_if) {
4926         B.buildInstr(AMDGPU::SI_IF)
4927           .addDef(Def)
4928           .addUse(Use)
4929           .addMBB(UncondBrTarget);
4930       } else {
4931         B.buildInstr(AMDGPU::SI_ELSE)
4932             .addDef(Def)
4933             .addUse(Use)
4934             .addMBB(UncondBrTarget);
4935       }
4936 
4937       if (Br) {
4938         Br->getOperand(0).setMBB(CondBrTarget);
4939       } else {
4940         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4941         // since we're swapping branch targets it needs to be reinserted.
4942         // FIXME: IRTranslator should probably not do this
4943         B.buildBr(*CondBrTarget);
4944       }
4945 
4946       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4947       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4948       MI.eraseFromParent();
4949       BrCond->eraseFromParent();
4950       return true;
4951     }
4952 
4953     return false;
4954   }
4955   case Intrinsic::amdgcn_loop: {
4956     MachineInstr *Br = nullptr;
4957     MachineBasicBlock *UncondBrTarget = nullptr;
4958     bool Negated = false;
4959     if (MachineInstr *BrCond =
4960             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4961       const SIRegisterInfo *TRI
4962         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4963 
4964       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4965       Register Reg = MI.getOperand(2).getReg();
4966 
4967       if (Negated)
4968         std::swap(CondBrTarget, UncondBrTarget);
4969 
4970       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4971       B.buildInstr(AMDGPU::SI_LOOP)
4972         .addUse(Reg)
4973         .addMBB(UncondBrTarget);
4974 
4975       if (Br)
4976         Br->getOperand(0).setMBB(CondBrTarget);
4977       else
4978         B.buildBr(*CondBrTarget);
4979 
4980       MI.eraseFromParent();
4981       BrCond->eraseFromParent();
4982       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4983       return true;
4984     }
4985 
4986     return false;
4987   }
4988   case Intrinsic::amdgcn_kernarg_segment_ptr:
4989     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4990       // This only makes sense to call in a kernel, so just lower to null.
4991       B.buildConstant(MI.getOperand(0).getReg(), 0);
4992       MI.eraseFromParent();
4993       return true;
4994     }
4995 
4996     return legalizePreloadedArgIntrin(
4997       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4998   case Intrinsic::amdgcn_implicitarg_ptr:
4999     return legalizeImplicitArgPtr(MI, MRI, B);
5000   case Intrinsic::amdgcn_workitem_id_x:
5001     return legalizePreloadedArgIntrin(MI, MRI, B,
5002                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5003   case Intrinsic::amdgcn_workitem_id_y:
5004     return legalizePreloadedArgIntrin(MI, MRI, B,
5005                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5006   case Intrinsic::amdgcn_workitem_id_z:
5007     return legalizePreloadedArgIntrin(MI, MRI, B,
5008                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5009   case Intrinsic::amdgcn_workgroup_id_x:
5010     return legalizePreloadedArgIntrin(MI, MRI, B,
5011                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5012   case Intrinsic::amdgcn_workgroup_id_y:
5013     return legalizePreloadedArgIntrin(MI, MRI, B,
5014                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5015   case Intrinsic::amdgcn_workgroup_id_z:
5016     return legalizePreloadedArgIntrin(MI, MRI, B,
5017                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5018   case Intrinsic::amdgcn_dispatch_ptr:
5019     return legalizePreloadedArgIntrin(MI, MRI, B,
5020                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5021   case Intrinsic::amdgcn_queue_ptr:
5022     return legalizePreloadedArgIntrin(MI, MRI, B,
5023                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5024   case Intrinsic::amdgcn_implicit_buffer_ptr:
5025     return legalizePreloadedArgIntrin(
5026       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5027   case Intrinsic::amdgcn_dispatch_id:
5028     return legalizePreloadedArgIntrin(MI, MRI, B,
5029                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5030   case Intrinsic::amdgcn_fdiv_fast:
5031     return legalizeFDIVFastIntrin(MI, MRI, B);
5032   case Intrinsic::amdgcn_is_shared:
5033     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5034   case Intrinsic::amdgcn_is_private:
5035     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5036   case Intrinsic::amdgcn_wavefrontsize: {
5037     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5038     MI.eraseFromParent();
5039     return true;
5040   }
5041   case Intrinsic::amdgcn_s_buffer_load:
5042     return legalizeSBufferLoad(Helper, MI);
5043   case Intrinsic::amdgcn_raw_buffer_store:
5044   case Intrinsic::amdgcn_struct_buffer_store:
5045     return legalizeBufferStore(MI, MRI, B, false, false);
5046   case Intrinsic::amdgcn_raw_buffer_store_format:
5047   case Intrinsic::amdgcn_struct_buffer_store_format:
5048     return legalizeBufferStore(MI, MRI, B, false, true);
5049   case Intrinsic::amdgcn_raw_tbuffer_store:
5050   case Intrinsic::amdgcn_struct_tbuffer_store:
5051     return legalizeBufferStore(MI, MRI, B, true, true);
5052   case Intrinsic::amdgcn_raw_buffer_load:
5053   case Intrinsic::amdgcn_struct_buffer_load:
5054     return legalizeBufferLoad(MI, MRI, B, false, false);
5055   case Intrinsic::amdgcn_raw_buffer_load_format:
5056   case Intrinsic::amdgcn_struct_buffer_load_format:
5057     return legalizeBufferLoad(MI, MRI, B, true, false);
5058   case Intrinsic::amdgcn_raw_tbuffer_load:
5059   case Intrinsic::amdgcn_struct_tbuffer_load:
5060     return legalizeBufferLoad(MI, MRI, B, true, true);
5061   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5062   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5063   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5064   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5065   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5066   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5067   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5068   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5069   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5070   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5071   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5072   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5073   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5074   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5075   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5076   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5077   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5078   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5079   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5080   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5081   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5082   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5083   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5084   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5085   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5086   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5087   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5088   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5089   case Intrinsic::amdgcn_buffer_atomic_fadd:
5090   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5091   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5092   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5093   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5094     return legalizeBufferAtomic(MI, B, IntrID);
5095   case Intrinsic::amdgcn_atomic_inc:
5096     return legalizeAtomicIncDec(MI, B, true);
5097   case Intrinsic::amdgcn_atomic_dec:
5098     return legalizeAtomicIncDec(MI, B, false);
5099   case Intrinsic::trap:
5100     return legalizeTrapIntrinsic(MI, MRI, B);
5101   case Intrinsic::debugtrap:
5102     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5103   case Intrinsic::amdgcn_rsq_clamp:
5104     return legalizeRsqClampIntrinsic(MI, MRI, B);
5105   case Intrinsic::amdgcn_ds_fadd:
5106   case Intrinsic::amdgcn_ds_fmin:
5107   case Intrinsic::amdgcn_ds_fmax:
5108     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5109   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5110     return legalizeBVHIntrinsic(MI, B);
5111   default: {
5112     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5113             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5114       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5115     return true;
5116   }
5117   }
5118 
5119   return true;
5120 }
5121