1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
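// For example, <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.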
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
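// For example, s48 becomes s64 and s20 becomes s32.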
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
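// Break a wide vector down so that each resulting piece is at most 64 bits,
// e.g. <4 x s32> (128 bits) is reduced to <2 x s32> and <8 x s16> to <4 x s16>.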
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
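// For example, <3 x s16> (48 bits) becomes <4 x s16> (64 bits) and <5 x s8>
// (40 bits) becomes <8 x s8> (64 bits).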
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
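  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>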
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
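// For example s32, s64, v2s16, v4s16, v2s32 and v8s64 qualify, while v3s16
// (48 bits) and v5s16 (80 bits) do not.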
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
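// For example, an s64 result fed by an s16 extending load is first narrowed to
// an s32 result.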
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
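    // For example, a 256-bit global load fits within this limit, while a
    // 256-bit global store will generally be split into 128-bit pieces.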
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
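/// For example, an <8 x s8> load is handled as <2 x s32>, and a <6 x s16> load
/// as <3 x s32>.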
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
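/// For example, a 96-bit load with 128-bit alignment may be widened to a
/// 128-bit load on subtargets without dwordx3 load/store instructions.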
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816                     .lowerIf(typeIs(1, S1))
817                     .customFor({{S32, S64}, {S64, S64}});
818   if (ST.has16BitInsts())
819     IToFP.legalFor({{S16, S16}});
820   IToFP.clampScalar(1, S32, S64)
821        .minScalar(0, S32)
822        .scalarize(0)
823        .widenScalarToNextPow2(1);
824 
825   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827     .customFor({{S64, S32}, {S64, S64}})
828     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829   if (ST.has16BitInsts())
830     FPToI.legalFor({{S16, S16}});
831   else
832     FPToI.minScalar(1, S32);
833 
834   FPToI.minScalar(0, S32)
835        .widenScalarToNextPow2(0, 32)
836        .scalarize(0)
837        .lower();
838 
839   // Lower roundeven into G_FRINT
840   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841     .scalarize(0)
842     .lower();
843 
844   if (ST.has16BitInsts()) {
845     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846       .legalFor({S16, S32, S64})
847       .clampScalar(0, S16, S64)
848       .scalarize(0);
849   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851       .legalFor({S32, S64})
852       .clampScalar(0, S32, S64)
853       .scalarize(0);
854   } else {
855     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856       .legalFor({S32})
857       .customFor({S64})
858       .clampScalar(0, S32, S64)
859       .scalarize(0);
860   }
861 
862   getActionDefinitionsBuilder(G_PTR_ADD)
863     .legalIf(all(isPointer(0), sameSize(0, 1)))
864     .scalarize(0)
865     .scalarSameSizeAs(1, 0);
866 
867   getActionDefinitionsBuilder(G_PTRMASK)
868     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869     .scalarSameSizeAs(1, 0)
870     .scalarize(0);
871 
872   auto &CmpBuilder =
873     getActionDefinitionsBuilder(G_ICMP)
874     // The compare output type differs based on the register bank of the output,
875     // so make both s1 and s32 legal.
876     //
877     // Scalar compares producing output in scc will be promoted to s32, as that
878     // is the allocatable register type that will be needed for the copy from
879     // scc. This will be promoted during RegBankSelect, and we assume something
880     // before that won't try to use s32 result types.
881     //
882     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883     // bank.
884     .legalForCartesianProduct(
885       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886     .legalForCartesianProduct(
887       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888   if (ST.has16BitInsts()) {
889     CmpBuilder.legalFor({{S1, S16}});
890   }
891 
892   CmpBuilder
893     .widenScalarToNextPow2(1)
894     .clampScalar(1, S32, S64)
895     .scalarize(0)
896     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
897 
898   getActionDefinitionsBuilder(G_FCMP)
899     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900     .widenScalarToNextPow2(1)
901     .clampScalar(1, S32, S64)
902     .scalarize(0);
903 
904   // FIXME: fpow has a selection pattern that should move to custom lowering.
905   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906   if (ST.has16BitInsts())
907     Exp2Ops.legalFor({S32, S16});
908   else
909     Exp2Ops.legalFor({S32});
910   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911   Exp2Ops.scalarize(0);
912 
913   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914   if (ST.has16BitInsts())
915     ExpOps.customFor({{S32}, {S16}});
916   else
917     ExpOps.customFor({S32});
918   ExpOps.clampScalar(0, MinScalarFPTy, S32)
919         .scalarize(0);
920 
921   getActionDefinitionsBuilder(G_FPOWI)
922     .clampScalar(0, MinScalarFPTy, S32)
923     .lower();
924 
925   // The 64-bit versions produce 32-bit results, but only on the SALU.
926   getActionDefinitionsBuilder(G_CTPOP)
927     .legalFor({{S32, S32}, {S32, S64}})
928     .clampScalar(0, S32, S32)
929     .clampScalar(1, S32, S64)
930     .scalarize(0)
931     .widenScalarToNextPow2(0, 32)
932     .widenScalarToNextPow2(1, 32);
933 
934   // The hardware instructions return a different result on 0 than the generic
935   // instructions expect. The hardware produces -1, but these produce the
936   // bitwidth.
937   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938     .scalarize(0)
939     .clampScalar(0, S32, S32)
940     .clampScalar(1, S32, S64)
941     .widenScalarToNextPow2(0, 32)
942     .widenScalarToNextPow2(1, 32)
943     .custom();
944 
945   // The 64-bit versions produce 32-bit results, but only on the SALU.
946   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947     .legalFor({{S32, S32}, {S32, S64}})
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .scalarize(0)
951     .widenScalarToNextPow2(0, 32)
952     .widenScalarToNextPow2(1, 32);
953 
954   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955   // RegBankSelect.
956   getActionDefinitionsBuilder(G_BITREVERSE)
957     .legalFor({S32, S64})
958     .clampScalar(0, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0);
961 
962   if (ST.has16BitInsts()) {
963     getActionDefinitionsBuilder(G_BSWAP)
964       .legalFor({S16, S32, V2S16})
965       .clampMaxNumElements(0, S16, 2)
966       // FIXME: Fixing non-power-of-2 before clamp is workaround for
967       // narrowScalar limitation.
968       .widenScalarToNextPow2(0)
969       .clampScalar(0, S16, S32)
970       .scalarize(0);
971 
972     if (ST.hasVOP3PInsts()) {
973       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974         .legalFor({S32, S16, V2S16})
975         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976         .clampMaxNumElements(0, S16, 2)
977         .minScalar(0, S16)
978         .widenScalarToNextPow2(0)
979         .scalarize(0)
980         .lower();
981     } else {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16})
984         .widenScalarToNextPow2(0)
985         .minScalar(0, S16)
986         .scalarize(0)
987         .lower();
988     }
989   } else {
990     // TODO: Should have same legality without v_perm_b32
991     getActionDefinitionsBuilder(G_BSWAP)
992       .legalFor({S32})
993       .lowerIf(scalarNarrowerThan(0, 32))
994       // FIXME: Fixing non-power-of-2 before clamp is workaround for
995       // narrowScalar limitation.
996       .widenScalarToNextPow2(0)
997       .maxScalar(0, S32)
998       .scalarize(0)
999       .lower();
1000 
1001     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002       .legalFor({S32})
1003       .minScalar(0, S32)
1004       .widenScalarToNextPow2(0)
1005       .scalarize(0)
1006       .lower();
1007   }
1008 
1009   getActionDefinitionsBuilder(G_INTTOPTR)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       })
1020     .narrowScalarIf(largerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       });
1024 
1025   getActionDefinitionsBuilder(G_PTRTOINT)
1026     // List the common cases
1027     .legalForCartesianProduct(AddrSpaces64, {S64})
1028     .legalForCartesianProduct(AddrSpaces32, {S32})
1029     .scalarize(0)
1030     // Accept any address space as long as the size matches
1031     .legalIf(sameSize(0, 1))
1032     .widenScalarIf(smallerThan(0, 1),
1033       [](const LegalityQuery &Query) {
1034         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035       })
1036     .narrowScalarIf(
1037       largerThan(0, 1),
1038       [](const LegalityQuery &Query) {
1039         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040       });
1041 
1042   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043     .scalarize(0)
1044     .custom();
1045 
1046   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047                                     bool IsLoad) -> bool {
1048     const LLT DstTy = Query.Types[0];
1049 
1050     // Split vector extloads.
1051     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053 
1054     if (MemSize < DstTy.getSizeInBits())
1055       MemSize = std::max(MemSize, AlignBits);
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     if (AlignBits < MemSize) {
1078       const SITargetLowering *TLI = ST.getTargetLowering();
1079       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080                                                       Align(AlignBits / 8));
1081     }
1082 
1083     return false;
1084   };
1085 
1086   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089 
1090   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091   // LDS
1092   // TODO: Unsupported flat for SI.
1093 
1094   for (unsigned Op : {G_LOAD, G_STORE}) {
1095     const bool IsStore = Op == G_STORE;
1096 
1097     auto &Actions = getActionDefinitionsBuilder(Op);
1098     // Explicitly list some common cases.
1099     // TODO: Does this help compile time at all?
1100     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103                                       {S64, GlobalPtr, S64, GlobalAlign32},
1104                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106                                       {S32, GlobalPtr, S8, GlobalAlign8},
1107                                       {S32, GlobalPtr, S16, GlobalAlign16},
1108 
1109                                       {S32, LocalPtr, S32, 32},
1110                                       {S64, LocalPtr, S64, 32},
1111                                       {V2S32, LocalPtr, V2S32, 32},
1112                                       {S32, LocalPtr, S8, 8},
1113                                       {S32, LocalPtr, S16, 16},
1114                                       {V2S16, LocalPtr, S32, 32},
1115 
1116                                       {S32, PrivatePtr, S32, 32},
1117                                       {S32, PrivatePtr, S8, 8},
1118                                       {S32, PrivatePtr, S16, 16},
1119                                       {V2S16, PrivatePtr, S32, 32},
1120 
1121                                       {S32, ConstantPtr, S32, GlobalAlign32},
1122                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124                                       {S64, ConstantPtr, S64, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126     Actions.legalIf(
1127       [=](const LegalityQuery &Query) -> bool {
1128         return isLoadStoreLegal(ST, Query);
1129       });
1130 
1131     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132     // 64-bits.
1133     //
1134     // TODO: Should generalize bitcast action into coerce, which will also cover
1135     // inserting addrspacecasts.
1136     Actions.customIf(typeIs(1, Constant32Ptr));
1137 
1138     // Turn any illegal element vectors into something easier to deal
1139     // with. These will ultimately produce 32-bit scalar shifts to extract the
1140     // parts anyway.
1141     //
1142     // For odd 16-bit element vectors, prefer to split those into pieces with
1143     // 16-bit vector parts.
1144     Actions.bitcastIf(
1145       [=](const LegalityQuery &Query) -> bool {
1146         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147                                           Query.MMODescrs[0].MemoryTy);
1148       }, bitcastToRegisterType(0));
1149 
1150     if (!IsStore) {
1151       // Widen suitably aligned loads by loading extra bytes. The standard
1152       // legalization actions can't properly express widening memory operands.
1153       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154         return shouldWidenLoad(ST, Query, G_LOAD);
1155       });
1156     }
1157 
1158     // FIXME: load/store narrowing should be moved to lower action
1159     Actions
1160         .narrowScalarIf(
1161             [=](const LegalityQuery &Query) -> bool {
1162               return !Query.Types[0].isVector() &&
1163                      needToSplitMemOp(Query, Op == G_LOAD);
1164             },
1165             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166               const LLT DstTy = Query.Types[0];
1167               const LLT PtrTy = Query.Types[1];
1168 
1169               const unsigned DstSize = DstTy.getSizeInBits();
1170               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171 
1172               // Split extloads.
1173               if (DstSize > MemSize)
1174                 return std::make_pair(0, LLT::scalar(MemSize));
1175 
1176               if (!isPowerOf2_32(DstSize)) {
1177                 // We're probably decomposing an odd sized store. Try to split
1178                 // to the widest type. TODO: Account for alignment. As-is it
1179                 // should be OK, since the new parts will be further legalized.
1180                 unsigned FloorSize = PowerOf2Floor(DstSize);
1181                 return std::make_pair(0, LLT::scalar(FloorSize));
1182               }
1183 
1184               if (DstSize > 32 && (DstSize % 32 != 0)) {
1185                 // FIXME: Need a way to specify non-extload of larger size if
1186                 // suitably aligned.
1187                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188               }
1189 
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193               if (MemSize > MaxSize)
1194                 return std::make_pair(0, LLT::scalar(MaxSize));
1195 
1196               unsigned Align = Query.MMODescrs[0].AlignInBits;
1197               return std::make_pair(0, LLT::scalar(Align));
1198             })
1199         .fewerElementsIf(
1200             [=](const LegalityQuery &Query) -> bool {
1201               return Query.Types[0].isVector() &&
1202                      needToSplitMemOp(Query, Op == G_LOAD);
1203             },
1204             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205               const LLT DstTy = Query.Types[0];
1206               const LLT PtrTy = Query.Types[1];
1207 
1208               LLT EltTy = DstTy.getElementType();
1209               unsigned MaxSize = maxSizeForAddrSpace(ST,
1210                                                      PtrTy.getAddressSpace(),
1211                                                      Op == G_LOAD);
1212 
1213               // FIXME: Handle widened to power of 2 results better. This ends
1214               // up scalarizing.
1215               // FIXME: 3 element stores scalarized on SI
1216 
1217               // Split if it's too large for the address space.
1218               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219               if (MemSize > MaxSize) {
1220                 unsigned NumElts = DstTy.getNumElements();
1221                 unsigned EltSize = EltTy.getSizeInBits();
1222 
1223                 if (MaxSize % EltSize == 0) {
1224                   return std::make_pair(
1225                       0, LLT::scalarOrVector(
1226                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227                 }
1228 
1229                 unsigned NumPieces = MemSize / MaxSize;
1230 
1231                 // FIXME: Refine when odd breakdowns handled
1232                 // The scalars will need to be re-legalized.
1233                 if (NumPieces == 1 || NumPieces >= NumElts ||
1234                     NumElts % NumPieces != 0)
1235                   return std::make_pair(0, EltTy);
1236 
1237                 return std::make_pair(
1238                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239               }
1240 
1241               // FIXME: We could probably handle weird extending loads better.
1242               if (DstTy.getSizeInBits() > MemSize)
1243                 return std::make_pair(0, EltTy);
1244 
1245               unsigned EltSize = EltTy.getSizeInBits();
1246               unsigned DstSize = DstTy.getSizeInBits();
1247               if (!isPowerOf2_32(DstSize)) {
1248                 // We're probably decomposing an odd sized store. Try to split
1249                 // to the widest type. TODO: Account for alignment. As-is it
1250                 // should be OK, since the new parts will be further legalized.
1251                 unsigned FloorSize = PowerOf2Floor(DstSize);
1252                 return std::make_pair(
1253                     0, LLT::scalarOrVector(
1254                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255               }
1256 
1257               // Need to split because of alignment.
1258               unsigned Align = Query.MMODescrs[0].AlignInBits;
1259               if (EltSize > Align &&
1260                   (EltSize / Align < DstTy.getNumElements())) {
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1263               }
1264 
1265               // May need relegalization for the scalars.
1266               return std::make_pair(0, EltTy);
1267             })
1268     .minScalar(0, S32)
1269     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270     .widenScalarToNextPow2(0)
1271     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272     .lower();
1273   }
1274 
1275   // FIXME: Unaligned accesses not lowered.
1276   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278                                                   {S32, GlobalPtr, S16, 2 * 8},
1279                                                   {S32, LocalPtr, S8, 8},
1280                                                   {S32, LocalPtr, S16, 16},
1281                                                   {S32, PrivatePtr, S8, 8},
1282                                                   {S32, PrivatePtr, S16, 16},
1283                                                   {S32, ConstantPtr, S8, 8},
1284                                                   {S32, ConstantPtr, S16, 2 * 8}})
1285                        .legalIf(
1286                          [=](const LegalityQuery &Query) -> bool {
1287                            return isLoadStoreLegal(ST, Query);
1288                          });
1289 
1290   if (ST.hasFlatAddressSpace()) {
1291     ExtLoads.legalForTypesWithMemDesc(
1292         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293   }
1294 
1295   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296   // 64-bits.
1297   //
1298   // TODO: Should generalize bitcast action into coerce, which will also cover
1299   // inserting addrspacecasts.
1300   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301 
1302   ExtLoads.clampScalar(0, S32, S32)
1303           .widenScalarToNextPow2(0)
1304           .lower();
1305 
1306   auto &Atomics = getActionDefinitionsBuilder(
1307     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310      G_ATOMICRMW_UMIN})
1311     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312                {S64, GlobalPtr}, {S64, LocalPtr},
1313                {S32, RegionPtr}, {S64, RegionPtr}});
1314   if (ST.hasFlatAddressSpace()) {
1315     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316   }
1317 
1318   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319   if (ST.hasLDSFPAtomics()) {
1320     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321     if (ST.hasGFX90AInsts())
1322       Atomic.legalFor({{S64, LocalPtr}});
1323   }
1324   if (ST.hasAtomicFaddInsts())
1325     Atomic.legalFor({{S32, GlobalPtr}});
1326 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1329   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331                 {S32, FlatPtr}, {S64, FlatPtr}})
1332     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333                {S32, RegionPtr}, {S64, RegionPtr}});
1334   // TODO: Pointer types, any 32-bit or 64-bit vector
1335 
1336   // Condition should be s32 for scalar, s1 for vector.
1337   getActionDefinitionsBuilder(G_SELECT)
1338       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339                                  LocalPtr, FlatPtr, PrivatePtr,
1340                                  LLT::fixed_vector(2, LocalPtr),
1341                                  LLT::fixed_vector(2, PrivatePtr)},
1342                                 {S1, S32})
1343       .clampScalar(0, S16, S64)
1344       .scalarize(1)
1345       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347       .clampMaxNumElements(0, S32, 2)
1348       .clampMaxNumElements(0, LocalPtr, 2)
1349       .clampMaxNumElements(0, PrivatePtr, 2)
1350       .scalarize(0)
1351       .widenScalarToNextPow2(0)
1352       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353 
1354   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355   // be more flexible with the shift amount type.
1356   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357     .legalFor({{S32, S32}, {S64, S32}});
1358   if (ST.has16BitInsts()) {
1359     if (ST.hasVOP3PInsts()) {
1360       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361             .clampMaxNumElements(0, S16, 2);
1362     } else
1363       Shifts.legalFor({{S16, S16}});
1364 
1365     // TODO: Support 16-bit shift amounts for all types
1366     Shifts.widenScalarIf(
1367       [=](const LegalityQuery &Query) {
1368         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369         // 32-bit amount.
1370         const LLT ValTy = Query.Types[0];
1371         const LLT AmountTy = Query.Types[1];
1372         return ValTy.getSizeInBits() <= 16 &&
1373                AmountTy.getSizeInBits() < 16;
1374       }, changeTo(1, S16));
1375     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S16, S64);
1378     Shifts.widenScalarToNextPow2(0, 16);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S16)
1382       .scalarize(0)
1383       .lower();
1384   } else {
1385     // Make sure we legalize the shift amount type first, as the general
1386     // expansion for the shifted type will produce much worse code if it hasn't
1387     // been truncated already.
1388     Shifts.clampScalar(1, S32, S32);
1389     Shifts.clampScalar(0, S32, S64);
1390     Shifts.widenScalarToNextPow2(0, 32);
1391 
1392     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393       .minScalar(0, S32)
1394       .scalarize(0)
1395       .lower();
1396   }
1397   Shifts.scalarize(0);
1398 
1399   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402     unsigned IdxTypeIdx = 2;
1403 
1404     getActionDefinitionsBuilder(Op)
1405       .customIf([=](const LegalityQuery &Query) {
1406           const LLT EltTy = Query.Types[EltTypeIdx];
1407           const LLT VecTy = Query.Types[VecTypeIdx];
1408           const LLT IdxTy = Query.Types[IdxTypeIdx];
1409           const unsigned EltSize = EltTy.getSizeInBits();
1410           return (EltSize == 32 || EltSize == 64) &&
1411                   VecTy.getSizeInBits() % 32 == 0 &&
1412                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1413                   IdxTy.getSizeInBits() == 32;
1414         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1417       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418       .bitcastIf(
1419         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420         [=](const LegalityQuery &Query) {
1421           // For > 64-bit element types, try to turn this into a 64-bit
1422           // element vector since we may be able to do better indexing
1423           // if this is scalar. If not, fall back to 32.
1424           const LLT EltTy = Query.Types[EltTypeIdx];
1425           const LLT VecTy = Query.Types[VecTypeIdx];
1426           const unsigned DstEltSize = EltTy.getSizeInBits();
1427           const unsigned VecSize = VecTy.getSizeInBits();
1428 
1429           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430           return std::make_pair(
1431               VecTypeIdx,
1432               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433         })
1434       .clampScalar(EltTypeIdx, S32, S64)
1435       .clampScalar(VecTypeIdx, S32, S64)
1436       .clampScalar(IdxTypeIdx, S32, S32)
1437       .clampMaxNumElements(VecTypeIdx, S32, 32)
1438       // TODO: Clamp elements for 64-bit vectors?
1439       // It should only be necessary with variable indexes.
1440       // As a last resort, lower to the stack
1441       .lower();
1442   }
1443 
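  // The extracted value must have exactly the vector's element type.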
1444   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445     .unsupportedIf([=](const LegalityQuery &Query) {
1446         const LLT &EltTy = Query.Types[1].getElementType();
1447         return Query.Types[0] != EltTy;
1448       });
1449 
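  // As with the vector element operations above, G_EXTRACT and G_INSERT swap
  // which operand index holds the wide type and which holds the small type.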
1450   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453 
1454     // FIXME: Doesn't handle extract of illegal sizes.
1455     getActionDefinitionsBuilder(Op)
1456       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457       // FIXME: Multiples of 16 should not be legal.
1458       .legalIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           const LLT LitTy = Query.Types[LitTyIdx];
1461           return (BigTy.getSizeInBits() % 32 == 0) &&
1462                  (LitTy.getSizeInBits() % 16 == 0);
1463         })
1464       .widenScalarIf(
1465         [=](const LegalityQuery &Query) {
1466           const LLT BigTy = Query.Types[BigTyIdx];
1467           return (BigTy.getScalarSizeInBits() < 16);
1468         },
1469         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470       .widenScalarIf(
1471         [=](const LegalityQuery &Query) {
1472           const LLT LitTy = Query.Types[LitTyIdx];
1473           return (LitTy.getScalarSizeInBits() < 16);
1474         },
1475         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477       .widenScalarToNextPow2(BigTyIdx, 32);
1478 
1479   }
1480 
1481   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482     .legalForCartesianProduct(AllS32Vectors, {S32})
1483     .legalForCartesianProduct(AllS64Vectors, {S64})
1484     .clampNumElements(0, V16S32, V32S32)
1485     .clampNumElements(0, V2S64, V16S64)
1486     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487 
1488   if (ST.hasScalarPackInsts()) {
1489     BuildVector
1490       // FIXME: Should probably widen s1 vectors straight to s32
1491       .minScalarOrElt(0, S16)
1492       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493       .minScalar(1, S32);
1494 
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .legalFor({V2S16, S32})
1497       .lower();
1498     BuildVector.minScalarOrElt(0, S32);
1499   } else {
1500     BuildVector.customFor({V2S16, S16});
1501     BuildVector.minScalarOrElt(0, S32);
1502 
1503     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504       .customFor({V2S16, S32})
1505       .lower();
1506   }
1507 
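  // Whatever remains is legal as long as it forms a valid register type.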
1508   BuildVector.legalIf(isRegisterType(0));
1509 
1510   // FIXME: Clamp maximum size
1511   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513     .clampMaxNumElements(0, S32, 32)
1514     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515     .clampMaxNumElements(0, S16, 64);
1516 
1517   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1518   // pre-legalize.
1519   if (ST.hasVOP3PInsts()) {
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521       .customFor({V2S16, V2S16})
1522       .lower();
1523   } else
1524     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525 
1526   // Merge/Unmerge
1527   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530 
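    // Elements outside [8, 512] bits or with a non-power-of-2 size can't be
    // handled directly; such vectors are scalarized below.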
1531     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532       const LLT Ty = Query.Types[TypeIdx];
1533       if (Ty.isVector()) {
1534         const LLT &EltTy = Ty.getElementType();
1535         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536           return true;
1537         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538           return true;
1539       }
1540       return false;
1541     };
1542 
1543     auto &Builder = getActionDefinitionsBuilder(Op)
1544       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545       .lowerFor({{S16, V2S16}})
1546       .lowerIf([=](const LegalityQuery &Query) {
1547           const LLT BigTy = Query.Types[BigTyIdx];
1548           return BigTy.getSizeInBits() == 32;
1549         })
1550       // Try to widen to s16 first for small types.
1551       // TODO: Only do this on targets with legal s16 shifts
1552       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556                            elementTypeIs(1, S16)),
1557                        changeTo(1, V2S16))
1558       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1559       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1560       // valid.
1561       .clampScalar(LitTyIdx, S32, S512)
1562       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563       // Break up vectors with weird elements into scalars
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566         scalarize(0))
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569         scalarize(1))
1570       .clampScalar(BigTyIdx, S32, MaxScalar);
1571 
1572     if (Op == G_MERGE_VALUES) {
1573       Builder.widenScalarIf(
1574         // TODO: Use 16-bit shifts if legal for 8-bit values?
1575         [=](const LegalityQuery &Query) {
1576           const LLT Ty = Query.Types[LitTyIdx];
1577           return Ty.getSizeInBits() < 32;
1578         },
1579         changeTo(LitTyIdx, S32));
1580     }
1581 
1582     Builder.widenScalarIf(
1583       [=](const LegalityQuery &Query) {
1584         const LLT Ty = Query.Types[BigTyIdx];
1585         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586           Ty.getSizeInBits() % 16 != 0;
1587       },
1588       [=](const LegalityQuery &Query) {
1589         // Pick the next power of 2, or a multiple of 64 over 128, whichever
1590         // is smaller.
1591         const LLT &Ty = Query.Types[BigTyIdx];
1592         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593         if (NewSizeInBits >= 256) {
1594           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595           if (RoundedTo < NewSizeInBits)
1596             NewSizeInBits = RoundedTo;
1597         }
1598         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599       })
1600       // Any vectors left are the wrong size. Scalarize them.
1601       .scalarize(0)
1602       .scalarize(1);
1603   }
1604 
1605   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606   // RegBankSelect.
1607   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608     .legalFor({{S32}, {S64}});
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     SextInReg.lowerFor({{V2S16}})
1612       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613       // get more vector shift opportunities, since we'll get those when
1614       // expanded.
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616   } else if (ST.has16BitInsts()) {
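    // With 16-bit instructions the target also has 16-bit shifts, so s16 can
    // be lowered directly in addition to s32 and s64.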
1617     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618   } else {
1619     // Prefer to promote to s32 before lowering if we don't have 16-bit
1620     // shifts. This avoids a lot of intermediate truncate and extend operations.
1621     SextInReg.lowerFor({{S32}, {S64}});
1622   }
1623 
1624   SextInReg
1625     .scalarize(0)
1626     .clampScalar(0, S32, S64)
1627     .lower();
1628 
1629   // TODO: Only try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634     .scalarize(0)
1635     .lower();
1636 
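  // G_FSHL is always expanded; with packed instructions, split wide 16-bit
  // vectors down to v2s16 first so the expansion can stay packed.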
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648 
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651 
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654 
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659 
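  // Bitfield extracts operate on 32-bit or 64-bit values and always take a
  // 32-bit offset and width operand.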
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666 
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670 
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676 
1677       G_SADDO, G_SSUBO,
1678 
1679        // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681 
1682   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685     .unsupported();
1686 
1687   getLegacyLegalizerInfo().computeTables();
1688   verify(*ST.getInstrInfo());
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692                                          MachineInstr &MI) const {
1693   MachineIRBuilder &B = Helper.MIRBuilder;
1694   MachineRegisterInfo &MRI = *B.getMRI();
1695 
1696   switch (MI.getOpcode()) {
1697   case TargetOpcode::G_ADDRSPACE_CAST:
1698     return legalizeAddrSpaceCast(MI, MRI, B);
1699   case TargetOpcode::G_FRINT:
1700     return legalizeFrint(MI, MRI, B);
1701   case TargetOpcode::G_FCEIL:
1702     return legalizeFceil(MI, MRI, B);
1703   case TargetOpcode::G_FREM:
1704     return legalizeFrem(MI, MRI, B);
1705   case TargetOpcode::G_INTRINSIC_TRUNC:
1706     return legalizeIntrinsicTrunc(MI, MRI, B);
1707   case TargetOpcode::G_SITOFP:
1708     return legalizeITOFP(MI, MRI, B, true);
1709   case TargetOpcode::G_UITOFP:
1710     return legalizeITOFP(MI, MRI, B, false);
1711   case TargetOpcode::G_FPTOSI:
1712     return legalizeFPTOI(MI, MRI, B, true);
1713   case TargetOpcode::G_FPTOUI:
1714     return legalizeFPTOI(MI, MRI, B, false);
1715   case TargetOpcode::G_FMINNUM:
1716   case TargetOpcode::G_FMAXNUM:
1717   case TargetOpcode::G_FMINNUM_IEEE:
1718   case TargetOpcode::G_FMAXNUM_IEEE:
1719     return legalizeMinNumMaxNum(Helper, MI);
1720   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721     return legalizeExtractVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_INSERT_VECTOR_ELT:
1723     return legalizeInsertVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_SHUFFLE_VECTOR:
1725     return legalizeShuffleVector(MI, MRI, B);
1726   case TargetOpcode::G_FSIN:
1727   case TargetOpcode::G_FCOS:
1728     return legalizeSinCos(MI, MRI, B);
1729   case TargetOpcode::G_GLOBAL_VALUE:
1730     return legalizeGlobalValue(MI, MRI, B);
1731   case TargetOpcode::G_LOAD:
1732   case TargetOpcode::G_SEXTLOAD:
1733   case TargetOpcode::G_ZEXTLOAD:
1734     return legalizeLoad(Helper, MI);
1735   case TargetOpcode::G_FMAD:
1736     return legalizeFMad(MI, MRI, B);
1737   case TargetOpcode::G_FDIV:
1738     return legalizeFDIV(MI, MRI, B);
1739   case TargetOpcode::G_UDIV:
1740   case TargetOpcode::G_UREM:
1741   case TargetOpcode::G_UDIVREM:
1742     return legalizeUnsignedDIV_REM(MI, MRI, B);
1743   case TargetOpcode::G_SDIV:
1744   case TargetOpcode::G_SREM:
1745   case TargetOpcode::G_SDIVREM:
1746     return legalizeSignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_ATOMIC_CMPXCHG:
1748     return legalizeAtomicCmpXChg(MI, MRI, B);
1749   case TargetOpcode::G_FLOG:
1750     return legalizeFlog(MI, B, numbers::ln2f);
1751   case TargetOpcode::G_FLOG10:
1752     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753   case TargetOpcode::G_FEXP:
1754     return legalizeFExp(MI, B);
1755   case TargetOpcode::G_FPOW:
1756     return legalizeFPow(MI, B);
1757   case TargetOpcode::G_FFLOOR:
1758     return legalizeFFloor(MI, MRI, B);
1759   case TargetOpcode::G_BUILD_VECTOR:
1760     return legalizeBuildVector(MI, MRI, B);
1761   case TargetOpcode::G_CTLZ:
1762   case TargetOpcode::G_CTTZ:
1763     return legalizeCTLZ_CTTZ(MI, MRI, B);
1764   default:
1765     return false;
1766   }
1767 
1768   llvm_unreachable("expected switch to return");
1769 }
1770 
1771 Register AMDGPULegalizerInfo::getSegmentAperture(
1772   unsigned AS,
1773   MachineRegisterInfo &MRI,
1774   MachineIRBuilder &B) const {
1775   MachineFunction &MF = B.getMF();
1776   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1777   const LLT S32 = LLT::scalar(32);
1778 
1779   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1780 
1781   if (ST.hasApertureRegs()) {
1782     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1783     // getreg.
1784     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1786         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1787     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1789         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1790     unsigned Encoding =
1791         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1792         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1793         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1794 
1795     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1796 
1797     B.buildInstr(AMDGPU::S_GETREG_B32)
1798       .addDef(GetReg)
1799       .addImm(Encoding);
1800     MRI.setType(GetReg, S32);
1801 
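    // The register field holds the aperture base shifted down; shift it back
    // up to reconstruct the 32-bit aperture value.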
1802     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1803     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1804   }
1805 
1806   Register QueuePtr = MRI.createGenericVirtualRegister(
1807     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1808 
1809   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1810     return Register();
1811 
1812   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1813   // private_segment_aperture_base_hi.
1814   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1815 
1816   // TODO: can we be smarter about machine pointer info?
1817   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1818   MachineMemOperand *MMO = MF.getMachineMemOperand(
1819       PtrInfo,
1820       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1821           MachineMemOperand::MOInvariant,
1822       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1823 
1824   Register LoadAddr;
1825 
1826   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1827   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   MachineFunction &MF = B.getMF();
1834 
1835   const LLT S32 = LLT::scalar(32);
1836   Register Dst = MI.getOperand(0).getReg();
1837   Register Src = MI.getOperand(1).getReg();
1838 
1839   LLT DstTy = MRI.getType(Dst);
1840   LLT SrcTy = MRI.getType(Src);
1841   unsigned DestAS = DstTy.getAddressSpace();
1842   unsigned SrcAS = SrcTy.getAddressSpace();
1843 
1844   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1845   // vector element.
1846   assert(!DstTy.isVector());
1847 
1848   const AMDGPUTargetMachine &TM
1849     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1850 
1851   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1852     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1853     return true;
1854   }
1855 
1856   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1857     // Truncate.
1858     B.buildExtract(Dst, Src, 0);
1859     MI.eraseFromParent();
1860     return true;
1861   }
1862 
1863   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1865     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1866 
1867     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1868     // another address space. Merge operands are required to be the same type,
1869     // but creating an extra ptrtoint would be kind of pointless.
1870     auto HighAddr = B.buildConstant(
1871       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1872     B.buildMerge(Dst, {Src, HighAddr});
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1878     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1879            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1880     unsigned NullVal = TM.getNullPointerValue(DestAS);
1881 
1882     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1883     auto FlatNull = B.buildConstant(SrcTy, 0);
1884 
1885     // Extract low 32-bits of the pointer.
1886     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1887 
1888     auto CmpRes =
1889         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1890     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1891 
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895 
1896   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1897     return false;
1898 
1899   if (!ST.hasFlatAddressSpace())
1900     return false;
1901 
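  // Casting local/private to flat: compare against the segment null value
  // and, for non-null pointers, combine the 32-bit offset with the aperture.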
1902   auto SegmentNull =
1903       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1904   auto FlatNull =
1905       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1906 
1907   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908   if (!ApertureReg.isValid())
1909     return false;
1910 
1911   auto CmpRes =
1912       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1913 
1914   // Coerce the type of the low half of the result so we can use merge_values.
1915   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1916 
1917   // TODO: Should we allow mismatched types but matching sizes in merges to
1918   // avoid the ptrtoint?
1919   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1920   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1921 
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeFrint(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register Src = MI.getOperand(1).getReg();
1930   LLT Ty = MRI.getType(Src);
1931   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1932 
1933   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1935 
1936   auto C1 = B.buildFConstant(Ty, C1Val);
1937   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1938 
1939   // TODO: Should this propagate fast-math-flags?
1940   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1942 
1943   auto C2 = B.buildFConstant(Ty, C2Val);
1944   auto Fabs = B.buildFAbs(Ty, Src);
1945 
1946   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948   MI.eraseFromParent();
1949   return true;
1950 }
1951 
1952 bool AMDGPULegalizerInfo::legalizeFceil(
1953   MachineInstr &MI, MachineRegisterInfo &MRI,
1954   MachineIRBuilder &B) const {
1955 
1956   const LLT S1 = LLT::scalar(1);
1957   const LLT S64 = LLT::scalar(64);
1958 
1959   Register Src = MI.getOperand(1).getReg();
1960   assert(MRI.getType(Src) == S64);
1961 
1962   // result = trunc(src)
1963   // if (src > 0.0 && src != result)
1964   //   result += 1.0
1965 
1966   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1967 
1968   const auto Zero = B.buildFConstant(S64, 0.0);
1969   const auto One = B.buildFConstant(S64, 1.0);
1970   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1973   auto Add = B.buildSelect(S64, And, One, Zero);
1974 
1975   // TODO: Should this propagate fast-math-flags?
1976   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeFrem(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   MachineIRBuilder &B) const {
1983     Register DstReg = MI.getOperand(0).getReg();
1984     Register Src0Reg = MI.getOperand(1).getReg();
1985     Register Src1Reg = MI.getOperand(2).getReg();
1986     auto Flags = MI.getFlags();
1987     LLT Ty = MRI.getType(DstReg);
1988 
1989     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993     MI.eraseFromParent();
1994     return true;
1995 }
1996 
1997 static MachineInstrBuilder extractF64Exponent(Register Hi,
1998                                               MachineIRBuilder &B) {
1999   const unsigned FractBits = 52;
2000   const unsigned ExpBits = 11;
2001   LLT S32 = LLT::scalar(32);
2002 
2003   auto Const0 = B.buildConstant(S32, FractBits - 32);
2004   auto Const1 = B.buildConstant(S32, ExpBits);
2005 
2006   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2007     .addUse(Hi)
2008     .addUse(Const0.getReg(0))
2009     .addUse(Const1.getReg(0));
2010 
2011   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   const LLT S1 = LLT::scalar(1);
2018   const LLT S32 = LLT::scalar(32);
2019   const LLT S64 = LLT::scalar(64);
2020 
2021   Register Src = MI.getOperand(1).getReg();
2022   assert(MRI.getType(Src) == S64);
2023 
2024   // TODO: Should this use extract since the low half is unused?
2025   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2026   Register Hi = Unmerge.getReg(1);
2027 
2028   // Extract the upper half, since this is where we will find the sign and
2029   // exponent.
2030   auto Exp = extractF64Exponent(Hi, B);
2031 
2032   const unsigned FractBits = 52;
2033 
2034   // Extract the sign bit.
2035   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2036   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2037 
2038   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2039 
2040   const auto Zero32 = B.buildConstant(S32, 0);
2041 
2042   // Extend back to 64-bits.
2043   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2044 
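  // Shift the fraction mask down by the exponent to find the bits below the
  // binary point, then clear them from the source.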
2045   auto Shr = B.buildAShr(S64, FractMask, Exp);
2046   auto Not = B.buildNot(S64, Shr);
2047   auto Tmp0 = B.buildAnd(S64, Src, Not);
2048   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2049 
2050   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2051   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2052 
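  // An exponent below 0 means |x| < 1, so the result is a signed zero. An
  // exponent above 51 leaves no fractional bits, so the input is returned.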
2053   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2054   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2055   MI.eraseFromParent();
2056   return true;
2057 }
2058 
2059 bool AMDGPULegalizerInfo::legalizeITOFP(
2060   MachineInstr &MI, MachineRegisterInfo &MRI,
2061   MachineIRBuilder &B, bool Signed) const {
2062 
2063   Register Dst = MI.getOperand(0).getReg();
2064   Register Src = MI.getOperand(1).getReg();
2065 
2066   const LLT S64 = LLT::scalar(64);
2067   const LLT S32 = LLT::scalar(32);
2068 
2069   assert(MRI.getType(Src) == S64);
2070 
2071   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2072   auto ThirtyTwo = B.buildConstant(S32, 32);
2073 
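  // For an s64 result, convert the two halves separately and combine them as
  // fp(hi) * 2^32 + fp(lo).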
2074   if (MRI.getType(Dst) == S64) {
2075     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2076                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2077 
2078     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2079     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080                      .addUse(CvtHi.getReg(0))
2081                      .addUse(ThirtyTwo.getReg(0));
2082 
2083     // TODO: Should this propagate fast-math-flags?
2084     B.buildFAdd(Dst, LdExp, CvtLo);
2085     MI.eraseFromParent();
2086     return true;
2087   }
2088 
2089   assert(MRI.getType(Dst) == S32);
2090 
2091   auto One = B.buildConstant(S32, 1);
2092 
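  // For an s32 result, shift the value so the significant bits land in the
  // high word, OR in a sticky bit from the discarded low half for rounding,
  // convert that word, and rescale with ldexp by the amount shifted out.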
2093   MachineInstrBuilder ShAmt;
2094   if (Signed) {
2095     auto ThirtyOne = B.buildConstant(S32, 31);
2096     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2097     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2098     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2099     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2100                                /*HasSideEffects=*/false)
2101                   .addUse(Unmerge.getReg(1));
2102     auto LS2 = B.buildSub(S32, LS, One);
2103     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2104   } else
2105     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2106   auto Norm = B.buildShl(S64, Src, ShAmt);
2107   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2108   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2109   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2110   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2111   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2112   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2113                    /*HasSideEffects=*/false)
2114       .addUse(FVal.getReg(0))
2115       .addUse(Scale.getReg(0));
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119 
2120 // TODO: Copied from DAG implementation. Verify logic and document how this
2121 // actually works.
2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2123                                         MachineRegisterInfo &MRI,
2124                                         MachineIRBuilder &B,
2125                                         bool Signed) const {
2126 
2127   Register Dst = MI.getOperand(0).getReg();
2128   Register Src = MI.getOperand(1).getReg();
2129 
2130   const LLT S64 = LLT::scalar(64);
2131   const LLT S32 = LLT::scalar(32);
2132 
2133   const LLT SrcLT = MRI.getType(Src);
2134   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2135 
2136   unsigned Flags = MI.getFlags();
2137 
2138   // The basic idea of converting a floating point number into a pair of 32-bit
2139   // integers is illustrated as follows:
2140   //
2141   //     tf := trunc(val);
2142   //    hif := floor(tf * 2^-32);
2143   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2144   //     hi := fptoi(hif);
2145   //     lo := fptoi(lof);
2146   //
2147   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2148   MachineInstrBuilder Sign;
2149   if (Signed && SrcLT == S32) {
2150     // However, a 32-bit floating point number has only a 23-bit mantissa and
2151     // it's not enough to hold all the significant bits of `lof` if val is
2152     // negative. To avoid the loss of precision, we need to take the absolute
2153     // value after truncating and flip the result back based on the original
2154     // signedness.
2155     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2156     Trunc = B.buildFAbs(S32, Trunc, Flags);
2157   }
2158   MachineInstrBuilder K0, K1;
2159   if (SrcLT == S64) {
2160     K0 = B.buildFConstant(S64,
2161                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2162     K1 = B.buildFConstant(S64,
2163                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2164   } else {
2165     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2166     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2167   }
2168 
2169   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2170   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2171   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2172 
2173   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2174                                      : B.buildFPTOUI(S32, FloorMul);
2175   auto Lo = B.buildFPTOUI(S32, Fma);
2176 
2177   if (Signed && SrcLT == S32) {
2178     // Flip the result based on the signedness, which is either all 0s or 1s.
2179     Sign = B.buildMerge(S64, {Sign, Sign});
2180     // r := xor({lo, hi}, sign) - sign;
2181     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2182   } else
2183     B.buildMerge(Dst, {Lo, Hi});
2184   MI.eraseFromParent();
2185 
2186   return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2190                                                MachineInstr &MI) const {
2191   MachineFunction &MF = Helper.MIRBuilder.getMF();
2192   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2193 
2194   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2195                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2196 
2197   // With ieee_mode disabled, the instructions already have the correct
2198   // behavior for G_FMINNUM/G_FMAXNUM.
2199   if (!MFI->getMode().IEEE)
2200     return !IsIEEEOp;
2201 
2202   if (IsIEEEOp)
2203     return true;
2204 
2205   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2206 }
2207 
2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2209   MachineInstr &MI, MachineRegisterInfo &MRI,
2210   MachineIRBuilder &B) const {
2211   // TODO: Should move some of this into LegalizerHelper.
2212 
2213   // TODO: Promote dynamic indexing of s16 to s32
2214 
2215   // FIXME: Artifact combiner probably should have replaced the truncated
2216   // constant before this, so we shouldn't need
2217   // getConstantVRegValWithLookThrough.
2218   Optional<ValueAndVReg> MaybeIdxVal =
2219       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2220   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221     return true;
2222   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2223 
2224   Register Dst = MI.getOperand(0).getReg();
2225   Register Vec = MI.getOperand(1).getReg();
2226 
2227   LLT VecTy = MRI.getType(Vec);
2228   LLT EltTy = VecTy.getElementType();
2229   assert(EltTy == MRI.getType(Dst));
2230 
2231   if (IdxVal < VecTy.getNumElements())
2232     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2233   else
2234     B.buildUndef(Dst);
2235 
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2241   MachineInstr &MI, MachineRegisterInfo &MRI,
2242   MachineIRBuilder &B) const {
2243   // TODO: Should move some of this into LegalizerHelper.
2244 
2245   // TODO: Promote dynamic indexing of s16 to s32
2246 
2247   // FIXME: Artifact combiner probably should have replaced the truncated
2248   // constant before this, so we shouldn't need
2249   // getConstantVRegValWithLookThrough.
2250   Optional<ValueAndVReg> MaybeIdxVal =
2251       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2252   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2253     return true;
2254 
2255   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256   Register Dst = MI.getOperand(0).getReg();
2257   Register Vec = MI.getOperand(1).getReg();
2258   Register Ins = MI.getOperand(2).getReg();
2259 
2260   LLT VecTy = MRI.getType(Vec);
2261   LLT EltTy = VecTy.getElementType();
2262   assert(EltTy == MRI.getType(Ins));
2263 
2264   if (IdxVal < VecTy.getNumElements())
2265     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2266   else
2267     B.buildUndef(Dst);
2268 
2269   MI.eraseFromParent();
2270   return true;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2274   MachineInstr &MI, MachineRegisterInfo &MRI,
2275   MachineIRBuilder &B) const {
2276   const LLT V2S16 = LLT::fixed_vector(2, 16);
2277 
2278   Register Dst = MI.getOperand(0).getReg();
2279   Register Src0 = MI.getOperand(1).getReg();
2280   LLT DstTy = MRI.getType(Dst);
2281   LLT SrcTy = MRI.getType(Src0);
2282 
2283   if (SrcTy == V2S16 && DstTy == V2S16 &&
2284       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2285     return true;
2286 
2287   MachineIRBuilder HelperBuilder(MI);
2288   GISelObserverWrapper DummyObserver;
2289   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2290   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeSinCos(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296 
2297   Register DstReg = MI.getOperand(0).getReg();
2298   Register SrcReg = MI.getOperand(1).getReg();
2299   LLT Ty = MRI.getType(DstReg);
2300   unsigned Flags = MI.getFlags();
2301 
2302   Register TrigVal;
2303   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2304   if (ST.hasTrigReducedRange()) {
2305     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2306     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2307       .addUse(MulVal.getReg(0))
2308       .setMIFlags(Flags).getReg(0);
2309   } else
2310     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2311 
2312   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2313     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2314   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2315     .addUse(TrigVal)
2316     .setMIFlags(Flags);
2317   MI.eraseFromParent();
2318   return true;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2322                                                   MachineIRBuilder &B,
2323                                                   const GlobalValue *GV,
2324                                                   int64_t Offset,
2325                                                   unsigned GAFlags) const {
2326   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2327   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2328   // to the following code sequence:
2329   //
2330   // For constant address space:
2331   //   s_getpc_b64 s[0:1]
2332   //   s_add_u32 s0, s0, $symbol
2333   //   s_addc_u32 s1, s1, 0
2334   //
2335   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2336   //   a fixup or relocation is emitted to replace $symbol with a literal
2337   //   constant, which is a pc-relative offset from the encoding of the $symbol
2338   //   operand to the global variable.
2339   //
2340   // For global address space:
2341   //   s_getpc_b64 s[0:1]
2342   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2343   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2344   //
2345   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346   //   fixups or relocations are emitted to replace $symbol@*@lo and
2347   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2348   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2349   //   operand to the global variable.
2350   //
2351   // What we want here is an offset from the value returned by s_getpc
2352   // (which is the address of the s_add_u32 instruction) to the global
2353   // variable, but since the encoding of $symbol starts 4 bytes after the start
2354   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2355   // small. This requires us to add 4 to the global variable offset in order to
2356   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2357   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2358   // instruction.
2359 
2360   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2361 
2362   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2363     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2364 
2365   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2366     .addDef(PCReg);
2367 
2368   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2369   if (GAFlags == SIInstrInfo::MO_NONE)
2370     MIB.addImm(0);
2371   else
2372     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2373 
2374   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2375 
2376   if (PtrTy.getSizeInBits() == 32)
2377     B.buildExtract(DstReg, PCReg, 0);
2378   return true;
2379 }
2380 
2381 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2382   MachineInstr &MI, MachineRegisterInfo &MRI,
2383   MachineIRBuilder &B) const {
2384   Register DstReg = MI.getOperand(0).getReg();
2385   LLT Ty = MRI.getType(DstReg);
2386   unsigned AS = Ty.getAddressSpace();
2387 
2388   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2389   MachineFunction &MF = B.getMF();
2390   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2391 
2392   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2393     if (!MFI->isModuleEntryFunction() &&
2394         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2395       const Function &Fn = MF.getFunction();
2396       DiagnosticInfoUnsupported BadLDSDecl(
2397         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2398         DS_Warning);
2399       Fn.getContext().diagnose(BadLDSDecl);
2400 
2401       // We currently don't have a way to correctly allocate LDS objects that
2402       // aren't directly associated with a kernel. We do force inlining of
2403       // functions that use local objects. However, if these dead functions are
2404       // not eliminated, we don't want a compile time error. Just emit a warning
2405       // and a trap, since there should be no callable path here.
2406       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2407       B.buildUndef(DstReg);
2408       MI.eraseFromParent();
2409       return true;
2410     }
2411 
2412     // TODO: We could emit code to handle the initialization somewhere.
2413     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2414       const SITargetLowering *TLI = ST.getTargetLowering();
2415       if (!TLI->shouldUseLDSConstAddress(GV)) {
2416         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2417         return true; // Leave in place.
2418       }
2419 
2420       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2421         Type *Ty = GV->getValueType();
2422         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2423         // zero-sized type in other languages to declare dynamic shared memory
2424         // whose size is not known at compile time. Such arrays are allocated
2425         // by the runtime and placed directly after the statically allocated
2426         // ones. They all share the same offset.
2427         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2428           // Adjust alignment for that dynamic shared memory array.
2429           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2430           LLT S32 = LLT::scalar(32);
2431           auto Sz =
2432               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2433           B.buildIntToPtr(DstReg, Sz);
2434           MI.eraseFromParent();
2435           return true;
2436         }
2437       }
2438 
2439       B.buildConstant(
2440           DstReg,
2441           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2442       MI.eraseFromParent();
2443       return true;
2444     }
2445 
2446     const Function &Fn = MF.getFunction();
2447     DiagnosticInfoUnsupported BadInit(
2448       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2449     Fn.getContext().diagnose(BadInit);
2450     return true;
2451   }
2452 
2453   const SITargetLowering *TLI = ST.getTargetLowering();
2454 
2455   if (TLI->shouldEmitFixup(GV)) {
2456     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2457     MI.eraseFromParent();
2458     return true;
2459   }
2460 
2461   if (TLI->shouldEmitPCReloc(GV)) {
2462     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2463     MI.eraseFromParent();
2464     return true;
2465   }
2466 
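  // Otherwise, emit a pc-relative address of the GOT entry and load the
  // final address from it.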
2467   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2468   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2469 
2470   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2471   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2472       MachinePointerInfo::getGOT(MF),
2473       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2474           MachineMemOperand::MOInvariant,
2475       LoadTy, Align(8));
2476 
2477   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2478 
2479   if (Ty.getSizeInBits() == 32) {
2480     // Truncate if this is a 32-bit constant address.
2481     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2482     B.buildExtract(DstReg, Load, 0);
2483   } else
2484     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2485 
2486   MI.eraseFromParent();
2487   return true;
2488 }
2489 
2490 static LLT widenToNextPowerOf2(LLT Ty) {
2491   if (Ty.isVector())
2492     return Ty.changeElementCount(
2493         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2494   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2495 }
2496 
2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2498                                        MachineInstr &MI) const {
2499   MachineIRBuilder &B = Helper.MIRBuilder;
2500   MachineRegisterInfo &MRI = *B.getMRI();
2501   GISelChangeObserver &Observer = Helper.Observer;
2502 
2503   Register PtrReg = MI.getOperand(1).getReg();
2504   LLT PtrTy = MRI.getType(PtrReg);
2505   unsigned AddrSpace = PtrTy.getAddressSpace();
2506 
2507   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2508     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2509     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2510     Observer.changingInstr(MI);
2511     MI.getOperand(1).setReg(Cast.getReg(0));
2512     Observer.changedInstr(MI);
2513     return true;
2514   }
2515 
2516   if (MI.getOpcode() != AMDGPU::G_LOAD)
2517     return false;
2518 
2519   Register ValReg = MI.getOperand(0).getReg();
2520   LLT ValTy = MRI.getType(ValReg);
2521 
2522   MachineMemOperand *MMO = *MI.memoperands_begin();
2523   const unsigned ValSize = ValTy.getSizeInBits();
2524   const LLT MemTy = MMO->getMemoryType();
2525   const Align MemAlign = MMO->getAlign();
2526   const unsigned MemSize = MemTy.getSizeInBits();
2527   const unsigned AlignInBits = 8 * MemAlign.value();
2528 
2529   // Widen non-power-of-2 loads to the alignment if needed
2530   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2531     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2532 
2533     // This was already the correct extending load result type, so just adjust
2534     // the memory type.
2535     if (WideMemSize == ValSize) {
2536       MachineFunction &MF = B.getMF();
2537 
2538       MachineMemOperand *WideMMO =
2539           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2540       Observer.changingInstr(MI);
2541       MI.setMemRefs(MF, {WideMMO});
2542       Observer.changedInstr(MI);
2543       return true;
2544     }
2545 
2546     // Don't bother with an edge case that should probably never be produced.
2547     if (ValSize > WideMemSize)
2548       return false;
2549 
2550     LLT WideTy = widenToNextPowerOf2(ValTy);
2551 
2552     Register WideLoad;
2553     if (!WideTy.isVector()) {
2554       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2555       B.buildTrunc(ValReg, WideLoad);
2556     } else {
2557       // Extract the subvector.
2558 
2559       if (isRegisterType(ValTy)) {
2560         // If this is a case where G_EXTRACT is legal, use it.
2561         // (e.g. <3 x s32> -> <4 x s32>)
2562         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2563         B.buildExtract(ValReg, WideLoad, 0);
2564       } else {
2565         // For cases where the widened type isn't a nice register value, unmerge
2566         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2567         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2568         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2569         B.setInsertPt(B.getMBB(), MI.getIterator());
2570         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2571       }
2572     }
2573 
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577 
2578   return false;
2579 }
2580 
2581 bool AMDGPULegalizerInfo::legalizeFMad(
2582   MachineInstr &MI, MachineRegisterInfo &MRI,
2583   MachineIRBuilder &B) const {
2584   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2585   assert(Ty.isScalar());
2586 
2587   MachineFunction &MF = B.getMF();
2588   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2589 
2590   // TODO: Always legal with future ftz flag.
2591   // FIXME: Do we need just output?
2592   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2593     return true;
2594   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2595     return true;
2596 
2597   MachineIRBuilder HelperBuilder(MI);
2598   GISelObserverWrapper DummyObserver;
2599   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2600   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2601 }
2602 
2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2604   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2605   Register DstReg = MI.getOperand(0).getReg();
2606   Register PtrReg = MI.getOperand(1).getReg();
2607   Register CmpVal = MI.getOperand(2).getReg();
2608   Register NewVal = MI.getOperand(3).getReg();
2609 
2610   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2611          "this should not have been custom lowered");
2612 
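  // The target instruction takes the new value and the compare value packed
  // together as a two-element vector operand.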
2613   LLT ValTy = MRI.getType(CmpVal);
2614   LLT VecTy = LLT::fixed_vector(2, ValTy);
2615 
2616   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2617 
2618   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2619     .addDef(DstReg)
2620     .addUse(PtrReg)
2621     .addUse(PackedVal)
2622     .setMemRefs(MI.memoperands());
2623 
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 bool AMDGPULegalizerInfo::legalizeFlog(
2629   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2630   Register Dst = MI.getOperand(0).getReg();
2631   Register Src = MI.getOperand(1).getReg();
2632   LLT Ty = B.getMRI()->getType(Dst);
2633   unsigned Flags = MI.getFlags();
2634 
2635   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2636   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2637 
2638   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2639   MI.eraseFromParent();
2640   return true;
2641 }
2642 
2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2644                                        MachineIRBuilder &B) const {
2645   Register Dst = MI.getOperand(0).getReg();
2646   Register Src = MI.getOperand(1).getReg();
2647   unsigned Flags = MI.getFlags();
2648   LLT Ty = B.getMRI()->getType(Dst);
2649 
2650   auto K = B.buildFConstant(Ty, numbers::log2e);
2651   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2652   B.buildFExp2(Dst, Mul, Flags);
2653   MI.eraseFromParent();
2654   return true;
2655 }
2656 
2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2658                                        MachineIRBuilder &B) const {
2659   Register Dst = MI.getOperand(0).getReg();
2660   Register Src0 = MI.getOperand(1).getReg();
2661   Register Src1 = MI.getOperand(2).getReg();
2662   unsigned Flags = MI.getFlags();
2663   LLT Ty = B.getMRI()->getType(Dst);
2664   const LLT S16 = LLT::scalar(16);
2665   const LLT S32 = LLT::scalar(32);
2666 
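  // Expand pow(x, y) as exp2(log2(x) * y), using the legacy multiply.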
2667   if (Ty == S32) {
2668     auto Log = B.buildFLog2(S32, Src0, Flags);
2669     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2670       .addUse(Log.getReg(0))
2671       .addUse(Src1)
2672       .setMIFlags(Flags);
2673     B.buildFExp2(Dst, Mul, Flags);
2674   } else if (Ty == S16) {
2675     // There's no f16 fmul_legacy, so we need to convert for it.
2676     auto Log = B.buildFLog2(S16, Src0, Flags);
2677     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2678     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2679     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680       .addUse(Ext0.getReg(0))
2681       .addUse(Ext1.getReg(0))
2682       .setMIFlags(Flags);
2683 
2684     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2685   } else
2686     return false;
2687 
2688   MI.eraseFromParent();
2689   return true;
2690 }
2691 
2692 // Find a source register, ignoring any possible source modifiers.
2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2694   Register ModSrc = OrigSrc;
2695   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2696     ModSrc = SrcFNeg->getOperand(1).getReg();
2697     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2698       ModSrc = SrcFAbs->getOperand(1).getReg();
2699   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2700     ModSrc = SrcFAbs->getOperand(1).getReg();
2701   return ModSrc;
2702 }
2703 
2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2705                                          MachineRegisterInfo &MRI,
2706                                          MachineIRBuilder &B) const {
2707 
2708   const LLT S1 = LLT::scalar(1);
2709   const LLT S64 = LLT::scalar(64);
2710   Register Dst = MI.getOperand(0).getReg();
2711   Register OrigSrc = MI.getOperand(1).getReg();
2712   unsigned Flags = MI.getFlags();
2713   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2714          "this should not have been custom lowered");
2715 
2716   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2717   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2718   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2719   // V_FRACT bug is:
2720   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2721   //
2722   // Convert floor(x) to (x - fract(x))
2723 
2724   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2725     .addUse(OrigSrc)
2726     .setMIFlags(Flags);
2727 
2728   // Give source modifier matching some assistance before obscuring a foldable
2729   // pattern.
2730 
2731   // TODO: We can avoid the neg on the fract? The input sign to fract
2732   // shouldn't matter?
2733   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2734 
2735   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2736 
2737   Register Min = MRI.createGenericVirtualRegister(S64);
2738 
2739   // We don't need to concern ourselves with the snan handling difference, so
2740   // use the one which will directly select.
2741   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2742   if (MFI->getMode().IEEE)
2743     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2744   else
2745     B.buildFMinNum(Min, Fract, Const, Flags);
2746 
2747   Register CorrectedFract = Min;
2748   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2749     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2750     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2751   }
2752 
2753   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2754   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2755 
2756   MI.eraseFromParent();
2757   return true;
2758 }
2759 
2760 // Turn an illegal packed v2s16 build vector into bit operations.
2761 // TODO: This should probably be a bitcast action in LegalizerHelper.
2762 bool AMDGPULegalizerInfo::legalizeBuildVector(
2763   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2764   Register Dst = MI.getOperand(0).getReg();
2765   const LLT S32 = LLT::scalar(32);
2766   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2767 
2768   Register Src0 = MI.getOperand(1).getReg();
2769   Register Src1 = MI.getOperand(2).getReg();
2770   assert(MRI.getType(Src0) == LLT::scalar(16));
2771 
2772   auto Merge = B.buildMerge(S32, {Src0, Src1});
2773   B.buildBitcast(Dst, Merge);
2774 
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778 
2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2781 // case with a single min instruction instead of a compare+select.
2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2783                                             MachineRegisterInfo &MRI,
2784                                             MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   Register Src = MI.getOperand(1).getReg();
2787   LLT DstTy = MRI.getType(Dst);
2788   LLT SrcTy = MRI.getType(Src);
2789 
2790   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2791                         ? AMDGPU::G_AMDGPU_FFBH_U32
2792                         : AMDGPU::G_AMDGPU_FFBL_B32;
2793   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2794   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2795 
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799 
2800 // Check that this is a G_XOR x, -1
2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2802   if (MI.getOpcode() != TargetOpcode::G_XOR)
2803     return false;
2804   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2805   return ConstVal && *ConstVal == -1;
2806 }
2807 
2808 // Return the use branch instruction, otherwise null if the usage is invalid.
2809 static MachineInstr *
2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2811                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2812   Register CondDef = MI.getOperand(0).getReg();
2813   if (!MRI.hasOneNonDBGUse(CondDef))
2814     return nullptr;
2815 
2816   MachineBasicBlock *Parent = MI.getParent();
2817   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2818 
2819   if (isNot(MRI, *UseMI)) {
2820     Register NegatedCond = UseMI->getOperand(0).getReg();
2821     if (!MRI.hasOneNonDBGUse(NegatedCond))
2822       return nullptr;
2823 
2824     // We're deleting the def of this value, so we need to remove it.
2825     UseMI->eraseFromParent();
2826 
2827     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2828     Negated = true;
2829   }
2830 
2831   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2832     return nullptr;
2833 
2834   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2835   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2836   if (Next == Parent->end()) {
2837     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2838     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2839       return nullptr;
2840     UncondBrTarget = &*NextMBB;
2841   } else {
2842     if (Next->getOpcode() != AMDGPU::G_BR)
2843       return nullptr;
2844     Br = &*Next;
2845     UncondBrTarget = Br->getOperand(0).getMBB();
2846   }
2847 
2848   return UseMI;
2849 }
2850 
2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2852                                          const ArgDescriptor *Arg,
2853                                          const TargetRegisterClass *ArgRC,
2854                                          LLT ArgTy) const {
2855   MCRegister SrcReg = Arg->getRegister();
2856   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2857   assert(DstReg.isVirtual() && "Virtual register expected");
2858 
2859   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2860                                              ArgTy);
2861   if (Arg->isMasked()) {
2862     // TODO: Should we try to emit this once in the entry block?
2863     const LLT S32 = LLT::scalar(32);
2864     const unsigned Mask = Arg->getMask();
2865     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2866 
2867     Register AndMaskSrc = LiveIn;
2868 
2869     if (Shift != 0) {
2870       auto ShiftAmt = B.buildConstant(S32, Shift);
2871       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2872     }
2873 
2874     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2875   } else {
2876     B.buildCopy(DstReg, LiveIn);
2877   }
2878 
2879   return true;
2880 }
2881 
2882 bool AMDGPULegalizerInfo::loadInputValue(
2883     Register DstReg, MachineIRBuilder &B,
2884     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2885   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2886   const ArgDescriptor *Arg;
2887   const TargetRegisterClass *ArgRC;
2888   LLT ArgTy;
2889   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2890 
2891   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2892     return false; // TODO: Handle these
2893   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2894 }
2895 
2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2897     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2898     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2899   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2900     return false;
2901 
2902   MI.eraseFromParent();
2903   return true;
2904 }
2905 
2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2907                                        MachineRegisterInfo &MRI,
2908                                        MachineIRBuilder &B) const {
2909   Register Dst = MI.getOperand(0).getReg();
2910   LLT DstTy = MRI.getType(Dst);
2911   LLT S16 = LLT::scalar(16);
2912   LLT S32 = LLT::scalar(32);
2913   LLT S64 = LLT::scalar(64);
2914 
2915   if (DstTy == S16)
2916     return legalizeFDIV16(MI, MRI, B);
2917   if (DstTy == S32)
2918     return legalizeFDIV32(MI, MRI, B);
2919   if (DstTy == S64)
2920     return legalizeFDIV64(MI, MRI, B);
2921 
2922   return false;
2923 }
2924 
2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2926                                                         Register DstDivReg,
2927                                                         Register DstRemReg,
2928                                                         Register X,
2929                                                         Register Y) const {
2930   const LLT S1 = LLT::scalar(1);
2931   const LLT S32 = LLT::scalar(32);
2932 
2933   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2934   // algorithm used here.
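  // In brief: approximate 1/Y with the hardware f32 reciprocal, scale it by ~2**32
  // to get an integer fixed-point estimate Z, apply one Newton-Raphson refinement
  // step to Z, form the quotient estimate Q = umulh(X, Z) and remainder
  // R = X - Q * Y, then correct Q and R with at most two conditional adjustments.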
2935 
2936   // Initial estimate of inv(y).
2937   auto FloatY = B.buildUITOFP(S32, Y);
2938   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2939   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2940   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2941   auto Z = B.buildFPTOUI(S32, ScaledY);
2942 
2943   // One round of UNR (unsigned Newton-Raphson) refinement.
2944   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2945   auto NegYZ = B.buildMul(S32, NegY, Z);
2946   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2947 
2948   // Quotient/remainder estimate.
2949   auto Q = B.buildUMulH(S32, X, Z);
2950   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2951 
2952   // First quotient/remainder refinement.
2953   auto One = B.buildConstant(S32, 1);
2954   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2955   if (DstDivReg)
2956     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2957   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2958 
2959   // Second quotient/remainder refinement.
2960   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2961   if (DstDivReg)
2962     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2963 
2964   if (DstRemReg)
2965     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2966 }
2967 
2968 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2969 //
2970 // Return lo, hi of result
2971 //
2972 // %cvt.lo = G_UITOFP Val.lo
2973 // %cvt.hi = G_UITOFP Val.hi
2974 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2975 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2976 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2977 // %mul2 = G_FMUL %mul1, 2**(-32)
2978 // %trunc = G_INTRINSIC_TRUNC %mul2
2979 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2980 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2981 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2982                                                        Register Val) {
2983   const LLT S32 = LLT::scalar(32);
2984   auto Unmerge = B.buildUnmerge(S32, Val);
2985 
2986   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2987   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2988 
2989   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2990                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2991 
2992   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2993   auto Mul1 =
2994       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2995 
2996   // 2**(-32)
2997   auto Mul2 =
2998       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2999   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3000 
3001   // -(2**32)
3002   auto Mad2 = B.buildFMAD(S32, Trunc,
3003                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3004 
3005   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3006   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3007 
3008   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3009 }
3010 
3011 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3012                                                         Register DstDivReg,
3013                                                         Register DstRemReg,
3014                                                         Register Numer,
3015                                                         Register Denom) const {
3016   const LLT S32 = LLT::scalar(32);
3017   const LLT S64 = LLT::scalar(64);
3018   const LLT S1 = LLT::scalar(1);
3019   Register RcpLo, RcpHi;
3020 
3021   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3022 
3023   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3024 
3025   auto Zero64 = B.buildConstant(S64, 0);
3026   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3027 
3028   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3029   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3030 
3031   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3032   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3033   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3034 
3035   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3036   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3037   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3038   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3039 
3040   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3041   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3042   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3043   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3044   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3045 
3046   auto Zero32 = B.buildConstant(S32, 0);
3047   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3048   auto Add2_HiC =
3049       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3050   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3051   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3052 
3053   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3054   Register NumerLo = UnmergeNumer.getReg(0);
3055   Register NumerHi = UnmergeNumer.getReg(1);
3056 
3057   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3058   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3059   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3060   Register Mul3_Lo = UnmergeMul3.getReg(0);
3061   Register Mul3_Hi = UnmergeMul3.getReg(1);
3062   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3063   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3064   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3065   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3066 
3067   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3068   Register DenomLo = UnmergeDenom.getReg(0);
3069   Register DenomHi = UnmergeDenom.getReg(1);
3070 
3071   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3072   auto C1 = B.buildSExt(S32, CmpHi);
3073 
3074   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3075   auto C2 = B.buildSExt(S32, CmpLo);
3076 
3077   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3078   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3079 
3080   // TODO: Here and below, portions of the code could be enclosed in if/endif
3081   // blocks. Currently the control flow is unconditional, and the 4 selects after
3082   // the would-be endif substitute for PHIs.
3083 
3084   // if C3 != 0 ...
3085   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3086   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3087   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3088   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3089 
3090   auto One64 = B.buildConstant(S64, 1);
3091   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3092 
3093   auto C4 =
3094       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3095   auto C5 =
3096       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3097   auto C6 = B.buildSelect(
3098       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3099 
3100   // if (C6 != 0)
3101   auto Add4 = B.buildAdd(S64, Add3, One64);
3102   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3103 
3104   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3105   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3106   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3107 
3108   // endif C6
3109   // endif C3
3110 
3111   if (DstDivReg) {
3112     auto Sel1 = B.buildSelect(
3113         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3114     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3115                   Sel1, MulHi3);
3116   }
3117 
3118   if (DstRemReg) {
3119     auto Sel2 = B.buildSelect(
3120         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3121     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3122                   Sel2, Sub1);
3123   }
3124 }
3125 
3126 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3127                                                   MachineRegisterInfo &MRI,
3128                                                   MachineIRBuilder &B) const {
3129   Register DstDivReg, DstRemReg;
3130   switch (MI.getOpcode()) {
3131   default:
3132     llvm_unreachable("Unexpected opcode!");
3133   case AMDGPU::G_UDIV: {
3134     DstDivReg = MI.getOperand(0).getReg();
3135     break;
3136   }
3137   case AMDGPU::G_UREM: {
3138     DstRemReg = MI.getOperand(0).getReg();
3139     break;
3140   }
3141   case AMDGPU::G_UDIVREM: {
3142     DstDivReg = MI.getOperand(0).getReg();
3143     DstRemReg = MI.getOperand(1).getReg();
3144     break;
3145   }
3146   }
3147 
3148   const LLT S64 = LLT::scalar(64);
3149   const LLT S32 = LLT::scalar(32);
3150   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3151   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3152   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3153   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3154 
3155   if (Ty == S32)
3156     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3157   else if (Ty == S64)
3158     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3159   else
3160     return false;
3161 
3162   MI.eraseFromParent();
3163   return true;
3164 }
3165 
3166 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3167                                                 MachineRegisterInfo &MRI,
3168                                                 MachineIRBuilder &B) const {
3169   const LLT S64 = LLT::scalar(64);
3170   const LLT S32 = LLT::scalar(32);
3171 
3172   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3173   if (Ty != S32 && Ty != S64)
3174     return false;
3175 
3176   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3177   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3178   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3179 
3180   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3181   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3182   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3183 
3184   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3185   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3186 
3187   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3188   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
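  // LHS and RHS now hold the absolute values: with s = x >> (bits - 1) (all ones for
  // negative x, zero otherwise), (x + s) ^ s negates negative values and leaves
  // non-negative values unchanged.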
3189 
3190   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3191   switch (MI.getOpcode()) {
3192   default:
3193     llvm_unreachable("Unexpected opcode!");
3194   case AMDGPU::G_SDIV: {
3195     DstDivReg = MI.getOperand(0).getReg();
3196     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3197     break;
3198   }
3199   case AMDGPU::G_SREM: {
3200     DstRemReg = MI.getOperand(0).getReg();
3201     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3202     break;
3203   }
3204   case AMDGPU::G_SDIVREM: {
3205     DstDivReg = MI.getOperand(0).getReg();
3206     DstRemReg = MI.getOperand(1).getReg();
3207     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3208     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3209     break;
3210   }
3211   }
3212 
3213   if (Ty == S32)
3214     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3215   else
3216     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3217 
3218   if (DstDivReg) {
3219     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3220     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3221     B.buildSub(DstDivReg, SignXor, Sign);
3222   }
3223 
3224   if (DstRemReg) {
3225     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3226     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3227     B.buildSub(DstRemReg, SignXor, Sign);
3228   }
3229 
3230   MI.eraseFromParent();
3231   return true;
3232 }
3233 
3234 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3235                                                  MachineRegisterInfo &MRI,
3236                                                  MachineIRBuilder &B) const {
3237   Register Res = MI.getOperand(0).getReg();
3238   Register LHS = MI.getOperand(1).getReg();
3239   Register RHS = MI.getOperand(2).getReg();
3240   uint16_t Flags = MI.getFlags();
3241   LLT ResTy = MRI.getType(Res);
3242 
3243   const MachineFunction &MF = B.getMF();
3244   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3245                             MI.getFlag(MachineInstr::FmAfn);
3246 
3247   if (!AllowInaccurateRcp)
3248     return false;
3249 
3250   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3251     // 1 / x -> RCP(x)
3252     if (CLHS->isExactlyValue(1.0)) {
3253       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3254         .addUse(RHS)
3255         .setMIFlags(Flags);
3256 
3257       MI.eraseFromParent();
3258       return true;
3259     }
3260 
3261     // -1 / x -> RCP( FNEG(x) )
3262     if (CLHS->isExactlyValue(-1.0)) {
3263       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3264       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3265         .addUse(FNeg.getReg(0))
3266         .setMIFlags(Flags);
3267 
3268       MI.eraseFromParent();
3269       return true;
3270     }
3271   }
3272 
3273   // x / y -> x * (1.0 / y)
3274   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3275     .addUse(RHS)
3276     .setMIFlags(Flags);
3277   B.buildFMul(Res, LHS, RCP, Flags);
3278 
3279   MI.eraseFromParent();
3280   return true;
3281 }
3282 
3283 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3284                                                    MachineRegisterInfo &MRI,
3285                                                    MachineIRBuilder &B) const {
3286   Register Res = MI.getOperand(0).getReg();
3287   Register X = MI.getOperand(1).getReg();
3288   Register Y = MI.getOperand(2).getReg();
3289   uint16_t Flags = MI.getFlags();
3290   LLT ResTy = MRI.getType(Res);
3291 
3292   const MachineFunction &MF = B.getMF();
3293   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3294                             MI.getFlag(MachineInstr::FmAfn);
3295 
3296   if (!AllowInaccurateRcp)
3297     return false;
3298 
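  // Refine rcp(Y) with two Newton-Raphson iterations (r = r + r * (1 - Y * r)), then
  // compute X * r and apply one final FMA-based residual correction.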
3299   auto NegY = B.buildFNeg(ResTy, Y);
3300   auto One = B.buildFConstant(ResTy, 1.0);
3301 
3302   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3303     .addUse(Y)
3304     .setMIFlags(Flags);
3305 
3306   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3307   R = B.buildFMA(ResTy, Tmp0, R, R);
3308 
3309   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3310   R = B.buildFMA(ResTy, Tmp1, R, R);
3311 
3312   auto Ret = B.buildFMul(ResTy, X, R);
3313   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3314 
3315   B.buildFMA(Res, Tmp2, R, Ret);
3316   MI.eraseFromParent();
3317   return true;
3318 }
3319 
3320 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3321                                          MachineRegisterInfo &MRI,
3322                                          MachineIRBuilder &B) const {
3323   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3324     return true;
3325 
3326   Register Res = MI.getOperand(0).getReg();
3327   Register LHS = MI.getOperand(1).getReg();
3328   Register RHS = MI.getOperand(2).getReg();
3329 
3330   uint16_t Flags = MI.getFlags();
3331 
3332   LLT S16 = LLT::scalar(16);
3333   LLT S32 = LLT::scalar(32);
3334 
3335   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3336   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3337 
3338   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3339     .addUse(RHSExt.getReg(0))
3340     .setMIFlags(Flags);
3341 
3342   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3343   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3344 
3345   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3346     .addUse(RDst.getReg(0))
3347     .addUse(RHS)
3348     .addUse(LHS)
3349     .setMIFlags(Flags);
3350 
3351   MI.eraseFromParent();
3352   return true;
3353 }
3354 
3355 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3356 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3357 static void toggleSPDenormMode(bool Enable,
3358                                MachineIRBuilder &B,
3359                                const GCNSubtarget &ST,
3360                                AMDGPU::SIModeRegisterDefaults Mode) {
3361   // Set SP denorm mode to this value.
3362   unsigned SPDenormMode =
3363     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3364 
3365   if (ST.hasDenormModeInst()) {
3366     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3367     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3368 
3369     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3370     B.buildInstr(AMDGPU::S_DENORM_MODE)
3371       .addImm(NewDenormModeValue);
3372 
3373   } else {
3374     // Select FP32 bit field in mode register.
3375     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3376                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3377                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
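    // This selects hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 (encoded as
    // WIDTH_M1 = 1), i.e. the two FP32 denorm-mode bits of the MODE register.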
3378 
3379     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3380       .addImm(SPDenormMode)
3381       .addImm(SPDenormModeBitField);
3382   }
3383 }
3384 
3385 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3386                                          MachineRegisterInfo &MRI,
3387                                          MachineIRBuilder &B) const {
3388   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3389     return true;
3390 
3391   Register Res = MI.getOperand(0).getReg();
3392   Register LHS = MI.getOperand(1).getReg();
3393   Register RHS = MI.getOperand(2).getReg();
3394   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3395   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3396 
3397   uint16_t Flags = MI.getFlags();
3398 
3399   LLT S32 = LLT::scalar(32);
3400   LLT S1 = LLT::scalar(1);
3401 
3402   auto One = B.buildFConstant(S32, 1.0f);
3403 
3404   auto DenominatorScaled =
3405     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3406       .addUse(LHS)
3407       .addUse(RHS)
3408       .addImm(0)
3409       .setMIFlags(Flags);
3410   auto NumeratorScaled =
3411     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3412       .addUse(LHS)
3413       .addUse(RHS)
3414       .addImm(1)
3415       .setMIFlags(Flags);
3416 
3417   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3418     .addUse(DenominatorScaled.getReg(0))
3419     .setMIFlags(Flags);
3420   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3421 
3422   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3423   // aren't modeled as reading it.
3424   if (!Mode.allFP32Denormals())
3425     toggleSPDenormMode(true, B, ST, Mode);
3426 
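  // Newton-Raphson refinement of the approximate reciprocal, followed by quotient
  // and error-correction FMAs; div_fmas re-applies the scaling chosen by div_scale
  // (based on its condition output) and div_fixup handles infinities, NaNs and
  // other special cases.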
3427   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3428   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3429   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3430   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3431   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3432   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3433 
3434   if (!Mode.allFP32Denormals())
3435     toggleSPDenormMode(false, B, ST, Mode);
3436 
3437   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3438     .addUse(Fma4.getReg(0))
3439     .addUse(Fma1.getReg(0))
3440     .addUse(Fma3.getReg(0))
3441     .addUse(NumeratorScaled.getReg(1))
3442     .setMIFlags(Flags);
3443 
3444   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3445     .addUse(Fmas.getReg(0))
3446     .addUse(RHS)
3447     .addUse(LHS)
3448     .setMIFlags(Flags);
3449 
3450   MI.eraseFromParent();
3451   return true;
3452 }
3453 
3454 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3455                                          MachineRegisterInfo &MRI,
3456                                          MachineIRBuilder &B) const {
3457   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3458     return true;
3459 
3460   Register Res = MI.getOperand(0).getReg();
3461   Register LHS = MI.getOperand(1).getReg();
3462   Register RHS = MI.getOperand(2).getReg();
3463 
3464   uint16_t Flags = MI.getFlags();
3465 
3466   LLT S64 = LLT::scalar(64);
3467   LLT S1 = LLT::scalar(1);
3468 
3469   auto One = B.buildFConstant(S64, 1.0);
3470 
3471   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3472     .addUse(LHS)
3473     .addUse(RHS)
3474     .addImm(0)
3475     .setMIFlags(Flags);
3476 
3477   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3478 
3479   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3480     .addUse(DivScale0.getReg(0))
3481     .setMIFlags(Flags);
3482 
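  // Same structure as the f32 path: refine the reciprocal with FMAs, form a scaled
  // quotient estimate and residual, then let div_fmas/div_fixup produce the final
  // result.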
3483   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3484   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3485   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3486 
3487   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3488     .addUse(LHS)
3489     .addUse(RHS)
3490     .addImm(1)
3491     .setMIFlags(Flags);
3492 
3493   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3494   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3495   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3496 
3497   Register Scale;
3498   if (!ST.hasUsableDivScaleConditionOutput()) {
3499     // Work around a hardware bug on SI where the condition output from div_scale
3500     // is not usable.
3501 
3502     LLT S32 = LLT::scalar(32);
3503 
3504     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3505     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3506     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3507     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3508 
3509     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3510                               Scale1Unmerge.getReg(1));
3511     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3512                               Scale0Unmerge.getReg(1));
3513     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3514   } else {
3515     Scale = DivScale1.getReg(1);
3516   }
3517 
3518   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3519     .addUse(Fma4.getReg(0))
3520     .addUse(Fma3.getReg(0))
3521     .addUse(Mul.getReg(0))
3522     .addUse(Scale)
3523     .setMIFlags(Flags);
3524 
3525   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3526     .addUse(Fmas.getReg(0))
3527     .addUse(RHS)
3528     .addUse(LHS)
3529     .setMIFlags(Flags);
3530 
3531   MI.eraseFromParent();
3532   return true;
3533 }
3534 
3535 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3536                                                  MachineRegisterInfo &MRI,
3537                                                  MachineIRBuilder &B) const {
3538   Register Res = MI.getOperand(0).getReg();
3539   Register LHS = MI.getOperand(2).getReg();
3540   Register RHS = MI.getOperand(3).getReg();
3541   uint16_t Flags = MI.getFlags();
3542 
3543   LLT S32 = LLT::scalar(32);
3544   LLT S1 = LLT::scalar(1);
3545 
3546   auto Abs = B.buildFAbs(S32, RHS, Flags);
3547   const APFloat C0Val(1.0f);
3548 
3549   auto C0 = B.buildConstant(S32, 0x6f800000);
3550   auto C1 = B.buildConstant(S32, 0x2f800000);
3551   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3552 
3553   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3554   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3555 
3556   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3557 
3558   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3559     .addUse(Mul0.getReg(0))
3560     .setMIFlags(Flags);
3561 
3562   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3563 
3564   B.buildFMul(Res, Sel, Mul1, Flags);
3565 
3566   MI.eraseFromParent();
3567   return true;
3568 }
3569 
3570 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3571 // FIXME: Why do we handle this one but not other removed instructions?
3572 //
3573 // Reciprocal square root.  The clamp prevents infinite results, clamping
3574 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3575 // +-max_float.
3576 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3577                                                     MachineRegisterInfo &MRI,
3578                                                     MachineIRBuilder &B) const {
3579   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3580     return true;
3581 
3582   Register Dst = MI.getOperand(0).getReg();
3583   Register Src = MI.getOperand(2).getReg();
3584   auto Flags = MI.getFlags();
3585 
3586   LLT Ty = MRI.getType(Dst);
3587 
3588   const fltSemantics *FltSemantics;
3589   if (Ty == LLT::scalar(32))
3590     FltSemantics = &APFloat::IEEEsingle();
3591   else if (Ty == LLT::scalar(64))
3592     FltSemantics = &APFloat::IEEEdouble();
3593   else
3594     return false;
3595 
3596   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3597     .addUse(Src)
3598     .setMIFlags(Flags);
3599 
3600   // We don't need to concern ourselves with the snan handling difference: the rsq
3601   // already quieted (or not) any snan, so use the variant that will directly select.
3602   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3603   const bool UseIEEE = MFI->getMode().IEEE;
3604 
3605   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3606   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3607                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3608 
3609   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3610 
3611   if (UseIEEE)
3612     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3613   else
3614     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3615   MI.eraseFromParent();
3616   return true;
3617 }
3618 
3619 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3620   switch (IID) {
3621   case Intrinsic::amdgcn_ds_fadd:
3622     return AMDGPU::G_ATOMICRMW_FADD;
3623   case Intrinsic::amdgcn_ds_fmin:
3624     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3625   case Intrinsic::amdgcn_ds_fmax:
3626     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3627   default:
3628     llvm_unreachable("not a DS FP intrinsic");
3629   }
3630 }
3631 
3632 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3633                                                       MachineInstr &MI,
3634                                                       Intrinsic::ID IID) const {
3635   GISelChangeObserver &Observer = Helper.Observer;
3636   Observer.changingInstr(MI);
3637 
3638   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3639 
3640   // The remaining operands were used to set fields in the MemOperand on
3641   // construction.
3642   for (int I = 6; I > 3; --I)
3643     MI.RemoveOperand(I);
3644 
3645   MI.RemoveOperand(1); // Remove the intrinsic ID.
3646   Observer.changedInstr(MI);
3647   return true;
3648 }
3649 
3650 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3651                                             MachineRegisterInfo &MRI,
3652                                             MachineIRBuilder &B) const {
3653   uint64_t Offset =
3654     ST.getTargetLowering()->getImplicitParameterOffset(
3655       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3656   LLT DstTy = MRI.getType(DstReg);
3657   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3658 
3659   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3660   if (!loadInputValue(KernargPtrReg, B,
3661                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3662     return false;
3663 
3664   // FIXME: This should be nuw
3665   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3666   return true;
3667 }
3668 
3669 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3670                                                  MachineRegisterInfo &MRI,
3671                                                  MachineIRBuilder &B) const {
3672   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3673   if (!MFI->isEntryFunction()) {
3674     return legalizePreloadedArgIntrin(MI, MRI, B,
3675                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3676   }
3677 
3678   Register DstReg = MI.getOperand(0).getReg();
3679   if (!getImplicitArgPtr(DstReg, MRI, B))
3680     return false;
3681 
3682   MI.eraseFromParent();
3683   return true;
3684 }
3685 
3686 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3687                                               MachineRegisterInfo &MRI,
3688                                               MachineIRBuilder &B,
3689                                               unsigned AddrSpace) const {
3690   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3691   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3692   Register Hi32 = Unmerge.getReg(1);
3693 
3694   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3695   MI.eraseFromParent();
3696   return true;
3697 }
3698 
3699 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3700 // offset (the offset that is included in bounds checking and swizzling, to be
3701 // split between the instruction's voffset and immoffset fields) and soffset
3702 // (the offset that is excluded from bounds checking and swizzling, to go in
3703 // the instruction's soffset field).  This function takes the first kind of
3704 // offset and figures out how to split it between voffset and immoffset.
3705 std::pair<Register, unsigned>
3706 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3707                                         Register OrigOffset) const {
3708   const unsigned MaxImm = 4095;
3709   Register BaseReg;
3710   unsigned ImmOffset;
3711   const LLT S32 = LLT::scalar(32);
3712   MachineRegisterInfo &MRI = *B.getMRI();
3713 
3714   std::tie(BaseReg, ImmOffset) =
3715       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3716 
3717   // If BaseReg is a pointer, convert it to int.
3718   if (MRI.getType(BaseReg).isPointer())
3719     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3720 
3721   // If the immediate value is too big for the immoffset field, put the value
3722   // and -4096 into the immoffset field so that the value that is copied/added
3723   // for the voffset field is a multiple of 4096, and it stands more chance
3724   // of being CSEd with the copy/add for another similar load/store.
3725   // However, do not do that rounding down to a multiple of 4096 if that is a
3726   // negative number, as it appears to be illegal to have a negative offset
3727   // in the vgpr, even if adding the immediate offset makes it positive.
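  // For example, a constant offset of 8212 is split into Overflow = 8192 (folded
  // into the returned base register) and ImmOffset = 20.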
3728   unsigned Overflow = ImmOffset & ~MaxImm;
3729   ImmOffset -= Overflow;
3730   if ((int32_t)Overflow < 0) {
3731     Overflow += ImmOffset;
3732     ImmOffset = 0;
3733   }
3734 
3735   if (Overflow != 0) {
3736     if (!BaseReg) {
3737       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3738     } else {
3739       auto OverflowVal = B.buildConstant(S32, Overflow);
3740       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3741     }
3742   }
3743 
3744   if (!BaseReg)
3745     BaseReg = B.buildConstant(S32, 0).getReg(0);
3746 
3747   return std::make_pair(BaseReg, ImmOffset);
3748 }
3749 
3750 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3751 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3752                                           Register VOffset, Register SOffset,
3753                                           unsigned ImmOffset, Register VIndex,
3754                                           MachineRegisterInfo &MRI) const {
3755   Optional<ValueAndVReg> MaybeVOffsetVal =
3756       getConstantVRegValWithLookThrough(VOffset, MRI);
3757   Optional<ValueAndVReg> MaybeSOffsetVal =
3758       getConstantVRegValWithLookThrough(SOffset, MRI);
3759   Optional<ValueAndVReg> MaybeVIndexVal =
3760       getConstantVRegValWithLookThrough(VIndex, MRI);
3761   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3762   // update the MMO with that offset. The stride is unknown so we can only do
3763   // this if VIndex is constant 0.
3764   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3765       MaybeVIndexVal->Value == 0) {
3766     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3767                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3768     MMO->setOffset(TotalOffset);
3769   } else {
3770     // We don't have a constant combined offset to use in the MMO. Give up.
3771     MMO->setValue((Value *)nullptr);
3772   }
3773 }
3774 
3775 /// Handle register layout difference for f16 images for some subtargets.
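/// On subtargets with unpacked D16 memory instructions each 16-bit element occupies
/// the low half of a 32-bit register, so the data is any-extended element-wise; on
/// subtargets with the image store D16 bug the packed data is instead padded out
/// with undef to the register count the hardware expects.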
3776 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3777                                              MachineRegisterInfo &MRI,
3778                                              Register Reg,
3779                                              bool ImageStore) const {
3780   const LLT S16 = LLT::scalar(16);
3781   const LLT S32 = LLT::scalar(32);
3782   LLT StoreVT = MRI.getType(Reg);
3783   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3784 
3785   if (ST.hasUnpackedD16VMem()) {
3786     auto Unmerge = B.buildUnmerge(S16, Reg);
3787 
3788     SmallVector<Register, 4> WideRegs;
3789     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3790       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3791 
3792     int NumElts = StoreVT.getNumElements();
3793 
3794     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3795         .getReg(0);
3796   }
3797 
3798   if (ImageStore && ST.hasImageStoreD16Bug()) {
3799     if (StoreVT.getNumElements() == 2) {
3800       SmallVector<Register, 4> PackedRegs;
3801       Reg = B.buildBitcast(S32, Reg).getReg(0);
3802       PackedRegs.push_back(Reg);
3803       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3804       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3805           .getReg(0);
3806     }
3807 
3808     if (StoreVT.getNumElements() == 3) {
3809       SmallVector<Register, 4> PackedRegs;
3810       auto Unmerge = B.buildUnmerge(S16, Reg);
3811       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3812         PackedRegs.push_back(Unmerge.getReg(I));
3813       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3814       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3815       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3816     }
3817 
3818     if (StoreVT.getNumElements() == 4) {
3819       SmallVector<Register, 4> PackedRegs;
3820       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3821       auto Unmerge = B.buildUnmerge(S32, Reg);
3822       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3823         PackedRegs.push_back(Unmerge.getReg(I));
3824       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3825       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3826           .getReg(0);
3827     }
3828 
3829     llvm_unreachable("invalid data type");
3830   }
3831 
3832   return Reg;
3833 }
3834 
3835 Register AMDGPULegalizerInfo::fixStoreSourceType(
3836   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3837   MachineRegisterInfo *MRI = B.getMRI();
3838   LLT Ty = MRI->getType(VData);
3839 
3840   const LLT S16 = LLT::scalar(16);
3841 
3842   // Fix up illegal register types (i8 and i16) for stores by any-extending to i32.
3843   if (Ty == LLT::scalar(8) || Ty == S16) {
3844     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3845     return AnyExt;
3846   }
3847 
3848   if (Ty.isVector()) {
3849     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3850       if (IsFormat)
3851         return handleD16VData(B, *MRI, VData);
3852     }
3853   }
3854 
3855   return VData;
3856 }
3857 
3858 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3859                                               MachineRegisterInfo &MRI,
3860                                               MachineIRBuilder &B,
3861                                               bool IsTyped,
3862                                               bool IsFormat) const {
3863   Register VData = MI.getOperand(1).getReg();
3864   LLT Ty = MRI.getType(VData);
3865   LLT EltTy = Ty.getScalarType();
3866   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3867   const LLT S32 = LLT::scalar(32);
3868 
3869   VData = fixStoreSourceType(B, VData, IsFormat);
3870   Register RSrc = MI.getOperand(2).getReg();
3871 
3872   MachineMemOperand *MMO = *MI.memoperands_begin();
3873   const int MemSize = MMO->getSize();
3874 
3875   unsigned ImmOffset;
3876 
3877   // The typed intrinsics add an immediate after the registers.
3878   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3879 
3880   // The struct intrinsic variants add one additional operand over raw.
3881   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3882   Register VIndex;
3883   int OpOffset = 0;
3884   if (HasVIndex) {
3885     VIndex = MI.getOperand(3).getReg();
3886     OpOffset = 1;
3887   } else {
3888     VIndex = B.buildConstant(S32, 0).getReg(0);
3889   }
3890 
3891   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3892   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3893 
3894   unsigned Format = 0;
3895   if (IsTyped) {
3896     Format = MI.getOperand(5 + OpOffset).getImm();
3897     ++OpOffset;
3898   }
3899 
3900   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3901 
3902   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3903   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3904 
3905   unsigned Opc;
3906   if (IsTyped) {
3907     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3908                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3909   } else if (IsFormat) {
3910     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3911                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3912   } else {
3913     switch (MemSize) {
3914     case 1:
3915       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3916       break;
3917     case 2:
3918       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3919       break;
3920     default:
3921       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3922       break;
3923     }
3924   }
3925 
3926   auto MIB = B.buildInstr(Opc)
3927     .addUse(VData)              // vdata
3928     .addUse(RSrc)               // rsrc
3929     .addUse(VIndex)             // vindex
3930     .addUse(VOffset)            // voffset
3931     .addUse(SOffset)            // soffset
3932     .addImm(ImmOffset);         // offset(imm)
3933 
3934   if (IsTyped)
3935     MIB.addImm(Format);
3936 
3937   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3938      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3939      .addMemOperand(MMO);
3940 
3941   MI.eraseFromParent();
3942   return true;
3943 }
3944 
3945 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3946                                              MachineRegisterInfo &MRI,
3947                                              MachineIRBuilder &B,
3948                                              bool IsFormat,
3949                                              bool IsTyped) const {
3950   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3951   MachineMemOperand *MMO = *MI.memoperands_begin();
3952   const LLT MemTy = MMO->getMemoryType();
3953   const LLT S32 = LLT::scalar(32);
3954 
3955   Register Dst = MI.getOperand(0).getReg();
3956   Register RSrc = MI.getOperand(2).getReg();
3957 
3958   // The typed intrinsics add an immediate after the registers.
3959   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3960 
3961   // The struct intrinsic variants add one additional operand over raw.
3962   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3963   Register VIndex;
3964   int OpOffset = 0;
3965   if (HasVIndex) {
3966     VIndex = MI.getOperand(3).getReg();
3967     OpOffset = 1;
3968   } else {
3969     VIndex = B.buildConstant(S32, 0).getReg(0);
3970   }
3971 
3972   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3973   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3974 
3975   unsigned Format = 0;
3976   if (IsTyped) {
3977     Format = MI.getOperand(5 + OpOffset).getImm();
3978     ++OpOffset;
3979   }
3980 
3981   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3982   unsigned ImmOffset;
3983 
3984   LLT Ty = MRI.getType(Dst);
3985   LLT EltTy = Ty.getScalarType();
3986   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3987   const bool Unpacked = ST.hasUnpackedD16VMem();
3988 
3989   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3990   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3991 
3992   unsigned Opc;
3993 
3994   if (IsTyped) {
3995     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3996                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3997   } else if (IsFormat) {
3998     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3999                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4000   } else {
4001     switch (MemTy.getSizeInBits()) {
4002     case 8:
4003       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4004       break;
4005     case 16:
4006       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4007       break;
4008     default:
4009       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4010       break;
4011     }
4012   }
4013 
4014   Register LoadDstReg;
4015 
4016   bool IsExtLoad =
4017       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4018   LLT UnpackedTy = Ty.changeElementSize(32);
4019 
4020   if (IsExtLoad)
4021     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4022   else if (Unpacked && IsD16 && Ty.isVector())
4023     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4024   else
4025     LoadDstReg = Dst;
4026 
4027   auto MIB = B.buildInstr(Opc)
4028     .addDef(LoadDstReg)         // vdata
4029     .addUse(RSrc)               // rsrc
4030     .addUse(VIndex)             // vindex
4031     .addUse(VOffset)            // voffset
4032     .addUse(SOffset)            // soffset
4033     .addImm(ImmOffset);         // offset(imm)
4034 
4035   if (IsTyped)
4036     MIB.addImm(Format);
4037 
4038   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4039      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4040      .addMemOperand(MMO);
4041 
4042   if (LoadDstReg != Dst) {
4043     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4044 
4045     // The result register was widened for the extending load; truncate it back.
4046     if (IsExtLoad)
4047       B.buildTrunc(Dst, LoadDstReg);
4048     else {
4049       // Repack to original 16-bit vector result
4050       // FIXME: G_TRUNC should work, but legalization currently fails
4051       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4052       SmallVector<Register, 4> Repack;
4053       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4054         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4055       B.buildMerge(Dst, Repack);
4056     }
4057   }
4058 
4059   MI.eraseFromParent();
4060   return true;
4061 }
4062 
4063 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4064                                                MachineIRBuilder &B,
4065                                                bool IsInc) const {
4066   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4067                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4068   B.buildInstr(Opc)
4069     .addDef(MI.getOperand(0).getReg())
4070     .addUse(MI.getOperand(2).getReg())
4071     .addUse(MI.getOperand(3).getReg())
4072     .cloneMemRefs(MI);
4073   MI.eraseFromParent();
4074   return true;
4075 }
4076 
4077 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4078   switch (IntrID) {
4079   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4080   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4081     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4082   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4083   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4085   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4100   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4101   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4102     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4103   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4104   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4105     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4106   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4107   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4108     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4109   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4110   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4111     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4112   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4113   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4114     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4115   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4116   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4117     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4118   case Intrinsic::amdgcn_buffer_atomic_fadd:
4119   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4120   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4121     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4122   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4123   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4124     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4125   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4126   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4127     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4128   default:
4129     llvm_unreachable("unhandled atomic opcode");
4130   }
4131 }
4132 
4133 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4134                                                MachineIRBuilder &B,
4135                                                Intrinsic::ID IID) const {
4136   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4137                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4138   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4139 
4140   Register Dst;
4141 
4142   int OpOffset = 0;
4143   if (HasReturn) {
4144     // A few FP atomics do not support return values.
4145     Dst = MI.getOperand(0).getReg();
4146   } else {
4147     OpOffset = -1;
4148   }
4149 
4150   Register VData = MI.getOperand(2 + OpOffset).getReg();
4151   Register CmpVal;
4152 
4153   if (IsCmpSwap) {
4154     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4155     ++OpOffset;
4156   }
4157 
4158   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4159   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4160 
4161   // The struct intrinsic variants add one additional operand over raw.
4162   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4163   Register VIndex;
4164   if (HasVIndex) {
4165     VIndex = MI.getOperand(4 + OpOffset).getReg();
4166     ++OpOffset;
4167   } else {
4168     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4169   }
4170 
4171   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4172   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4173   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4174 
4175   MachineMemOperand *MMO = *MI.memoperands_begin();
4176 
4177   unsigned ImmOffset;
4178   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4179   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4180 
4181   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4182 
4183   if (HasReturn)
4184     MIB.addDef(Dst);
4185 
4186   MIB.addUse(VData); // vdata
4187 
4188   if (IsCmpSwap)
4189     MIB.addReg(CmpVal);
4190 
4191   MIB.addUse(RSrc)               // rsrc
4192      .addUse(VIndex)             // vindex
4193      .addUse(VOffset)            // voffset
4194      .addUse(SOffset)            // soffset
4195      .addImm(ImmOffset)          // offset(imm)
4196      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4197      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4198      .addMemOperand(MMO);
4199 
4200   MI.eraseFromParent();
4201   return true;
4202 }
4203 
4204 /// Turn the addressing operands of \p MI into dword sized registers of <2 x s16>
4205 /// elements, packing pairs of 16-bit operands and padding with undef where
/// needed; the results are appended to \p PackedAddrs.
4206 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4207                                       SmallVectorImpl<Register> &PackedAddrs,
4208                                       unsigned ArgOffset,
4209                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4210                                       bool IsA16, bool IsG16) {
4211   const LLT S16 = LLT::scalar(16);
4212   const LLT V2S16 = LLT::fixed_vector(2, 16);
4213   auto EndIdx = Intr->VAddrEnd;
4214 
4215   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4216     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4217     if (!SrcOp.isReg())
4218       continue; // _L to _LZ may have eliminated this.
4219 
4220     Register AddrReg = SrcOp.getReg();
4221 
4222     if ((I < Intr->GradientStart) ||
4223         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4224         (I >= Intr->CoordStart && !IsA16)) {
4225       // Handle any gradient or coordinate operands that should not be packed
4226       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4227       PackedAddrs.push_back(AddrReg);
4228     } else {
4229       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4230       // derivatives dx/dh and dx/dv are packed with undef.
4231       if (((I + 1) >= EndIdx) ||
4232           ((Intr->NumGradients / 2) % 2 == 1 &&
4233            (I == static_cast<unsigned>(Intr->GradientStart +
4234                                        (Intr->NumGradients / 2) - 1) ||
4235             I == static_cast<unsigned>(Intr->GradientStart +
4236                                        Intr->NumGradients - 1))) ||
4237           // Check for _L to _LZ optimization
4238           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4239         PackedAddrs.push_back(
4240             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4241                 .getReg(0));
4242       } else {
4243         PackedAddrs.push_back(
4244             B.buildBuildVector(
4245                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4246                 .getReg(0));
4247         ++I;
4248       }
4249     }
4250   }
4251 }
4252 
4253 /// Convert from separate vaddr components to a single vector address register,
4254 /// and replace the remaining operands with $noreg.
4255 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4256                                      int DimIdx, int NumVAddrs) {
4257   const LLT S32 = LLT::scalar(32);
4258 
4259   SmallVector<Register, 8> AddrRegs;
4260   for (int I = 0; I != NumVAddrs; ++I) {
4261     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4262     if (SrcOp.isReg()) {
4263       AddrRegs.push_back(SrcOp.getReg());
4264       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4265     }
4266   }
4267 
4268   int NumAddrRegs = AddrRegs.size();
4269   if (NumAddrRegs != 1) {
4270     // Above 8 elements round up to next power of 2 (i.e. 16).
4271     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4272       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4273       auto Undef = B.buildUndef(S32);
4274       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4275       NumAddrRegs = RoundedNumRegs;
4276     }
4277 
4278     auto VAddr =
4279         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4280     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4281   }
4282 
4283   for (int I = 1; I != NumVAddrs; ++I) {
4284     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4285     if (SrcOp.isReg())
4286       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4287   }
4288 }
4289 
4290 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4291 ///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
4296 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to
/// avoid defining a multitude of intermediate image instructions, directly
/// hack on the intrinsic's arguments. In cases like a16 addresses, this
/// requires padding now-unnecessary arguments with $noreg.
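/// For example, on an A16 target pairs of s16 coordinates are packed into
/// <2 x s16> registers, and vaddr operands that become unused are replaced
/// with $noreg.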
4303 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4304     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4305     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4306 
4307   const unsigned NumDefs = MI.getNumExplicitDefs();
4308   const unsigned ArgOffset = NumDefs + 1;
4309   bool IsTFE = NumDefs == 2;
4310   // We are only processing the operands of d16 image operations on subtargets
4311   // that use the unpacked register layout, or need to repack the TFE result.
4312 
4313   // TODO: Do we need to guard against already legalized intrinsics?
4314   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4315       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4316 
4317   MachineRegisterInfo *MRI = B.getMRI();
4318   const LLT S32 = LLT::scalar(32);
4319   const LLT S16 = LLT::scalar(16);
4320   const LLT V2S16 = LLT::fixed_vector(2, 16);
4321 
4322   unsigned DMask = 0;
4323 
  // Check whether the addresses and gradients are 16-bit; if so, they are
  // packed below.
4325   LLT GradTy =
4326       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4327   LLT AddrTy =
4328       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4329   const bool IsG16 = GradTy == S16;
4330   const bool IsA16 = AddrTy == S16;
4331 
4332   int DMaskLanes = 0;
4333   if (!BaseOpcode->Atomic) {
4334     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4335     if (BaseOpcode->Gather4) {
4336       DMaskLanes = 4;
4337     } else if (DMask != 0) {
4338       DMaskLanes = countPopulation(DMask);
4339     } else if (!IsTFE && !BaseOpcode->Store) {
4340       // If dmask is 0, this is a no-op load. This can be eliminated.
4341       B.buildUndef(MI.getOperand(0));
4342       MI.eraseFromParent();
4343       return true;
4344     }
4345   }
4346 
4347   Observer.changingInstr(MI);
4348   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4349 
4350   unsigned NewOpcode = NumDefs == 0 ?
4351     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4352 
4353   // Track that we legalized this
4354   MI.setDesc(B.getTII().get(NewOpcode));
4355 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4358   if (IsTFE && DMask == 0) {
4359     DMask = 0x1;
4360     DMaskLanes = 1;
4361     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4362   }
4363 
4364   if (BaseOpcode->Atomic) {
4365     Register VData0 = MI.getOperand(2).getReg();
4366     LLT Ty = MRI->getType(VData0);
4367 
4368     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4369     if (Ty.isVector())
4370       return false;
4371 
4372     if (BaseOpcode->AtomicX2) {
4373       Register VData1 = MI.getOperand(3).getReg();
4374       // The two values are packed in one register.
4375       LLT PackedTy = LLT::fixed_vector(2, Ty);
4376       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4377       MI.getOperand(2).setReg(Concat.getReg(0));
4378       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4379     }
4380   }
4381 
4382   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4383 
  // Optimize _L to _LZ when the LOD is a constant zero (or negative).
4385   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4386           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4387     const ConstantFP *ConstantLod;
4388 
4389     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4390                  m_GFCst(ConstantLod))) {
4391       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4392         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4393         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4394             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4395                                                       Intr->Dim);
4396 
4397         // The starting indexes should remain in the same place.
4398         --CorrectedNumVAddrs;
4399 
4400         MI.getOperand(MI.getNumExplicitDefs())
4401             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4402         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4403         Intr = NewImageDimIntr;
4404       }
4405     }
4406   }
4407 
4408   // Optimize _mip away, when 'lod' is zero
4409   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4410     int64_t ConstantLod;
4411     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4412                  m_ICst(ConstantLod))) {
4413       if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
4416         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4417         --CorrectedNumVAddrs;
4418       }
4419     }
4420   }
4421 
4422   // Rewrite the addressing register layout before doing anything else.
4423   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
4426     return false;
4427   }
4428 
4429   if (IsA16 && !ST.hasA16()) {
4430     // A16 not supported
4431     return false;
4432   }
4433 
4434   if (IsA16 || IsG16) {
4435     if (Intr->NumVAddrs > 1) {
4436       SmallVector<Register, 4> PackedRegs;
4437 
4438       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4439                                 IsG16);
4440 
4441       // See also below in the non-a16 branch
4442       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4443                           PackedRegs.size() <= ST.getNSAMaxSize();
4444 
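      // Without NSA, concatenate the packed pieces into a single wide vaddr,
      // e.g. three <2 x s16> registers become one <6 x s16>.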
4445       if (!UseNSA && PackedRegs.size() > 1) {
4446         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4447         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4448         PackedRegs[0] = Concat.getReg(0);
4449         PackedRegs.resize(1);
4450       }
4451 
4452       const unsigned NumPacked = PackedRegs.size();
4453       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4454         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4455         if (!SrcOp.isReg()) {
4456           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4457           continue;
4458         }
4459 
4460         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4461 
4462         if (I - Intr->VAddrStart < NumPacked)
4463           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4464         else
4465           SrcOp.setReg(AMDGPU::NoRegister);
4466       }
4467     }
4468   } else {
4469     // If the register allocator cannot place the address registers contiguously
4470     // without introducing moves, then using the non-sequential address encoding
4471     // is always preferable, since it saves VALU instructions and is usually a
4472     // wash in terms of code size or even better.
4473     //
4474     // However, we currently have no way of hinting to the register allocator
4475     // that MIMG addresses should be placed contiguously when it is possible to
4476     // do so, so force non-NSA for the common 2-address case as a heuristic.
4477     //
4478     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4479     // allocation when possible.
4480     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4481                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4482 
4483     if (!UseNSA && Intr->NumVAddrs > 1)
4484       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4485                                Intr->NumVAddrs);
4486   }
4487 
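  // Record the 16-bit address/derivative state as an immediate flags operand:
  // bit 0 = a16, bit 1 = g16.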
4488   int Flags = 0;
4489   if (IsA16)
4490     Flags |= 1;
4491   if (IsG16)
4492     Flags |= 2;
4493   MI.addOperand(MachineOperand::CreateImm(Flags));
4494 
4495   if (BaseOpcode->Store) { // No TFE for stores?
4496     // TODO: Handle dmask trim
4497     Register VData = MI.getOperand(1).getReg();
4498     LLT Ty = MRI->getType(VData);
4499     if (!Ty.isVector() || Ty.getElementType() != S16)
4500       return true;
4501 
4502     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4503     if (RepackedReg != VData) {
4504       MI.getOperand(1).setReg(RepackedReg);
4505     }
4506 
4507     return true;
4508   }
4509 
4510   Register DstReg = MI.getOperand(0).getReg();
4511   LLT Ty = MRI->getType(DstReg);
4512   const LLT EltTy = Ty.getScalarType();
4513   const bool IsD16 = Ty.getScalarType() == S16;
4514   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4515 
4516   // Confirm that the return type is large enough for the dmask specified
4517   if (NumElts < DMaskLanes)
4518     return false;
4519 
4520   if (NumElts > 4 || DMaskLanes > 4)
4521     return false;
4522 
4523   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4524   const LLT AdjustedTy =
4525       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4526 
4527   // The raw dword aligned data component of the load. The only legal cases
4528   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4530   LLT RoundedTy;
4531 
  // S32 vector to cover all data, plus TFE result element.
4533   LLT TFETy;
4534 
4535   // Register type to use for each loaded component. Will be S32 or V2S16.
4536   LLT RegTy;
4537 
4538   if (IsD16 && ST.hasUnpackedD16VMem()) {
4539     RoundedTy =
4540         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4541     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4542     RegTy = S32;
4543   } else {
4544     unsigned EltSize = EltTy.getSizeInBits();
4545     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4546     unsigned RoundedSize = 32 * RoundedElts;
4547     RoundedTy = LLT::scalarOrVector(
4548         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4549     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4550     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4551   }
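  // Example (packed d16): a <3 x s16> result rounds up to <4 x s16>, loaded
  // as <2 x s16> pieces; with TFE the load type is <3 x s32> instead and the
  // pieces are s32.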
4552 
4553   // The return type does not need adjustment.
4554   // TODO: Should we change s16 case to s32 or <2 x s16>?
4555   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4556     return true;
4557 
4558   Register Dst1Reg;
4559 
4560   // Insert after the instruction.
4561   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4562 
4563   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4564   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4565   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4566   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4567 
4568   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4569 
4570   MI.getOperand(0).setReg(NewResultReg);
4571 
  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
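  // E.g. a <4 x s32> result with TFE is loaded as <5 x s32>; the extra dword
  // holds the status value and is unmerged into Dst1Reg below.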
4576 
4577   if (IsTFE) {
4578     Dst1Reg = MI.getOperand(1).getReg();
4579     if (MRI->getType(Dst1Reg) != S32)
4580       return false;
4581 
4582     // TODO: Make sure the TFE operand bit is set.
4583     MI.RemoveOperand(1);
4584 
4585     // Handle the easy case that requires no repack instructions.
4586     if (Ty == S32) {
4587       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4588       return true;
4589     }
4590   }
4591 
4592   // Now figure out how to copy the new result register back into the old
4593   // result.
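  // Seeding every element with Dst1Reg means that, when TFE is used, the
  // trailing status dword is unmerged directly into its final destination.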
4594   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4595 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4597 
4598   if (ResultNumRegs == 1) {
4599     assert(!IsTFE);
4600     ResultRegs[0] = NewResultReg;
4601   } else {
4602     // We have to repack into a new vector of some kind.
4603     for (int I = 0; I != NumDataRegs; ++I)
4604       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4605     B.buildUnmerge(ResultRegs, NewResultReg);
4606 
4607     // Drop the final TFE element to get the data part. The TFE result is
4608     // directly written to the right place already.
4609     if (IsTFE)
4610       ResultRegs.resize(NumDataRegs);
4611   }
4612 
4613   // For an s16 scalar result, we form an s32 result with a truncate regardless
4614   // of packed vs. unpacked.
4615   if (IsD16 && !Ty.isVector()) {
4616     B.buildTrunc(DstReg, ResultRegs[0]);
4617     return true;
4618   }
4619 
4620   // Avoid a build/concat_vector of 1 entry.
4621   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4622     B.buildBitcast(DstReg, ResultRegs[0]);
4623     return true;
4624   }
4625 
4626   assert(Ty.isVector());
4627 
4628   if (IsD16) {
4629     // For packed D16 results with TFE enabled, all the data components are
4630     // S32. Cast back to the expected type.
4631     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4634     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4635       for (Register &Reg : ResultRegs)
4636         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4637     } else if (ST.hasUnpackedD16VMem()) {
4638       for (Register &Reg : ResultRegs)
4639         Reg = B.buildTrunc(S16, Reg).getReg(0);
4640     }
4641   }
4642 
4643   auto padWithUndef = [&](LLT Ty, int NumElts) {
4644     if (NumElts == 0)
4645       return;
4646     Register Undef = B.buildUndef(Ty).getReg(0);
4647     for (int I = 0; I != NumElts; ++I)
4648       ResultRegs.push_back(Undef);
4649   };
4650 
4651   // Pad out any elements eliminated due to the dmask.
4652   LLT ResTy = MRI->getType(ResultRegs[0]);
4653   if (!ResTy.isVector()) {
4654     padWithUndef(ResTy, NumElts - ResultRegs.size());
4655     B.buildBuildVector(DstReg, ResultRegs);
4656     return true;
4657   }
4658 
4659   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4660   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4661 
4662   // Deal with the one annoying legal case.
4663   const LLT V3S16 = LLT::fixed_vector(3, 16);
4664   if (Ty == V3S16) {
4665     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4666     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4667     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4668     return true;
4669   }
4670 
4671   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4672   B.buildConcatVectors(DstReg, ResultRegs);
4673   return true;
4674 }
4675 
4676 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4677   LegalizerHelper &Helper, MachineInstr &MI) const {
4678   MachineIRBuilder &B = Helper.MIRBuilder;
4679   GISelChangeObserver &Observer = Helper.Observer;
4680 
4681   Register Dst = MI.getOperand(0).getReg();
4682   LLT Ty = B.getMRI()->getType(Dst);
4683   unsigned Size = Ty.getSizeInBits();
4684   MachineFunction &MF = B.getMF();
4685 
4686   Observer.changingInstr(MI);
4687 
4688   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4689     Ty = getBitcastRegisterType(Ty);
4690     Helper.bitcastDst(MI, Ty, 0);
4691     Dst = MI.getOperand(0).getReg();
4692     B.setInsertPt(B.getMBB(), MI);
4693   }
4694 
4695   // FIXME: We don't really need this intermediate instruction. The intrinsic
4696   // should be fixed to have a memory operand. Since it's readnone, we're not
4697   // allowed to add one.
4698   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4699   MI.RemoveOperand(1); // Remove intrinsic ID
4700 
4701   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4702   // TODO: Should this use datalayout alignment?
4703   const unsigned MemSize = (Size + 7) / 8;
4704   const Align MemAlign(4);
4705   MachineMemOperand *MMO = MF.getMachineMemOperand(
4706       MachinePointerInfo(),
4707       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4708           MachineMemOperand::MOInvariant,
4709       MemSize, MemAlign);
4710   MI.addMemOperand(MF, MMO);
4711 
4712   // There are no 96-bit result scalar loads, but widening to 128-bit should
4713   // always be legal. We may need to restore this to a 96-bit result if it turns
4714   // out this needs to be converted to a vector load during RegBankSelect.
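  // E.g. an s96 result widens to s128 and <3 x s32> to <4 x s32>.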
4715   if (!isPowerOf2_32(Size)) {
4716     if (Ty.isVector())
4717       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4718     else
4719       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4720   }
4721 
4722   Observer.changedInstr(MI);
4723   return true;
4724 }
4725 
4726 // TODO: Move to selection
4727 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4728                                                 MachineRegisterInfo &MRI,
4729                                                 MachineIRBuilder &B) const {
4730   if (!ST.isTrapHandlerEnabled() ||
4731       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4732     return legalizeTrapEndpgm(MI, MRI, B);
4733 
4734   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4735     switch (*HsaAbiVer) {
4736     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4737     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4738       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4739     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4740       return ST.supportsGetDoorbellID() ?
4741           legalizeTrapHsa(MI, MRI, B) :
4742           legalizeTrapHsaQueuePtr(MI, MRI, B);
4743     }
4744   }
4745 
4746   llvm_unreachable("Unknown trap handler");
4747 }
4748 
4749 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4750     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4751   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4752   MI.eraseFromParent();
4753   return true;
4754 }
4755 
4756 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4757     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4758   // Pass queue pointer to trap handler as input, and insert trap instruction
4759   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
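  // The trap handler expects the queue pointer in SGPR0_SGPR1.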
4760   Register LiveIn =
4761     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4762   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4763     return false;
4764 
4765   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4766   B.buildCopy(SGPR01, LiveIn);
4767   B.buildInstr(AMDGPU::S_TRAP)
4768       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4769       .addReg(SGPR01, RegState::Implicit);
4770 
4771   MI.eraseFromParent();
4772   return true;
4773 }
4774 
4775 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4776     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4777   B.buildInstr(AMDGPU::S_TRAP)
4778       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4779   MI.eraseFromParent();
4780   return true;
4781 }
4782 
4783 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4784     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4787   if (!ST.isTrapHandlerEnabled() ||
4788       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4789     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4790                                      "debugtrap handler not supported",
4791                                      MI.getDebugLoc(), DS_Warning);
4792     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4793     Ctx.diagnose(NoTrap);
4794   } else {
4795     // Insert debug-trap instruction
4796     B.buildInstr(AMDGPU::S_TRAP)
4797         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4798   }
4799 
4800   MI.eraseFromParent();
4801   return true;
4802 }
4803 
4804 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4805                                                MachineIRBuilder &B) const {
4806   MachineRegisterInfo &MRI = *B.getMRI();
4807   const LLT S16 = LLT::scalar(16);
4808   const LLT S32 = LLT::scalar(32);
4809 
4810   Register DstReg = MI.getOperand(0).getReg();
4811   Register NodePtr = MI.getOperand(2).getReg();
4812   Register RayExtent = MI.getOperand(3).getReg();
4813   Register RayOrigin = MI.getOperand(4).getReg();
4814   Register RayDir = MI.getOperand(5).getReg();
4815   Register RayInvDir = MI.getOperand(6).getReg();
4816   Register TDescr = MI.getOperand(7).getReg();
4817 
4818   if (!ST.hasGFX10_AEncoding()) {
4819     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4820                                         "intrinsic not supported on subtarget",
4821                                         MI.getDebugLoc());
4822     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4823     return false;
4824   }
4825 
4826   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4827   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4828   const unsigned NumVDataDwords = 4;
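  // vaddr dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir/ray_inv_dir (6 unpacked, or 3 when packed as a16).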
4829   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4830   const bool UseNSA =
4831       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4832   const unsigned BaseOpcodes[2][2] = {
4833       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4834       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4835        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4836   int Opcode;
4837   if (UseNSA) {
4838     Opcode =
4839         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4840                               NumVDataDwords, NumVAddrDwords);
4841   } else {
4842     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4843                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4844                                    PowerOf2Ceil(NumVAddrDwords));
4845   }
4846   assert(Opcode != -1);
4847 
4848   SmallVector<Register, 12> Ops;
4849   if (Is64) {
4850     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4851     Ops.push_back(Unmerge.getReg(0));
4852     Ops.push_back(Unmerge.getReg(1));
4853   } else {
4854     Ops.push_back(NodePtr);
4855   }
4856   Ops.push_back(RayExtent);
4857 
4858   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4859     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4860     Ops.push_back(Unmerge.getReg(0));
4861     Ops.push_back(Unmerge.getReg(1));
4862     Ops.push_back(Unmerge.getReg(2));
4863   };
4864 
4865   packLanes(RayOrigin);
4866   if (IsA16) {
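    // Pack the six 16-bit ray_dir/ray_inv_dir components into three dwords:
    // (dir.x, dir.y), (dir.z, inv_dir.x), (inv_dir.y, inv_dir.z).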
4867     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4868     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4869     Register R1 = MRI.createGenericVirtualRegister(S32);
4870     Register R2 = MRI.createGenericVirtualRegister(S32);
4871     Register R3 = MRI.createGenericVirtualRegister(S32);
4872     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4873     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4874     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4875     Ops.push_back(R1);
4876     Ops.push_back(R2);
4877     Ops.push_back(R3);
4878   } else {
4879     packLanes(RayDir);
4880     packLanes(RayInvDir);
4881   }
4882 
4883   if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
4885     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4886     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4887     Ops.clear();
4888     Ops.push_back(MergedOps);
4889   }
4890 
4891   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4892     .addDef(DstReg)
4893     .addImm(Opcode);
4894 
4895   for (Register R : Ops) {
4896     MIB.addUse(R);
4897   }
4898 
4899   MIB.addUse(TDescr)
4900      .addImm(IsA16 ? 1 : 0)
4901      .cloneMemRefs(MI);
4902 
4903   MI.eraseFromParent();
4904   return true;
4905 }
4906 
4907 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4908                                             MachineInstr &MI) const {
4909   MachineIRBuilder &B = Helper.MIRBuilder;
4910   MachineRegisterInfo &MRI = *B.getMRI();
4911 
  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
4913   auto IntrID = MI.getIntrinsicID();
4914   switch (IntrID) {
4915   case Intrinsic::amdgcn_if:
4916   case Intrinsic::amdgcn_else: {
4917     MachineInstr *Br = nullptr;
4918     MachineBasicBlock *UncondBrTarget = nullptr;
4919     bool Negated = false;
4920     if (MachineInstr *BrCond =
4921             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4922       const SIRegisterInfo *TRI
4923         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4924 
4925       Register Def = MI.getOperand(1).getReg();
4926       Register Use = MI.getOperand(3).getReg();
4927 
4928       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4929 
4930       if (Negated)
4931         std::swap(CondBrTarget, UncondBrTarget);
4932 
4933       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4934       if (IntrID == Intrinsic::amdgcn_if) {
4935         B.buildInstr(AMDGPU::SI_IF)
4936           .addDef(Def)
4937           .addUse(Use)
4938           .addMBB(UncondBrTarget);
4939       } else {
4940         B.buildInstr(AMDGPU::SI_ELSE)
4941             .addDef(Def)
4942             .addUse(Use)
4943             .addMBB(UncondBrTarget);
4944       }
4945 
4946       if (Br) {
4947         Br->getOperand(0).setMBB(CondBrTarget);
4948       } else {
4949         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4950         // since we're swapping branch targets it needs to be reinserted.
4951         // FIXME: IRTranslator should probably not do this
4952         B.buildBr(*CondBrTarget);
4953       }
4954 
4955       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4956       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4957       MI.eraseFromParent();
4958       BrCond->eraseFromParent();
4959       return true;
4960     }
4961 
4962     return false;
4963   }
4964   case Intrinsic::amdgcn_loop: {
4965     MachineInstr *Br = nullptr;
4966     MachineBasicBlock *UncondBrTarget = nullptr;
4967     bool Negated = false;
4968     if (MachineInstr *BrCond =
4969             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4970       const SIRegisterInfo *TRI
4971         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4972 
4973       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4974       Register Reg = MI.getOperand(2).getReg();
4975 
4976       if (Negated)
4977         std::swap(CondBrTarget, UncondBrTarget);
4978 
4979       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4980       B.buildInstr(AMDGPU::SI_LOOP)
4981         .addUse(Reg)
4982         .addMBB(UncondBrTarget);
4983 
4984       if (Br)
4985         Br->getOperand(0).setMBB(CondBrTarget);
4986       else
4987         B.buildBr(*CondBrTarget);
4988 
4989       MI.eraseFromParent();
4990       BrCond->eraseFromParent();
4991       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4992       return true;
4993     }
4994 
4995     return false;
4996   }
4997   case Intrinsic::amdgcn_kernarg_segment_ptr:
4998     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4999       // This only makes sense to call in a kernel, so just lower to null.
5000       B.buildConstant(MI.getOperand(0).getReg(), 0);
5001       MI.eraseFromParent();
5002       return true;
5003     }
5004 
5005     return legalizePreloadedArgIntrin(
5006       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5007   case Intrinsic::amdgcn_implicitarg_ptr:
5008     return legalizeImplicitArgPtr(MI, MRI, B);
5009   case Intrinsic::amdgcn_workitem_id_x:
5010     return legalizePreloadedArgIntrin(MI, MRI, B,
5011                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5012   case Intrinsic::amdgcn_workitem_id_y:
5013     return legalizePreloadedArgIntrin(MI, MRI, B,
5014                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5015   case Intrinsic::amdgcn_workitem_id_z:
5016     return legalizePreloadedArgIntrin(MI, MRI, B,
5017                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5018   case Intrinsic::amdgcn_workgroup_id_x:
5019     return legalizePreloadedArgIntrin(MI, MRI, B,
5020                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5021   case Intrinsic::amdgcn_workgroup_id_y:
5022     return legalizePreloadedArgIntrin(MI, MRI, B,
5023                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5024   case Intrinsic::amdgcn_workgroup_id_z:
5025     return legalizePreloadedArgIntrin(MI, MRI, B,
5026                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5027   case Intrinsic::amdgcn_dispatch_ptr:
5028     return legalizePreloadedArgIntrin(MI, MRI, B,
5029                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5030   case Intrinsic::amdgcn_queue_ptr:
5031     return legalizePreloadedArgIntrin(MI, MRI, B,
5032                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5033   case Intrinsic::amdgcn_implicit_buffer_ptr:
5034     return legalizePreloadedArgIntrin(
5035       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5036   case Intrinsic::amdgcn_dispatch_id:
5037     return legalizePreloadedArgIntrin(MI, MRI, B,
5038                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5039   case Intrinsic::amdgcn_fdiv_fast:
5040     return legalizeFDIVFastIntrin(MI, MRI, B);
5041   case Intrinsic::amdgcn_is_shared:
5042     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5043   case Intrinsic::amdgcn_is_private:
5044     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5045   case Intrinsic::amdgcn_wavefrontsize: {
5046     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5047     MI.eraseFromParent();
5048     return true;
5049   }
5050   case Intrinsic::amdgcn_s_buffer_load:
5051     return legalizeSBufferLoad(Helper, MI);
5052   case Intrinsic::amdgcn_raw_buffer_store:
5053   case Intrinsic::amdgcn_struct_buffer_store:
5054     return legalizeBufferStore(MI, MRI, B, false, false);
5055   case Intrinsic::amdgcn_raw_buffer_store_format:
5056   case Intrinsic::amdgcn_struct_buffer_store_format:
5057     return legalizeBufferStore(MI, MRI, B, false, true);
5058   case Intrinsic::amdgcn_raw_tbuffer_store:
5059   case Intrinsic::amdgcn_struct_tbuffer_store:
5060     return legalizeBufferStore(MI, MRI, B, true, true);
5061   case Intrinsic::amdgcn_raw_buffer_load:
5062   case Intrinsic::amdgcn_struct_buffer_load:
5063     return legalizeBufferLoad(MI, MRI, B, false, false);
5064   case Intrinsic::amdgcn_raw_buffer_load_format:
5065   case Intrinsic::amdgcn_struct_buffer_load_format:
5066     return legalizeBufferLoad(MI, MRI, B, true, false);
5067   case Intrinsic::amdgcn_raw_tbuffer_load:
5068   case Intrinsic::amdgcn_struct_tbuffer_load:
5069     return legalizeBufferLoad(MI, MRI, B, true, true);
5070   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5071   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5072   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5073   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5074   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5075   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5076   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5077   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5078   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5079   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5080   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5081   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5082   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5083   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5084   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5085   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5086   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5087   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5088   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5089   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5090   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5091   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5092   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5093   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5094   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5098   case Intrinsic::amdgcn_buffer_atomic_fadd:
5099   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5100   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5101   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5102   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5103     return legalizeBufferAtomic(MI, B, IntrID);
5104   case Intrinsic::amdgcn_atomic_inc:
5105     return legalizeAtomicIncDec(MI, B, true);
5106   case Intrinsic::amdgcn_atomic_dec:
5107     return legalizeAtomicIncDec(MI, B, false);
5108   case Intrinsic::trap:
5109     return legalizeTrapIntrinsic(MI, MRI, B);
5110   case Intrinsic::debugtrap:
5111     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5112   case Intrinsic::amdgcn_rsq_clamp:
5113     return legalizeRsqClampIntrinsic(MI, MRI, B);
5114   case Intrinsic::amdgcn_ds_fadd:
5115   case Intrinsic::amdgcn_ds_fmin:
5116   case Intrinsic::amdgcn_ds_fmax:
5117     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5118   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5119     return legalizeBVHIntrinsic(MI, B);
5120   default: {
5121     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5122             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5123       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5124     return true;
5125   }
5126   }
5127 
5128   return true;
5129 }
5130