1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
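// Matches vectors of 16-bit elements with more than two elements, i.e. wider
// than v2s16.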
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
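// Mutation that adds one more element to the vector type, e.g. turning
// <3 x s16> into <4 x s16>.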
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
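// Mutation that reduces the number of vector elements so each resulting piece
// fits in 64 bits, e.g. <4 x s32> becomes <2 x s32>.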
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size becomes a multiple
// of 32 bits.
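// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).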
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
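// Map a type to the register type it should be coerced to: types of 32 bits
// or less become a scalar of the same size, and larger types become s32 or a
// vector of 32-bit elements (e.g. <8 x s8> -> <2 x s32>).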
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
  if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
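// Mutation that bitcasts to the register type returned by
// getBitcastRegisterType.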
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
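// Mutation that bitcasts a type whose size is a multiple of 32 bits to s32 or
// a vector of 32-bit elements.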
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
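// True if the size in bits is a multiple of 32 and does not exceed the
// maximum register size.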
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
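// True if this element type can appear directly in a register-sized vector:
// 16-bit elements or multiples of 32 bits.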
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
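// True if this vector type maps directly onto registers: 32-bit, 64-bit,
// 128-bit or 256-bit elements, or an even number of 16-bit elements.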
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
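// Check whether the memory size, address space, and alignment of a load or
// store can be handled directly.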
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
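// A load or store is legal if the type is a register type, the memory access
// size is legal, and it does not need the selector bitcast workaround.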
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen the access up to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
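// LegalityQuery overload of shouldWidenLoad. Never widen atomic accesses.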
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElementsStrict(0, S16, 2)
969       // FIXME: Fixing non-power-of-2 before clamp is workaround for
970       // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
997       // FIXME: Fixing non-power-of-2 before clamp is workaround for
998       // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117                                       {S64, ConstantPtr, S64, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
              unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(AlignBits));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
1298   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299   // demarshalling
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
1386       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387                  bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1430           // TODO: verify immediate offset here since lower only works with
1431           // whole elements.
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return BigTy.isVector();
1434         })
1435       // FIXME: Multiples of 16 should not be legal.
1436       .legalIf([=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (BigTy.getSizeInBits() % 32 == 0) &&
1440                  (LitTy.getSizeInBits() % 16 == 0);
1441         })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
1456 
1457   }
1458 
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472 
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1476     BuildVector.minScalarOrElt(0, S32);
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480 
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485 
1486   BuildVector.legalIf(isRegisterType(0));
1487 
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496   // pre-legalize.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520 
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525           const LLT BigTy = Query.Types[BigTyIdx];
1526           return BigTy.getSizeInBits() == 32;
1527         })
1528       // Try to widen to s16 first for small types.
1529       // TODO: Only do this on targets with legal s16 shifts
1530       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534                            elementTypeIs(1, S16)),
1535                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1539       .clampScalar(LitTyIdx, S32, S512)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541       // Break up vectors with weird elements into scalars
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544         scalarize(0))
1545       .fewerElementsIf(
1546         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547         scalarize(1))
1548       .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550     if (Op == G_MERGE_VALUES) {
1551       Builder.widenScalarIf(
1552         // TODO: Use 16-bit shifts if legal for 8-bit values?
1553         [=](const LegalityQuery &Query) {
1554           const LLT Ty = Query.Types[LitTyIdx];
1555           return Ty.getSizeInBits() < 32;
1556         },
1557         changeTo(LitTyIdx, S32));
1558     }
1559 
1560     Builder.widenScalarIf(
1561       [=](const LegalityQuery &Query) {
1562         const LLT Ty = Query.Types[BigTyIdx];
1563         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564           Ty.getSizeInBits() % 16 != 0;
1565       },
1566       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1569         const LLT &Ty = Query.Types[BigTyIdx];
1570         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571         if (NewSizeInBits >= 256) {
1572           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573           if (RoundedTo < NewSizeInBits)
1574             NewSizeInBits = RoundedTo;
1575         }
1576         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577       })
1578       // Any vectors left are the wrong size. Scalarize them.
1579       .scalarize(0)
1580       .scalarize(1);
1581   }
1582 
1583   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584   // RegBankSelect.
1585   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586     .legalFor({{S32}, {S64}});
1587 
1588   if (ST.hasVOP3PInsts()) {
1589     SextInReg.lowerFor({{V2S16}})
1590       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591       // get more vector shift opportunities, since we'll get those when
1592       // expanded.
1593       .clampMaxNumElementsStrict(0, S16, 2);
1594   } else if (ST.has16BitInsts()) {
1595     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1599     SextInReg.lowerFor({{S32}, {S64}});
1600   }
1601 
1602   SextInReg
1603     .scalarize(0)
1604     .clampScalar(0, S32, S64)
1605     .lower();
1606 
1607   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608     .scalarize(0)
1609     .lower();
1610 
  // TODO: Only try to form v2s16 with legal packed instructions.
1612   getActionDefinitionsBuilder(G_FSHR)
1613     .legalFor({{S32, S32}})
1614     .lowerFor({{V2S16, V2S16}})
1615     .clampMaxNumElementsStrict(0, S16, 2)
1616     .scalarize(0)
1617     .lower();
1618 
1619   if (ST.hasVOP3PInsts()) {
1620     getActionDefinitionsBuilder(G_FSHL)
1621       .lowerFor({{V2S16, V2S16}})
1622       .clampMaxNumElementsStrict(0, S16, 2)
1623       .scalarize(0)
1624       .lower();
1625   } else {
1626     getActionDefinitionsBuilder(G_FSHL)
1627       .scalarize(0)
1628       .lower();
1629   }
1630 
1631   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632     .legalFor({S64});
1633 
1634   getActionDefinitionsBuilder(G_FENCE)
1635     .alwaysLegal();
1636 
1637   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638       .scalarize(0)
1639       .minScalar(0, S32)
1640       .lower();
1641 
1642   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643       .legalFor({{S32, S32}, {S64, S32}})
1644       .clampScalar(1, S32, S32)
1645       .clampScalar(0, S32, S64)
1646       .widenScalarToNextPow2(0)
1647       .scalarize(0);
1648 
1649   getActionDefinitionsBuilder({
1650       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651       G_FCOPYSIGN,
1652 
1653       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654       G_ATOMICRMW_NAND,
1655       G_ATOMICRMW_FSUB,
1656       G_READ_REGISTER,
1657       G_WRITE_REGISTER,
1658 
1659       G_SADDO, G_SSUBO,
1660 
      // TODO: Implement
1662       G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   case TargetOpcode::G_CTLZ:
1747   case TargetOpcode::G_CTTZ:
1748     return legalizeCTLZ_CTTZ(MI, MRI, B);
1749   default:
1750     return false;
1751   }
1752 
1753   llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757   unsigned AS,
1758   MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762   const LLT S32 = LLT::scalar(32);
1763 
1764   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766   if (ST.hasApertureRegs()) {
1767     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768     // getreg.
1769     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
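    // Pack the hwreg(id, offset, width) immediate expected by s_getreg_b32.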
1775     unsigned Encoding =
1776         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782     B.buildInstr(AMDGPU::S_GETREG_B32)
1783       .addDef(GetReg)
1784       .addImm(Encoding);
1785     MRI.setType(GetReg, S32);
1786 
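    // The hwreg field holds the upper bits of the aperture; shift it back into
    // place to form the high 32 bits of the segment base address.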
1787     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789   }
1790 
1791   Register QueuePtr = MRI.createGenericVirtualRegister(
1792     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795     return Register();
1796 
1797   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798   // private_segment_aperture_base_hi.
1799   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
  // TODO: Can we be smarter about machine pointer info?
1802   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803   MachineMemOperand *MMO = MF.getMachineMemOperand(
1804       PtrInfo,
1805       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806           MachineMemOperand::MOInvariant,
1807       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809   Register LoadAddr;
1810 
1811   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 /// Return true if the value is a known valid address, such that a null check is
1816 /// not necessary.
1817 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1818                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1819   MachineInstr *Def = MRI.getVRegDef(Val);
1820   switch (Def->getOpcode()) {
1821   case AMDGPU::G_FRAME_INDEX:
1822   case AMDGPU::G_GLOBAL_VALUE:
1823   case AMDGPU::G_BLOCK_ADDR:
1824     return true;
1825   case AMDGPU::G_CONSTANT: {
1826     const ConstantInt *CI = Def->getOperand(1).getCImm();
1827     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1828   }
1829   default:
1830     return false;
1831   }
1832 
1833   return false;
1834 }
1835 
1836 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1837   MachineInstr &MI, MachineRegisterInfo &MRI,
1838   MachineIRBuilder &B) const {
1839   MachineFunction &MF = B.getMF();
1840 
1841   const LLT S32 = LLT::scalar(32);
1842   Register Dst = MI.getOperand(0).getReg();
1843   Register Src = MI.getOperand(1).getReg();
1844 
1845   LLT DstTy = MRI.getType(Dst);
1846   LLT SrcTy = MRI.getType(Src);
1847   unsigned DestAS = DstTy.getAddressSpace();
1848   unsigned SrcAS = SrcTy.getAddressSpace();
1849 
1850   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1851   // vector element.
1852   assert(!DstTy.isVector());
1853 
1854   const AMDGPUTargetMachine &TM
1855     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1856 
1857   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1858     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1859     return true;
1860   }
1861 
1862   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863     // Truncate.
1864     B.buildExtract(Dst, Src, 0);
1865     MI.eraseFromParent();
1866     return true;
1867   }
1868 
1869   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1870     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1871     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1872 
1873     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1874     // another. Merge operands are required to be the same type, but creating an
1875     // extra ptrtoint would be kind of pointless.
1876     auto HighAddr = B.buildConstant(
1877       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1878     B.buildMerge(Dst, {Src, HighAddr});
1879     MI.eraseFromParent();
1880     return true;
1881   }
1882 
1883   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1884     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1885            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1886 
1887     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888       // Extract low 32-bits of the pointer.
1889       B.buildExtract(Dst, Src, 0);
1890       MI.eraseFromParent();
1891       return true;
1892     }
1893 
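    // A null flat pointer must map to the segment null value; otherwise the
    // segment pointer is just the low 32 bits of the flat pointer.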
1894     unsigned NullVal = TM.getNullPointerValue(DestAS);
1895 
1896     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897     auto FlatNull = B.buildConstant(SrcTy, 0);
1898 
1899     // Extract low 32-bits of the pointer.
1900     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901 
1902     auto CmpRes =
1903         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905 
1906     MI.eraseFromParent();
1907     return true;
1908   }
1909 
1910   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1911     return false;
1912 
1913   if (!ST.hasFlatAddressSpace())
1914     return false;
1915 
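  // Local/private -> flat: the flat pointer combines the 32-bit segment offset
  // with the aperture base as the high 32 bits, unless the source is the
  // segment null value.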
1916   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917   if (!ApertureReg.isValid())
1918     return false;
1919 
1920   // Coerce the type of the low half of the result so we can use merge_values.
1921   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922 
1923   // TODO: Should we allow mismatched types but matching sizes in merges to
1924   // avoid the ptrtoint?
1925   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926 
1927   if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928     B.buildCopy(Dst, BuildPtr);
1929     MI.eraseFromParent();
1930     return true;
1931   }
1932 
1933   auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934   auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935 
1936   auto CmpRes =
1937       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1938 
1939   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940 
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeFrint(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   Register Src = MI.getOperand(1).getReg();
1949   LLT Ty = MRI.getType(Src);
1950   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1951 
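  // C1 is 2^52: adding it with the sign of the input and subtracting it again
  // rounds to the nearest integer. Values with a magnitude above C2 (just
  // below 2^52) are already integers and are passed through unchanged.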
1952   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1953   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1954 
1955   auto C1 = B.buildFConstant(Ty, C1Val);
1956   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1957 
1958   // TODO: Should this propagate fast-math-flags?
1959   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1960   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1961 
1962   auto C2 = B.buildFConstant(Ty, C2Val);
1963   auto Fabs = B.buildFAbs(Ty, Src);
1964 
1965   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1966   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1967   MI.eraseFromParent();
1968   return true;
1969 }
1970 
1971 bool AMDGPULegalizerInfo::legalizeFceil(
1972   MachineInstr &MI, MachineRegisterInfo &MRI,
1973   MachineIRBuilder &B) const {
1974 
1975   const LLT S1 = LLT::scalar(1);
1976   const LLT S64 = LLT::scalar(64);
1977 
1978   Register Src = MI.getOperand(1).getReg();
1979   assert(MRI.getType(Src) == S64);
1980 
1981   // result = trunc(src)
1982   // if (src > 0.0 && src != result)
1983   //   result += 1.0
1984 
1985   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1986 
1987   const auto Zero = B.buildFConstant(S64, 0.0);
1988   const auto One = B.buildFConstant(S64, 1.0);
1989   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1990   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1991   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1992   auto Add = B.buildSelect(S64, And, One, Zero);
1993 
1994   // TODO: Should this propagate fast-math-flags?
1995   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1996   return true;
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeFrem(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002     Register DstReg = MI.getOperand(0).getReg();
2003     Register Src0Reg = MI.getOperand(1).getReg();
2004     Register Src1Reg = MI.getOperand(2).getReg();
2005     auto Flags = MI.getFlags();
2006     LLT Ty = MRI.getType(DstReg);
2007 
2008     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2009     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2010     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2011     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2012     MI.eraseFromParent();
2013     return true;
2014 }
2015 
2016 static MachineInstrBuilder extractF64Exponent(Register Hi,
2017                                               MachineIRBuilder &B) {
2018   const unsigned FractBits = 52;
2019   const unsigned ExpBits = 11;
2020   LLT S32 = LLT::scalar(32);
2021 
2022   auto Const0 = B.buildConstant(S32, FractBits - 32);
2023   auto Const1 = B.buildConstant(S32, ExpBits);
2024 
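  // The f64 exponent occupies bits [62:52], i.e. bits [30:20] of the high
  // word. Extract the 11-bit field with ubfe and subtract the bias (1023).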
2025   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2026     .addUse(Hi)
2027     .addUse(Const0.getReg(0))
2028     .addUse(Const1.getReg(0));
2029 
2030   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2031 }
2032 
2033 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2034   MachineInstr &MI, MachineRegisterInfo &MRI,
2035   MachineIRBuilder &B) const {
2036   const LLT S1 = LLT::scalar(1);
2037   const LLT S32 = LLT::scalar(32);
2038   const LLT S64 = LLT::scalar(64);
2039 
2040   Register Src = MI.getOperand(1).getReg();
2041   assert(MRI.getType(Src) == S64);
2042 
2043   // TODO: Should this use extract since the low half is unused?
2044   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2045   Register Hi = Unmerge.getReg(1);
2046 
2047   // Extract the upper half, since this is where we will find the sign and
2048   // exponent.
2049   auto Exp = extractF64Exponent(Hi, B);
2050 
2051   const unsigned FractBits = 52;
2052 
2053   // Extract the sign bit.
2054   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2055   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2056 
2057   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2058 
2059   const auto Zero32 = B.buildConstant(S32, 0);
2060 
2061   // Extend back to 64-bits.
2062   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2063 
2064   auto Shr = B.buildAShr(S64, FractMask, Exp);
2065   auto Not = B.buildNot(S64, Shr);
2066   auto Tmp0 = B.buildAnd(S64, Src, Not);
2067   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2068 
2069   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2070   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2071 
2072   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2073   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2074   MI.eraseFromParent();
2075   return true;
2076 }
2077 
2078 bool AMDGPULegalizerInfo::legalizeITOFP(
2079   MachineInstr &MI, MachineRegisterInfo &MRI,
2080   MachineIRBuilder &B, bool Signed) const {
2081 
2082   Register Dst = MI.getOperand(0).getReg();
2083   Register Src = MI.getOperand(1).getReg();
2084 
2085   const LLT S64 = LLT::scalar(64);
2086   const LLT S32 = LLT::scalar(32);
2087 
2088   assert(MRI.getType(Src) == S64);
2089 
2090   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2091   auto ThirtyTwo = B.buildConstant(S32, 32);
2092 
2093   if (MRI.getType(Dst) == S64) {
2094     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2095                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2096 
2097     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2098     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2099                      .addUse(CvtHi.getReg(0))
2100                      .addUse(ThirtyTwo.getReg(0));
2101 
2102     // TODO: Should this propagate fast-math-flags?
2103     B.buildFAdd(Dst, LdExp, CvtLo);
2104     MI.eraseFromParent();
2105     return true;
2106   }
2107 
2108   assert(MRI.getType(Dst) == S32);
2109 
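  // Convert to f32 by shifting the value so its significant bits land in the
  // high word, converting that word, and scaling the result back with ldexp.
  // Any bits remaining in the low word are folded into a sticky bit so the
  // rounding of the conversion stays correct.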
2110   auto One = B.buildConstant(S32, 1);
2111 
2112   MachineInstrBuilder ShAmt;
2113   if (Signed) {
2114     auto ThirtyOne = B.buildConstant(S32, 31);
2115     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2116     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2117     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2118     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2119                                /*HasSideEffects=*/false)
2120                   .addUse(Unmerge.getReg(1));
2121     auto LS2 = B.buildSub(S32, LS, One);
2122     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2123   } else
2124     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2125   auto Norm = B.buildShl(S64, Src, ShAmt);
2126   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2127   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2128   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2129   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2130   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2131   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2132                    /*HasSideEffects=*/false)
2133       .addUse(FVal.getReg(0))
2134       .addUse(Scale.getReg(0));
2135   MI.eraseFromParent();
2136   return true;
2137 }
2138 
2139 // TODO: Copied from DAG implementation. Verify logic and document how this
2140 // actually works.
2141 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2142                                         MachineRegisterInfo &MRI,
2143                                         MachineIRBuilder &B,
2144                                         bool Signed) const {
2145 
2146   Register Dst = MI.getOperand(0).getReg();
2147   Register Src = MI.getOperand(1).getReg();
2148 
2149   const LLT S64 = LLT::scalar(64);
2150   const LLT S32 = LLT::scalar(32);
2151 
2152   const LLT SrcLT = MRI.getType(Src);
2153   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2154 
2155   unsigned Flags = MI.getFlags();
2156 
2157   // The basic idea of converting a floating point number into a pair of 32-bit
2158   // integers is illustrated as follows:
2159   //
2160   //     tf := trunc(val);
2161   //    hif := floor(tf * 2^-32);
2162   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2163   //     hi := fptoi(hif);
2164   //     lo := fptoi(lof);
2165   //
2166   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2167   MachineInstrBuilder Sign;
2168   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2174     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2175     Trunc = B.buildFAbs(S32, Trunc, Flags);
2176   }
2177   MachineInstrBuilder K0, K1;
2178   if (SrcLT == S64) {
2179     K0 = B.buildFConstant(S64,
2180                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2181     K1 = B.buildFConstant(S64,
2182                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2183   } else {
2184     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2185     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2186   }
2187 
2188   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2189   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2190   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2191 
2192   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2193                                      : B.buildFPTOUI(S32, FloorMul);
2194   auto Lo = B.buildFPTOUI(S32, Fma);
2195 
2196   if (Signed && SrcLT == S32) {
2197     // Flip the result based on the signedness, which is either all 0s or 1s.
2198     Sign = B.buildMerge(S64, {Sign, Sign});
2199     // r := xor({lo, hi}, sign) - sign;
2200     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2201   } else
2202     B.buildMerge(Dst, {Lo, Hi});
2203   MI.eraseFromParent();
2204 
2205   return true;
2206 }
2207 
2208 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2209                                                MachineInstr &MI) const {
2210   MachineFunction &MF = Helper.MIRBuilder.getMF();
2211   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2212 
2213   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2214                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2215 
  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
2218   if (!MFI->getMode().IEEE)
2219     return !IsIEEEOp;
2220 
2221   if (IsIEEEOp)
2222     return true;
2223 
2224   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2225 }
2226 
2227 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2228   MachineInstr &MI, MachineRegisterInfo &MRI,
2229   MachineIRBuilder &B) const {
2230   // TODO: Should move some of this into LegalizerHelper.
2231 
2232   // TODO: Promote dynamic indexing of s16 to s32
2233 
2234   // FIXME: Artifact combiner probably should have replaced the truncated
2235   // constant before this, so we shouldn't need
2236   // getIConstantVRegValWithLookThrough.
2237   Optional<ValueAndVReg> MaybeIdxVal =
2238       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2239   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2240     return true;
2241   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2242 
2243   Register Dst = MI.getOperand(0).getReg();
2244   Register Vec = MI.getOperand(1).getReg();
2245 
2246   LLT VecTy = MRI.getType(Vec);
2247   LLT EltTy = VecTy.getElementType();
2248   assert(EltTy == MRI.getType(Dst));
2249 
2250   if (IdxVal < VecTy.getNumElements()) {
2251     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2252     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2253   } else {
2254     B.buildUndef(Dst);
2255   }
2256 
2257   MI.eraseFromParent();
2258   return true;
2259 }
2260 
2261 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2262   MachineInstr &MI, MachineRegisterInfo &MRI,
2263   MachineIRBuilder &B) const {
2264   // TODO: Should move some of this into LegalizerHelper.
2265 
2266   // TODO: Promote dynamic indexing of s16 to s32
2267 
2268   // FIXME: Artifact combiner probably should have replaced the truncated
2269   // constant before this, so we shouldn't need
2270   // getIConstantVRegValWithLookThrough.
2271   Optional<ValueAndVReg> MaybeIdxVal =
2272       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2273   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2274     return true;
2275 
2276   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2277   Register Dst = MI.getOperand(0).getReg();
2278   Register Vec = MI.getOperand(1).getReg();
2279   Register Ins = MI.getOperand(2).getReg();
2280 
2281   LLT VecTy = MRI.getType(Vec);
2282   LLT EltTy = VecTy.getElementType();
2283   assert(EltTy == MRI.getType(Ins));
2284   (void)Ins;
2285 
2286   unsigned NumElts = VecTy.getNumElements();
2287   if (IdxVal < NumElts) {
2288     SmallVector<Register, 8> SrcRegs;
2289     for (unsigned i = 0; i < NumElts; ++i)
2290       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2291     B.buildUnmerge(SrcRegs, Vec);
2292 
2293     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2294     B.buildMerge(Dst, SrcRegs);
2295   } else {
2296     B.buildUndef(Dst);
2297   }
2298 
2299   MI.eraseFromParent();
2300   return true;
2301 }
2302 
2303 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2304   MachineInstr &MI, MachineRegisterInfo &MRI,
2305   MachineIRBuilder &B) const {
2306   const LLT V2S16 = LLT::fixed_vector(2, 16);
2307 
2308   Register Dst = MI.getOperand(0).getReg();
2309   Register Src0 = MI.getOperand(1).getReg();
2310   LLT DstTy = MRI.getType(Dst);
2311   LLT SrcTy = MRI.getType(Src0);
2312 
2313   if (SrcTy == V2S16 && DstTy == V2S16 &&
2314       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2315     return true;
2316 
2317   MachineIRBuilder HelperBuilder(MI);
2318   GISelObserverWrapper DummyObserver;
2319   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2320   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2321 }
2322 
2323 bool AMDGPULegalizerInfo::legalizeSinCos(
2324   MachineInstr &MI, MachineRegisterInfo &MRI,
2325   MachineIRBuilder &B) const {
2326 
2327   Register DstReg = MI.getOperand(0).getReg();
2328   Register SrcReg = MI.getOperand(1).getReg();
2329   LLT Ty = MRI.getType(DstReg);
2330   unsigned Flags = MI.getFlags();
2331 
2332   Register TrigVal;
2333   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2334   if (ST.hasTrigReducedRange()) {
2335     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2336     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2337       .addUse(MulVal.getReg(0))
2338       .setMIFlags(Flags).getReg(0);
2339   } else
2340     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2341 
2342   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2343     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2344   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2345     .addUse(TrigVal)
2346     .setMIFlags(Flags);
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2352                                                   MachineIRBuilder &B,
2353                                                   const GlobalValue *GV,
2354                                                   int64_t Offset,
2355                                                   unsigned GAFlags) const {
2356   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2357   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2358   // to the following code sequence:
2359   //
2360   // For constant address space:
2361   //   s_getpc_b64 s[0:1]
2362   //   s_add_u32 s0, s0, $symbol
2363   //   s_addc_u32 s1, s1, 0
2364   //
2365   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2366   //   a fixup or relocation is emitted to replace $symbol with a literal
2367   //   constant, which is a pc-relative offset from the encoding of the $symbol
2368   //   operand to the global variable.
2369   //
2370   // For global address space:
2371   //   s_getpc_b64 s[0:1]
2372   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2373   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2374   //
2375   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2376   //   fixups or relocations are emitted to replace $symbol@*@lo and
2377   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2378   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2379   //   operand to the global variable.
2380   //
2381   // What we want here is an offset from the value returned by s_getpc
2382   // (which is the address of the s_add_u32 instruction) to the global
2383   // variable, but since the encoding of $symbol starts 4 bytes after the start
2384   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2385   // small. This requires us to add 4 to the global variable offset in order to
2386   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2387   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2388   // instruction.
2389 
2390   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2391 
2392   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2393     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2394 
2395   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2396     .addDef(PCReg);
2397 
2398   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2399   if (GAFlags == SIInstrInfo::MO_NONE)
2400     MIB.addImm(0);
2401   else
2402     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2403 
2404   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2405 
2406   if (PtrTy.getSizeInBits() == 32)
2407     B.buildExtract(DstReg, PCReg, 0);
2408   return true;
2409  }
2410 
2411 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2412   MachineInstr &MI, MachineRegisterInfo &MRI,
2413   MachineIRBuilder &B) const {
2414   Register DstReg = MI.getOperand(0).getReg();
2415   LLT Ty = MRI.getType(DstReg);
2416   unsigned AS = Ty.getAddressSpace();
2417 
2418   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2419   MachineFunction &MF = B.getMF();
2420   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2421 
2422   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2423     if (!MFI->isModuleEntryFunction() &&
2424         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2425       const Function &Fn = MF.getFunction();
2426       DiagnosticInfoUnsupported BadLDSDecl(
2427         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2428         DS_Warning);
2429       Fn.getContext().diagnose(BadLDSDecl);
2430 
2431       // We currently don't have a way to correctly allocate LDS objects that
2432       // aren't directly associated with a kernel. We do force inlining of
2433       // functions that use local objects. However, if these dead functions are
2434       // not eliminated, we don't want a compile time error. Just emit a warning
2435       // and a trap, since there should be no callable path here.
2436       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2437       B.buildUndef(DstReg);
2438       MI.eraseFromParent();
2439       return true;
2440     }
2441 
    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be rejected with an error during assembly emission
    // anyway.
2445     const SITargetLowering *TLI = ST.getTargetLowering();
2446     if (!TLI->shouldUseLDSConstAddress(GV)) {
2447       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2449     }
2450 
2451     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2452       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. These variables are allocated
      // by the runtime and placed directly after the statically allocated
      // ones. They all share the same offset.
2458       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2459         // Adjust alignment for that dynamic shared memory array.
2460         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2461         LLT S32 = LLT::scalar(32);
2462         auto Sz =
2463             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2464         B.buildIntToPtr(DstReg, Sz);
2465         MI.eraseFromParent();
2466         return true;
2467       }
2468     }
2469 
2470     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2471                                                    *cast<GlobalVariable>(GV)));
2472     MI.eraseFromParent();
2473     return true;
2474   }
2475 
2476   const SITargetLowering *TLI = ST.getTargetLowering();
2477 
2478   if (TLI->shouldEmitFixup(GV)) {
2479     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2480     MI.eraseFromParent();
2481     return true;
2482   }
2483 
2484   if (TLI->shouldEmitPCReloc(GV)) {
2485     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2486     MI.eraseFromParent();
2487     return true;
2488   }
2489 
2490   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2491   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2492 
2493   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2494   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2495       MachinePointerInfo::getGOT(MF),
2496       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2497           MachineMemOperand::MOInvariant,
2498       LoadTy, Align(8));
2499 
2500   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2501 
2502   if (Ty.getSizeInBits() == 32) {
2503     // Truncate if this is a 32-bit constant address.
2504     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2505     B.buildExtract(DstReg, Load, 0);
2506   } else
2507     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2508 
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
2513 static LLT widenToNextPowerOf2(LLT Ty) {
2514   if (Ty.isVector())
2515     return Ty.changeElementCount(
2516         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2517   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2518 }
2519 
2520 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2521                                        MachineInstr &MI) const {
2522   MachineIRBuilder &B = Helper.MIRBuilder;
2523   MachineRegisterInfo &MRI = *B.getMRI();
2524   GISelChangeObserver &Observer = Helper.Observer;
2525 
2526   Register PtrReg = MI.getOperand(1).getReg();
2527   LLT PtrTy = MRI.getType(PtrReg);
2528   unsigned AddrSpace = PtrTy.getAddressSpace();
2529 
2530   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2531     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2532     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2533     Observer.changingInstr(MI);
2534     MI.getOperand(1).setReg(Cast.getReg(0));
2535     Observer.changedInstr(MI);
2536     return true;
2537   }
2538 
2539   if (MI.getOpcode() != AMDGPU::G_LOAD)
2540     return false;
2541 
2542   Register ValReg = MI.getOperand(0).getReg();
2543   LLT ValTy = MRI.getType(ValReg);
2544 
2545   MachineMemOperand *MMO = *MI.memoperands_begin();
2546   const unsigned ValSize = ValTy.getSizeInBits();
2547   const LLT MemTy = MMO->getMemoryType();
2548   const Align MemAlign = MMO->getAlign();
2549   const unsigned MemSize = MemTy.getSizeInBits();
2550   const unsigned AlignInBits = 8 * MemAlign.value();
2551 
2552   // Widen non-power-of-2 loads to the alignment if needed
2553   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2554     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2555 
2556     // This was already the correct extending load result type, so just adjust
2557     // the memory type.
2558     if (WideMemSize == ValSize) {
2559       MachineFunction &MF = B.getMF();
2560 
2561       MachineMemOperand *WideMMO =
2562           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2563       Observer.changingInstr(MI);
2564       MI.setMemRefs(MF, {WideMMO});
2565       Observer.changedInstr(MI);
2566       return true;
2567     }
2568 
    // Don't bother handling an edge case that should probably never be
    // produced.
2570     if (ValSize > WideMemSize)
2571       return false;
2572 
2573     LLT WideTy = widenToNextPowerOf2(ValTy);
2574 
2575     Register WideLoad;
2576     if (!WideTy.isVector()) {
2577       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad);
2579     } else {
2580       // Extract the subvector.
2581 
2582       if (isRegisterType(ValTy)) {
2583         // If this a case where G_EXTRACT is legal, use it.
2584         // (e.g. <3 x s32> -> <4 x s32>)
2585         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2586         B.buildExtract(ValReg, WideLoad, 0);
2587       } else {
        // For cases where the widened type isn't a nice register value, drop
        // the trailing elements from a widened load (e.g. <3 x s16> ->
        // <4 x s16>).
2590         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2591         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2592       }
2593     }
2594 
2595     MI.eraseFromParent();
2596     return true;
2597   }
2598 
2599   return false;
2600 }
2601 
2602 bool AMDGPULegalizerInfo::legalizeFMad(
2603   MachineInstr &MI, MachineRegisterInfo &MRI,
2604   MachineIRBuilder &B) const {
2605   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2606   assert(Ty.isScalar());
2607 
2608   MachineFunction &MF = B.getMF();
2609   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2610 
2611   // TODO: Always legal with future ftz flag.
2612   // FIXME: Do we need just output?
2613   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2614     return true;
2615   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2616     return true;
2617 
2618   MachineIRBuilder HelperBuilder(MI);
2619   GISelObserverWrapper DummyObserver;
2620   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2621   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2622 }
2623 
2624 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2625   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2626   Register DstReg = MI.getOperand(0).getReg();
2627   Register PtrReg = MI.getOperand(1).getReg();
2628   Register CmpVal = MI.getOperand(2).getReg();
2629   Register NewVal = MI.getOperand(3).getReg();
2630 
2631   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2632          "this should not have been custom lowered");
2633 
2634   LLT ValTy = MRI.getType(CmpVal);
2635   LLT VecTy = LLT::fixed_vector(2, ValTy);
2636 
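  // Pack the new value and the compare value into a single vector operand,
  // new value first, as the target cmpxchg pseudo consumes them together.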
2637   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2638 
2639   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2640     .addDef(DstReg)
2641     .addUse(PtrReg)
2642     .addUse(PackedVal)
2643     .setMemRefs(MI.memoperands());
2644 
2645   MI.eraseFromParent();
2646   return true;
2647 }
2648 
2649 bool AMDGPULegalizerInfo::legalizeFlog(
2650   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2651   Register Dst = MI.getOperand(0).getReg();
2652   Register Src = MI.getOperand(1).getReg();
2653   LLT Ty = B.getMRI()->getType(Dst);
2654   unsigned Flags = MI.getFlags();
2655 
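  // log_b(x) = log2(x) * log_b(2); the callers pass log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).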
2656   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2657   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2658 
2659   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2660   MI.eraseFromParent();
2661   return true;
2662 }
2663 
2664 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2665                                        MachineIRBuilder &B) const {
2666   Register Dst = MI.getOperand(0).getReg();
2667   Register Src = MI.getOperand(1).getReg();
2668   unsigned Flags = MI.getFlags();
2669   LLT Ty = B.getMRI()->getType(Dst);
2670 
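  // exp(x) = exp2(x * log2(e)).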
2671   auto K = B.buildFConstant(Ty, numbers::log2e);
2672   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2673   B.buildFExp2(Dst, Mul, Flags);
2674   MI.eraseFromParent();
2675   return true;
2676 }
2677 
2678 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2679                                        MachineIRBuilder &B) const {
2680   Register Dst = MI.getOperand(0).getReg();
2681   Register Src0 = MI.getOperand(1).getReg();
2682   Register Src1 = MI.getOperand(2).getReg();
2683   unsigned Flags = MI.getFlags();
2684   LLT Ty = B.getMRI()->getType(Dst);
2685   const LLT S16 = LLT::scalar(16);
2686   const LLT S32 = LLT::scalar(32);
2687 
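  // Expand pow(x, y) as exp2(y * log2(x)).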
2688   if (Ty == S32) {
2689     auto Log = B.buildFLog2(S32, Src0, Flags);
2690     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2691       .addUse(Log.getReg(0))
2692       .addUse(Src1)
2693       .setMIFlags(Flags);
2694     B.buildFExp2(Dst, Mul, Flags);
2695   } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert to f32 and back.
2697     auto Log = B.buildFLog2(S16, Src0, Flags);
2698     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2699     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2700     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2701       .addUse(Ext0.getReg(0))
2702       .addUse(Ext1.getReg(0))
2703       .setMIFlags(Flags);
2704 
2705     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2706   } else
2707     return false;
2708 
2709   MI.eraseFromParent();
2710   return true;
2711 }
2712 
2713 // Find a source register, ignoring any possible source modifiers.
2714 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2715   Register ModSrc = OrigSrc;
2716   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2717     ModSrc = SrcFNeg->getOperand(1).getReg();
2718     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2719       ModSrc = SrcFAbs->getOperand(1).getReg();
2720   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2721     ModSrc = SrcFAbs->getOperand(1).getReg();
2722   return ModSrc;
2723 }
2724 
2725 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2726                                          MachineRegisterInfo &MRI,
2727                                          MachineIRBuilder &B) const {
2728 
2729   const LLT S1 = LLT::scalar(1);
2730   const LLT S64 = LLT::scalar(64);
2731   Register Dst = MI.getOperand(0).getReg();
2732   Register OrigSrc = MI.getOperand(1).getReg();
2733   unsigned Flags = MI.getFlags();
2734   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2735          "this should not have been custom lowered");
2736 
2737   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2738   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2739   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2740   // V_FRACT bug is:
2741   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2742   //
2743   // Convert floor(x) to (x - fract(x))
2744 
2745   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2746     .addUse(OrigSrc)
2747     .setMIFlags(Flags);
2748 
2749   // Give source modifier matching some assistance before obscuring a foldable
2750   // pattern.
2751 
2752   // TODO: We can avoid the neg on the fract? The input sign to fract
2753   // shouldn't matter?
2754   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2755 
2756   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2757 
2758   Register Min = MRI.createGenericVirtualRegister(S64);
2759 
2760   // We don't need to concern ourselves with the snan handling difference, so
2761   // use the one which will directly select.
2762   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2763   if (MFI->getMode().IEEE)
2764     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2765   else
2766     B.buildFMinNum(Min, Fract, Const, Flags);
2767 
2768   Register CorrectedFract = Min;
2769   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2770     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2771     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2772   }
2773 
2774   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2775   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2776 
2777   MI.eraseFromParent();
2778   return true;
2779 }
2780 
2781 // Turn an illegal packed v2s16 build vector into bit operations.
2782 // TODO: This should probably be a bitcast action in LegalizerHelper.
2783 bool AMDGPULegalizerInfo::legalizeBuildVector(
2784   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   const LLT S32 = LLT::scalar(32);
2787   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2788 
2789   Register Src0 = MI.getOperand(1).getReg();
2790   Register Src1 = MI.getOperand(2).getReg();
2791   assert(MRI.getType(Src0) == LLT::scalar(16));
2792 
2793   auto Merge = B.buildMerge(S32, {Src0, Src1});
2794   B.buildBitcast(Dst, Merge);
2795 
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799 
2800 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2801 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2802 // case with a single min instruction instead of a compare+select.
2803 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2804                                             MachineRegisterInfo &MRI,
2805                                             MachineIRBuilder &B) const {
2806   Register Dst = MI.getOperand(0).getReg();
2807   Register Src = MI.getOperand(1).getReg();
2808   LLT DstTy = MRI.getType(Dst);
2809   LLT SrcTy = MRI.getType(Src);
2810 
2811   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2812                         ? AMDGPU::G_AMDGPU_FFBH_U32
2813                         : AMDGPU::G_AMDGPU_FFBL_B32;
2814   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
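  // The ffbh/ffbl pseudos return -1 for a zero input, so clamping with the
  // source bit width produces the result ctlz/cttz expects for zero.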
2815   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2816 
2817   MI.eraseFromParent();
2818   return true;
2819 }
2820 
2821 // Check that this is a G_XOR x, -1
2822 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2823   if (MI.getOpcode() != TargetOpcode::G_XOR)
2824     return false;
2825   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2826   return ConstVal && *ConstVal == -1;
2827 }
2828 
2829 // Return the use branch instruction, otherwise null if the usage is invalid.
2830 static MachineInstr *
2831 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2832                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2833   Register CondDef = MI.getOperand(0).getReg();
2834   if (!MRI.hasOneNonDBGUse(CondDef))
2835     return nullptr;
2836 
2837   MachineBasicBlock *Parent = MI.getParent();
2838   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2839 
2840   if (isNot(MRI, *UseMI)) {
2841     Register NegatedCond = UseMI->getOperand(0).getReg();
2842     if (!MRI.hasOneNonDBGUse(NegatedCond))
2843       return nullptr;
2844 
    // The def of this condition (the intrinsic) is about to be deleted, so the
    // negating G_XOR must be removed as well.
2846     eraseInstr(*UseMI, MRI);
2847 
2848     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2849     Negated = true;
2850   }
2851 
2852   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2853     return nullptr;
2854 
2855   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2856   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2857   if (Next == Parent->end()) {
2858     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2859     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2860       return nullptr;
2861     UncondBrTarget = &*NextMBB;
2862   } else {
2863     if (Next->getOpcode() != AMDGPU::G_BR)
2864       return nullptr;
2865     Br = &*Next;
2866     UncondBrTarget = Br->getOperand(0).getMBB();
2867   }
2868 
2869   return UseMI;
2870 }
2871 
2872 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2873                                          const ArgDescriptor *Arg,
2874                                          const TargetRegisterClass *ArgRC,
2875                                          LLT ArgTy) const {
2876   MCRegister SrcReg = Arg->getRegister();
2877   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2878   assert(DstReg.isVirtual() && "Virtual register expected");
2879 
2880   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2881                                              *ArgRC, B.getDebugLoc(), ArgTy);
2882   if (Arg->isMasked()) {
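    // Some inputs are packed into a bitfield of the incoming register; extract
    // the field with an optional shift followed by a mask.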
2883     // TODO: Should we try to emit this once in the entry block?
2884     const LLT S32 = LLT::scalar(32);
2885     const unsigned Mask = Arg->getMask();
2886     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2887 
2888     Register AndMaskSrc = LiveIn;
2889 
2890     if (Shift != 0) {
2891       auto ShiftAmt = B.buildConstant(S32, Shift);
2892       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2893     }
2894 
2895     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2896   } else {
2897     B.buildCopy(DstReg, LiveIn);
2898   }
2899 
2900   return true;
2901 }
2902 
2903 bool AMDGPULegalizerInfo::loadInputValue(
2904     Register DstReg, MachineIRBuilder &B,
2905     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2906   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2907   const ArgDescriptor *Arg;
2908   const TargetRegisterClass *ArgRC;
2909   LLT ArgTy;
2910   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2911 
2912   if (!Arg) {
2913     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
2916       B.buildConstant(DstReg, 0);
2917       return true;
2918     }
2919 
2920     // It's undefined behavior if a function marked with the amdgpu-no-*
2921     // attributes uses the corresponding intrinsic.
2922     B.buildUndef(DstReg);
2923     return true;
2924   }
2925 
2926   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2927     return false; // TODO: Handle these
2928   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2929 }
2930 
2931 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2932     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2933     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2934   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2935     return false;
2936 
2937   MI.eraseFromParent();
2938   return true;
2939 }
2940 
2941 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2942                                        MachineRegisterInfo &MRI,
2943                                        MachineIRBuilder &B) const {
2944   Register Dst = MI.getOperand(0).getReg();
2945   LLT DstTy = MRI.getType(Dst);
2946   LLT S16 = LLT::scalar(16);
2947   LLT S32 = LLT::scalar(32);
2948   LLT S64 = LLT::scalar(64);
2949 
2950   if (DstTy == S16)
2951     return legalizeFDIV16(MI, MRI, B);
2952   if (DstTy == S32)
2953     return legalizeFDIV32(MI, MRI, B);
2954   if (DstTy == S64)
2955     return legalizeFDIV64(MI, MRI, B);
2956 
2957   return false;
2958 }
2959 
2960 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2961                                                         Register DstDivReg,
2962                                                         Register DstRemReg,
2963                                                         Register X,
2964                                                         Register Y) const {
2965   const LLT S1 = LLT::scalar(1);
2966   const LLT S32 = LLT::scalar(32);
2967 
2968   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2969   // algorithm used here.
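  // Briefly: compute Z ~= 2**32 / Y as a fixed-point reciprocal (refined with
  // one Newton-Raphson step), form the estimates Q = umulh(X, Z) and
  // R = X - Q * Y, then correct both with up to two conditional adjustments.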
2970 
2971   // Initial estimate of inv(y).
2972   auto FloatY = B.buildUITOFP(S32, Y);
2973   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2974   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2975   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2976   auto Z = B.buildFPTOUI(S32, ScaledY);
2977 
  // One round of UNR (unsigned integer Newton-Raphson) to refine Z.
2979   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2980   auto NegYZ = B.buildMul(S32, NegY, Z);
2981   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2982 
2983   // Quotient/remainder estimate.
2984   auto Q = B.buildUMulH(S32, X, Z);
2985   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2986 
2987   // First quotient/remainder refinement.
2988   auto One = B.buildConstant(S32, 1);
2989   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2990   if (DstDivReg)
2991     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2992   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2993 
2994   // Second quotient/remainder refinement.
2995   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2996   if (DstDivReg)
2997     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2998 
2999   if (DstRemReg)
3000     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3001 }
3002 
3003 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3004 //
3005 // Return lo, hi of result
3006 //
3007 // %cvt.lo = G_UITOFP Val.lo
3008 // %cvt.hi = G_UITOFP Val.hi
3009 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3010 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3011 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3012 // %mul2 = G_FMUL %mul1, 2**(-32)
3013 // %trunc = G_INTRINSIC_TRUNC %mul2
3014 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3015 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3016 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3017                                                        Register Val) {
3018   const LLT S32 = LLT::scalar(32);
3019   auto Unmerge = B.buildUnmerge(S32, Val);
3020 
3021   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3022   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3023 
3024   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3025                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3026 
3027   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3028   auto Mul1 =
3029       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3030 
3031   // 2**(-32)
3032   auto Mul2 =
3033       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3034   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3035 
3036   // -(2**32)
3037   auto Mad2 = B.buildFMAD(S32, Trunc,
3038                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3039 
3040   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3041   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3042 
3043   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3044 }
3045 
3046 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3047                                                         Register DstDivReg,
3048                                                         Register DstRemReg,
3049                                                         Register Numer,
3050                                                         Register Denom) const {
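  // 64-bit version of the scheme above: approximate the reciprocal of Denom in
  // 64-bit fixed point (emitReciprocalU64 followed by two refinement rounds),
  // take the quotient estimate with a 64-bit umulh, and then apply up to two
  // conditional corrections to the quotient and remainder.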
3051   const LLT S32 = LLT::scalar(32);
3052   const LLT S64 = LLT::scalar(64);
3053   const LLT S1 = LLT::scalar(1);
3054   Register RcpLo, RcpHi;
3055 
3056   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3057 
3058   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3059 
3060   auto Zero64 = B.buildConstant(S64, 0);
3061   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3062 
3063   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3064   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3065 
3066   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3067   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3068   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3069 
3070   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3071   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3072   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3073 
3074   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3075   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3076   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3077   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3078   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3079 
3080   auto Zero32 = B.buildConstant(S32, 0);
3081   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3082   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3083   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3084 
3085   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3086   Register NumerLo = UnmergeNumer.getReg(0);
3087   Register NumerHi = UnmergeNumer.getReg(1);
3088 
3089   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3090   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3091   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3092   Register Mul3_Lo = UnmergeMul3.getReg(0);
3093   Register Mul3_Hi = UnmergeMul3.getReg(1);
3094   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3095   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3096   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3097   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3098 
3099   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3100   Register DenomLo = UnmergeDenom.getReg(0);
3101   Register DenomHi = UnmergeDenom.getReg(1);
3102 
3103   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3104   auto C1 = B.buildSExt(S32, CmpHi);
3105 
3106   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3107   auto C2 = B.buildSExt(S32, CmpLo);
3108 
3109   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3110   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3111 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently the control flow is unconditional and we have 4 selects after
  // the potential endif to substitute for PHIs.
3115 
3116   // if C3 != 0 ...
3117   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3118   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3119   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3120   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3121 
3122   auto One64 = B.buildConstant(S64, 1);
3123   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3124 
3125   auto C4 =
3126       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3127   auto C5 =
3128       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3129   auto C6 = B.buildSelect(
3130       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3131 
3132   // if (C6 != 0)
3133   auto Add4 = B.buildAdd(S64, Add3, One64);
3134   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3135 
3136   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3137   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3138   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3139 
3140   // endif C6
3141   // endif C3
3142 
3143   if (DstDivReg) {
3144     auto Sel1 = B.buildSelect(
3145         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3146     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3147                   Sel1, MulHi3);
3148   }
3149 
3150   if (DstRemReg) {
3151     auto Sel2 = B.buildSelect(
3152         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3153     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3154                   Sel2, Sub1);
3155   }
3156 }
3157 
3158 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3159                                                   MachineRegisterInfo &MRI,
3160                                                   MachineIRBuilder &B) const {
3161   Register DstDivReg, DstRemReg;
3162   switch (MI.getOpcode()) {
3163   default:
3164     llvm_unreachable("Unexpected opcode!");
3165   case AMDGPU::G_UDIV: {
3166     DstDivReg = MI.getOperand(0).getReg();
3167     break;
3168   }
3169   case AMDGPU::G_UREM: {
3170     DstRemReg = MI.getOperand(0).getReg();
3171     break;
3172   }
3173   case AMDGPU::G_UDIVREM: {
3174     DstDivReg = MI.getOperand(0).getReg();
3175     DstRemReg = MI.getOperand(1).getReg();
3176     break;
3177   }
3178   }
3179 
3180   const LLT S64 = LLT::scalar(64);
3181   const LLT S32 = LLT::scalar(32);
3182   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3183   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3184   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3185   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3186 
3187   if (Ty == S32)
3188     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3189   else if (Ty == S64)
3190     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3191   else
3192     return false;
3193 
3194   MI.eraseFromParent();
3195   return true;
3196 }
3197 
3198 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3199                                                 MachineRegisterInfo &MRI,
3200                                                 MachineIRBuilder &B) const {
3201   const LLT S64 = LLT::scalar(64);
3202   const LLT S32 = LLT::scalar(32);
3203 
3204   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3205   if (Ty != S32 && Ty != S64)
3206     return false;
3207 
3208   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3209   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3210   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3211 
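  // Take absolute values using the shift/add/xor idiom: Sign = X >> (N - 1) is
  // all zeros or all ones, and abs(X) = (X + Sign) ^ Sign. The unsigned
  // expansion runs on the absolute values and the signs are reapplied below.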
3212   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3213   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3214   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3215 
3216   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3217   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3218 
3219   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3220   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3221 
3222   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3223   switch (MI.getOpcode()) {
3224   default:
3225     llvm_unreachable("Unexpected opcode!");
3226   case AMDGPU::G_SDIV: {
3227     DstDivReg = MI.getOperand(0).getReg();
3228     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3229     break;
3230   }
3231   case AMDGPU::G_SREM: {
3232     DstRemReg = MI.getOperand(0).getReg();
3233     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3234     break;
3235   }
3236   case AMDGPU::G_SDIVREM: {
3237     DstDivReg = MI.getOperand(0).getReg();
3238     DstRemReg = MI.getOperand(1).getReg();
3239     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3240     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3241     break;
3242   }
3243   }
3244 
3245   if (Ty == S32)
3246     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3247   else
3248     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3249 
3250   if (DstDivReg) {
3251     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3252     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3253     B.buildSub(DstDivReg, SignXor, Sign);
3254   }
3255 
3256   if (DstRemReg) {
3257     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3258     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3259     B.buildSub(DstRemReg, SignXor, Sign);
3260   }
3261 
3262   MI.eraseFromParent();
3263   return true;
3264 }
3265 
3266 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3267                                                  MachineRegisterInfo &MRI,
3268                                                  MachineIRBuilder &B) const {
3269   Register Res = MI.getOperand(0).getReg();
3270   Register LHS = MI.getOperand(1).getReg();
3271   Register RHS = MI.getOperand(2).getReg();
3272   uint16_t Flags = MI.getFlags();
3273   LLT ResTy = MRI.getType(Res);
3274 
3275   const MachineFunction &MF = B.getMF();
3276   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3277                             MI.getFlag(MachineInstr::FmAfn);
3278 
3279   if (!AllowInaccurateRcp)
3280     return false;
3281 
3282   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3283     // 1 / x -> RCP(x)
3284     if (CLHS->isExactlyValue(1.0)) {
3285       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3286         .addUse(RHS)
3287         .setMIFlags(Flags);
3288 
3289       MI.eraseFromParent();
3290       return true;
3291     }
3292 
3293     // -1 / x -> RCP( FNEG(x) )
3294     if (CLHS->isExactlyValue(-1.0)) {
3295       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3296       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3297         .addUse(FNeg.getReg(0))
3298         .setMIFlags(Flags);
3299 
3300       MI.eraseFromParent();
3301       return true;
3302     }
3303   }
3304 
3305   // x / y -> x * (1.0 / y)
3306   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3307     .addUse(RHS)
3308     .setMIFlags(Flags);
3309   B.buildFMul(Res, LHS, RCP, Flags);
3310 
3311   MI.eraseFromParent();
3312   return true;
3313 }
3314 
3315 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3316                                                    MachineRegisterInfo &MRI,
3317                                                    MachineIRBuilder &B) const {
3318   Register Res = MI.getOperand(0).getReg();
3319   Register X = MI.getOperand(1).getReg();
3320   Register Y = MI.getOperand(2).getReg();
3321   uint16_t Flags = MI.getFlags();
3322   LLT ResTy = MRI.getType(Res);
3323 
3324   const MachineFunction &MF = B.getMF();
3325   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3326                             MI.getFlag(MachineInstr::FmAfn);
3327 
3328   if (!AllowInaccurateRcp)
3329     return false;
3330 
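  // Refine the hardware rcp(Y) estimate with two Newton-Raphson steps, each
  // expressed as r = fma(fma(-y, r, 1), r, r), then compute x * r and apply
  // one final fma-based correction to the product.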
3331   auto NegY = B.buildFNeg(ResTy, Y);
3332   auto One = B.buildFConstant(ResTy, 1.0);
3333 
3334   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3335     .addUse(Y)
3336     .setMIFlags(Flags);
3337 
3338   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3339   R = B.buildFMA(ResTy, Tmp0, R, R);
3340 
3341   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3342   R = B.buildFMA(ResTy, Tmp1, R, R);
3343 
3344   auto Ret = B.buildFMul(ResTy, X, R);
3345   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3346 
3347   B.buildFMA(Res, Tmp2, R, Ret);
3348   MI.eraseFromParent();
3349   return true;
3350 }
3351 
3352 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3353                                          MachineRegisterInfo &MRI,
3354                                          MachineIRBuilder &B) const {
3355   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3356     return true;
3357 
3358   Register Res = MI.getOperand(0).getReg();
3359   Register LHS = MI.getOperand(1).getReg();
3360   Register RHS = MI.getOperand(2).getReg();
3361 
3362   uint16_t Flags = MI.getFlags();
3363 
3364   LLT S16 = LLT::scalar(16);
3365   LLT S32 = LLT::scalar(32);
3366 
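  // Do the division in f32: extend the operands, multiply LHS by rcp(RHS),
  // truncate back to f16, and let div_fixup handle the special cases.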
3367   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3368   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3369 
3370   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3371     .addUse(RHSExt.getReg(0))
3372     .setMIFlags(Flags);
3373 
3374   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3375   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3376 
3377   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3378     .addUse(RDst.getReg(0))
3379     .addUse(RHS)
3380     .addUse(LHS)
3381     .setMIFlags(Flags);
3382 
3383   MI.eraseFromParent();
3384   return true;
3385 }
3386 
3387 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3388 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3389 static void toggleSPDenormMode(bool Enable,
3390                                MachineIRBuilder &B,
3391                                const GCNSubtarget &ST,
3392                                AMDGPU::SIModeRegisterDefaults Mode) {
3393   // Set SP denorm mode to this value.
3394   unsigned SPDenormMode =
3395     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3396 
3397   if (ST.hasDenormModeInst()) {
3398     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3399     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3400 
3401     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3402     B.buildInstr(AMDGPU::S_DENORM_MODE)
3403       .addImm(NewDenormModeValue);
3404 
3405   } else {
3406     // Select FP32 bit field in mode register.
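    // This encodes hwreg(HW_REG_MODE, 4, 2), i.e. the two FP32 denorm control
    // bits at offset 4 of the MODE register.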
3407     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3408                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3409                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3410 
3411     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3412       .addImm(SPDenormMode)
3413       .addImm(SPDenormModeBitField);
3414   }
3415 }
3416 
3417 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3418                                          MachineRegisterInfo &MRI,
3419                                          MachineIRBuilder &B) const {
3420   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3421     return true;
3422 
3423   Register Res = MI.getOperand(0).getReg();
3424   Register LHS = MI.getOperand(1).getReg();
3425   Register RHS = MI.getOperand(2).getReg();
3426   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3427   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3428 
3429   uint16_t Flags = MI.getFlags();
3430 
3431   LLT S32 = LLT::scalar(32);
3432   LLT S1 = LLT::scalar(1);
3433 
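  // Standard f32 division expansion: div_scale both operands, refine a rcp of
  // the scaled denominator with FMAs (forcing FP32 denormals on around the FMA
  // sequence if they are off by default), then let div_fmas and div_fixup
  // produce the scaled and fixed-up result.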
3434   auto One = B.buildFConstant(S32, 1.0f);
3435 
3436   auto DenominatorScaled =
3437     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3438       .addUse(LHS)
3439       .addUse(RHS)
3440       .addImm(0)
3441       .setMIFlags(Flags);
3442   auto NumeratorScaled =
3443     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3444       .addUse(LHS)
3445       .addUse(RHS)
3446       .addImm(1)
3447       .setMIFlags(Flags);
3448 
3449   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3450     .addUse(DenominatorScaled.getReg(0))
3451     .setMIFlags(Flags);
3452   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3453 
3454   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3455   // aren't modeled as reading it.
3456   if (!Mode.allFP32Denormals())
3457     toggleSPDenormMode(true, B, ST, Mode);
3458 
3459   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3460   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3461   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3462   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3463   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3464   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3465 
3466   if (!Mode.allFP32Denormals())
3467     toggleSPDenormMode(false, B, ST, Mode);
3468 
3469   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3470     .addUse(Fma4.getReg(0))
3471     .addUse(Fma1.getReg(0))
3472     .addUse(Fma3.getReg(0))
3473     .addUse(NumeratorScaled.getReg(1))
3474     .setMIFlags(Flags);
3475 
3476   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3477     .addUse(Fmas.getReg(0))
3478     .addUse(RHS)
3479     .addUse(LHS)
3480     .setMIFlags(Flags);
3481 
3482   MI.eraseFromParent();
3483   return true;
3484 }
3485 
3486 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3487                                          MachineRegisterInfo &MRI,
3488                                          MachineIRBuilder &B) const {
3489   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3490     return true;
3491 
3492   Register Res = MI.getOperand(0).getReg();
3493   Register LHS = MI.getOperand(1).getReg();
3494   Register RHS = MI.getOperand(2).getReg();
3495 
3496   uint16_t Flags = MI.getFlags();
3497 
3498   LLT S64 = LLT::scalar(64);
3499   LLT S1 = LLT::scalar(1);
3500 
3501   auto One = B.buildFConstant(S64, 1.0);
3502 
3503   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3504     .addUse(LHS)
3505     .addUse(RHS)
3506     .addImm(0)
3507     .setMIFlags(Flags);
3508 
3509   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3510 
3511   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3512     .addUse(DivScale0.getReg(0))
3513     .setMIFlags(Flags);
3514 
3515   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3516   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3517   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3518 
3519   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3520     .addUse(LHS)
3521     .addUse(RHS)
3522     .addImm(1)
3523     .setMIFlags(Flags);
3524 
3525   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3526   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3527   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3528 
3529   Register Scale;
3530   if (!ST.hasUsableDivScaleConditionOutput()) {
3531     // Workaround a hardware bug on SI where the condition output from div_scale
3532     // is not usable.
3533 
3534     LLT S32 = LLT::scalar(32);
3535 
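    // Reconstruct the div_scale condition: compare the high half of each
    // div_scale result with the high half of the corresponding source operand
    // and xor the two comparisons.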
3536     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3537     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3538     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3539     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3540 
3541     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3542                               Scale1Unmerge.getReg(1));
3543     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3544                               Scale0Unmerge.getReg(1));
3545     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3546   } else {
3547     Scale = DivScale1.getReg(1);
3548   }
3549 
3550   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3551     .addUse(Fma4.getReg(0))
3552     .addUse(Fma3.getReg(0))
3553     .addUse(Mul.getReg(0))
3554     .addUse(Scale)
3555     .setMIFlags(Flags);
3556 
3557   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3558     .addUse(Fmas.getReg(0))
3559     .addUse(RHS)
3560     .addUse(LHS)
3561     .setMIFlags(Flags);
3562 
3563   MI.eraseFromParent();
3564   return true;
3565 }
3566 
3567 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3568                                                  MachineRegisterInfo &MRI,
3569                                                  MachineIRBuilder &B) const {
3570   Register Res = MI.getOperand(0).getReg();
3571   Register LHS = MI.getOperand(2).getReg();
3572   Register RHS = MI.getOperand(3).getReg();
3573   uint16_t Flags = MI.getFlags();
3574 
3575   LLT S32 = LLT::scalar(32);
3576   LLT S1 = LLT::scalar(1);
3577 
3578   auto Abs = B.buildFAbs(S32, RHS, Flags);
3580 
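  // If |RHS| is larger than 2**96 (0x6f800000), pre-scale it by 2**-32
  // (0x2f800000) so its reciprocal stays in the normal range, and scale the
  // final quotient by the same factor to compensate.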
3581   auto C0 = B.buildConstant(S32, 0x6f800000);
3582   auto C1 = B.buildConstant(S32, 0x2f800000);
3583   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3584 
3585   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3586   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3587 
3588   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3589 
3590   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3591     .addUse(Mul0.getReg(0))
3592     .setMIFlags(Flags);
3593 
3594   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3595 
3596   B.buildFMul(Res, Sel, Mul1, Flags);
3597 
3598   MI.eraseFromParent();
3599   return true;
3600 }
3601 
3602 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3603 // FIXME: Why do we handle this one but not other removed instructions?
3604 //
3605 // Reciprocal square root.  The clamp prevents infinite results, clamping
3606 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3607 // +-max_float.
3608 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3609                                                     MachineRegisterInfo &MRI,
3610                                                     MachineIRBuilder &B) const {
3611   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3612     return true;
3613 
3614   Register Dst = MI.getOperand(0).getReg();
3615   Register Src = MI.getOperand(2).getReg();
3616   auto Flags = MI.getFlags();
3617 
3618   LLT Ty = MRI.getType(Dst);
3619 
3620   const fltSemantics *FltSemantics;
3621   if (Ty == LLT::scalar(32))
3622     FltSemantics = &APFloat::IEEEsingle();
3623   else if (Ty == LLT::scalar(64))
3624     FltSemantics = &APFloat::IEEEdouble();
3625   else
3626     return false;
3627 
3628   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3629     .addUse(Src)
3630     .setMIFlags(Flags);
3631 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq has already quieted the value (or not); just use the variant
  // which will directly select.
3634   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3635   const bool UseIEEE = MFI->getMode().IEEE;
3636 
3637   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3638   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3639                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3640 
3641   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3642 
3643   if (UseIEEE)
3644     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3645   else
3646     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3647   MI.eraseFromParent();
3648   return true;
3649 }
3650 
3651 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3652   switch (IID) {
3653   case Intrinsic::amdgcn_ds_fadd:
3654     return AMDGPU::G_ATOMICRMW_FADD;
3655   case Intrinsic::amdgcn_ds_fmin:
3656     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3657   case Intrinsic::amdgcn_ds_fmax:
3658     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3659   default:
3660     llvm_unreachable("not a DS FP intrinsic");
3661   }
3662 }
3663 
3664 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3665                                                       MachineInstr &MI,
3666                                                       Intrinsic::ID IID) const {
3667   GISelChangeObserver &Observer = Helper.Observer;
3668   Observer.changingInstr(MI);
3669 
3670   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3671 
3672   // The remaining operands were used to set fields in the MemOperand on
3673   // construction.
3674   for (int I = 6; I > 3; --I)
3675     MI.RemoveOperand(I);
3676 
3677   MI.RemoveOperand(1); // Remove the intrinsic ID.
3678   Observer.changedInstr(MI);
3679   return true;
3680 }
3681 
3682 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3683                                             MachineRegisterInfo &MRI,
3684                                             MachineIRBuilder &B) const {
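  // The implicit kernel arguments are laid out after the explicit kernel
  // arguments, so the pointer is the kernarg segment pointer plus a
  // target-specific offset.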
3685   uint64_t Offset =
3686     ST.getTargetLowering()->getImplicitParameterOffset(
3687       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3688   LLT DstTy = MRI.getType(DstReg);
3689   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3690 
3691   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3692   if (!loadInputValue(KernargPtrReg, B,
3693                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3694     return false;
3695 
3696   // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
3698   return true;
3699 }
3700 
3701 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3702                                                  MachineRegisterInfo &MRI,
3703                                                  MachineIRBuilder &B) const {
3704   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3705   if (!MFI->isEntryFunction()) {
3706     return legalizePreloadedArgIntrin(MI, MRI, B,
3707                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3708   }
3709 
3710   Register DstReg = MI.getOperand(0).getReg();
3711   if (!getImplicitArgPtr(DstReg, MRI, B))
3712     return false;
3713 
3714   MI.eraseFromParent();
3715   return true;
3716 }
3717 
3718 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3719                                               MachineRegisterInfo &MRI,
3720                                               MachineIRBuilder &B,
3721                                               unsigned AddrSpace) const {
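  // A flat pointer points into the given segment iff the high 32 bits of its
  // address equal that segment's aperture base.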
3722   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3723   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3724   Register Hi32 = Unmerge.getReg(1);
3725 
3726   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3727   MI.eraseFromParent();
3728   return true;
3729 }
3730 
3731 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3732 // offset (the offset that is included in bounds checking and swizzling, to be
3733 // split between the instruction's voffset and immoffset fields) and soffset
3734 // (the offset that is excluded from bounds checking and swizzling, to go in
3735 // the instruction's soffset field).  This function takes the first kind of
3736 // offset and figures out how to split it between voffset and immoffset.
3737 std::pair<Register, unsigned>
3738 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3739                                         Register OrigOffset) const {
3740   const unsigned MaxImm = 4095;
3741   Register BaseReg;
3742   unsigned ImmOffset;
3743   const LLT S32 = LLT::scalar(32);
3744   MachineRegisterInfo &MRI = *B.getMRI();
3745 
3746   std::tie(BaseReg, ImmOffset) =
3747       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3748 
3749   // If BaseReg is a pointer, convert it to int.
3750   if (MRI.getType(BaseReg).isPointer())
3751     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3752 
3753   // If the immediate value is too big for the immoffset field, put the value
3754   // and -4096 into the immoffset field so that the value that is copied/added
3755   // for the voffset field is a multiple of 4096, and it stands more chance
3756   // of being CSEd with the copy/add for another similar load/store.
3757   // However, do not do that rounding down to a multiple of 4096 if that is a
3758   // negative number, as it appears to be illegal to have a negative offset
3759   // in the vgpr, even if adding the immediate offset makes it positive.
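  // For example, a constant offset of 5000 is split into Overflow = 4096
  // (folded into the voffset add) and ImmOffset = 904.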
3760   unsigned Overflow = ImmOffset & ~MaxImm;
3761   ImmOffset -= Overflow;
3762   if ((int32_t)Overflow < 0) {
3763     Overflow += ImmOffset;
3764     ImmOffset = 0;
3765   }
3766 
3767   if (Overflow != 0) {
3768     if (!BaseReg) {
3769       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3770     } else {
3771       auto OverflowVal = B.buildConstant(S32, Overflow);
3772       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3773     }
3774   }
3775 
3776   if (!BaseReg)
3777     BaseReg = B.buildConstant(S32, 0).getReg(0);
3778 
3779   return std::make_pair(BaseReg, ImmOffset);
3780 }
3781 
3782 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3783 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3784                                           Register VOffset, Register SOffset,
3785                                           unsigned ImmOffset, Register VIndex,
3786                                           MachineRegisterInfo &MRI) const {
3787   Optional<ValueAndVReg> MaybeVOffsetVal =
3788       getIConstantVRegValWithLookThrough(VOffset, MRI);
3789   Optional<ValueAndVReg> MaybeSOffsetVal =
3790       getIConstantVRegValWithLookThrough(SOffset, MRI);
3791   Optional<ValueAndVReg> MaybeVIndexVal =
3792       getIConstantVRegValWithLookThrough(VIndex, MRI);
3793   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3794   // update the MMO with that offset. The stride is unknown so we can only do
3795   // this if VIndex is constant 0.
3796   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3797       MaybeVIndexVal->Value == 0) {
3798     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3799                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3800     MMO->setOffset(TotalOffset);
3801   } else {
3802     // We don't have a constant combined offset to use in the MMO. Give up.
3803     MMO->setValue((Value *)nullptr);
3804   }
3805 }
3806 
3807 /// Handle register layout difference for f16 images for some subtargets.
3808 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3809                                              MachineRegisterInfo &MRI,
3810                                              Register Reg,
3811                                              bool ImageStore) const {
3812   const LLT S16 = LLT::scalar(16);
3813   const LLT S32 = LLT::scalar(32);
3814   LLT StoreVT = MRI.getType(Reg);
3815   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3816 
3817   if (ST.hasUnpackedD16VMem()) {
3818     auto Unmerge = B.buildUnmerge(S16, Reg);
3819 
3820     SmallVector<Register, 4> WideRegs;
3821     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3822       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3823 
3824     int NumElts = StoreVT.getNumElements();
3825 
3826     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3827         .getReg(0);
3828   }
3829 
3830   if (ImageStore && ST.hasImageStoreD16Bug()) {
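    // The workaround widens the packed data with undef elements before the
    // store: 2 x s16 -> 2 x s32, 3 x s16 -> 6 x s16 (then 3 x s32), and
    // 4 x s16 -> 4 x s32.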
3831     if (StoreVT.getNumElements() == 2) {
3832       SmallVector<Register, 4> PackedRegs;
3833       Reg = B.buildBitcast(S32, Reg).getReg(0);
3834       PackedRegs.push_back(Reg);
3835       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3836       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3837           .getReg(0);
3838     }
3839 
3840     if (StoreVT.getNumElements() == 3) {
3841       SmallVector<Register, 4> PackedRegs;
3842       auto Unmerge = B.buildUnmerge(S16, Reg);
3843       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3844         PackedRegs.push_back(Unmerge.getReg(I));
3845       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3846       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3847       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3848     }
3849 
3850     if (StoreVT.getNumElements() == 4) {
3851       SmallVector<Register, 4> PackedRegs;
3852       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3853       auto Unmerge = B.buildUnmerge(S32, Reg);
3854       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3855         PackedRegs.push_back(Unmerge.getReg(I));
3856       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3857       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3858           .getReg(0);
3859     }
3860 
3861     llvm_unreachable("invalid data type");
3862   }
3863 
3864   if (StoreVT == LLT::fixed_vector(3, S16)) {
3865     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3866               .getReg(0);
3867   }
3868   return Reg;
3869 }
3870 
3871 Register AMDGPULegalizerInfo::fixStoreSourceType(
3872   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3873   MachineRegisterInfo *MRI = B.getMRI();
3874   LLT Ty = MRI->getType(VData);
3875 
3876   const LLT S16 = LLT::scalar(16);
3877 
  // Fixup illegal register types for i8 and i16 stores.
3879   if (Ty == LLT::scalar(8) || Ty == S16) {
3880     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3881     return AnyExt;
3882   }
3883 
3884   if (Ty.isVector()) {
3885     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3886       if (IsFormat)
3887         return handleD16VData(B, *MRI, VData);
3888     }
3889   }
3890 
3891   return VData;
3892 }
3893 
3894 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3895                                               MachineRegisterInfo &MRI,
3896                                               MachineIRBuilder &B,
3897                                               bool IsTyped,
3898                                               bool IsFormat) const {
3899   Register VData = MI.getOperand(1).getReg();
3900   LLT Ty = MRI.getType(VData);
3901   LLT EltTy = Ty.getScalarType();
3902   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3903   const LLT S32 = LLT::scalar(32);
3904 
3905   VData = fixStoreSourceType(B, VData, IsFormat);
3906   Register RSrc = MI.getOperand(2).getReg();
3907 
3908   MachineMemOperand *MMO = *MI.memoperands_begin();
3909   const int MemSize = MMO->getSize();
3910 
3911   unsigned ImmOffset;
3912 
3913   // The typed intrinsics add an immediate after the registers.
3914   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3915 
3916   // The struct intrinsic variants add one additional operand over raw.
3917   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3918   Register VIndex;
3919   int OpOffset = 0;
3920   if (HasVIndex) {
3921     VIndex = MI.getOperand(3).getReg();
3922     OpOffset = 1;
3923   } else {
3924     VIndex = B.buildConstant(S32, 0).getReg(0);
3925   }
3926 
3927   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3928   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3929 
3930   unsigned Format = 0;
3931   if (IsTyped) {
3932     Format = MI.getOperand(5 + OpOffset).getImm();
3933     ++OpOffset;
3934   }
3935 
3936   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3937 
3938   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3939   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3940 
3941   unsigned Opc;
3942   if (IsTyped) {
3943     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3944                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3945   } else if (IsFormat) {
3946     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3947                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3948   } else {
3949     switch (MemSize) {
3950     case 1:
3951       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3952       break;
3953     case 2:
3954       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3955       break;
3956     default:
3957       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3958       break;
3959     }
3960   }
3961 
3962   auto MIB = B.buildInstr(Opc)
3963     .addUse(VData)              // vdata
3964     .addUse(RSrc)               // rsrc
3965     .addUse(VIndex)             // vindex
3966     .addUse(VOffset)            // voffset
3967     .addUse(SOffset)            // soffset
3968     .addImm(ImmOffset);         // offset(imm)
3969 
3970   if (IsTyped)
3971     MIB.addImm(Format);
3972 
3973   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3974      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3975      .addMemOperand(MMO);
3976 
3977   MI.eraseFromParent();
3978   return true;
3979 }
3980 
3981 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3982                                              MachineRegisterInfo &MRI,
3983                                              MachineIRBuilder &B,
3984                                              bool IsFormat,
3985                                              bool IsTyped) const {
3986   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3987   MachineMemOperand *MMO = *MI.memoperands_begin();
3988   const LLT MemTy = MMO->getMemoryType();
3989   const LLT S32 = LLT::scalar(32);
3990 
3991   Register Dst = MI.getOperand(0).getReg();
3992   Register RSrc = MI.getOperand(2).getReg();
3993 
3994   // The typed intrinsics add an immediate after the registers.
3995   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3996 
3997   // The struct intrinsic variants add one additional operand over raw.
3998   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3999   Register VIndex;
4000   int OpOffset = 0;
4001   if (HasVIndex) {
4002     VIndex = MI.getOperand(3).getReg();
4003     OpOffset = 1;
4004   } else {
4005     VIndex = B.buildConstant(S32, 0).getReg(0);
4006   }
4007 
4008   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4009   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4010 
4011   unsigned Format = 0;
4012   if (IsTyped) {
4013     Format = MI.getOperand(5 + OpOffset).getImm();
4014     ++OpOffset;
4015   }
4016 
4017   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4018   unsigned ImmOffset;
4019 
4020   LLT Ty = MRI.getType(Dst);
4021   LLT EltTy = Ty.getScalarType();
4022   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4023   const bool Unpacked = ST.hasUnpackedD16VMem();
4024 
4025   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4026   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4027 
4028   unsigned Opc;
4029 
4030   if (IsTyped) {
4031     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4032                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4033   } else if (IsFormat) {
4034     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4035                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4036   } else {
4037     switch (MemTy.getSizeInBits()) {
4038     case 8:
4039       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4040       break;
4041     case 16:
4042       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4043       break;
4044     default:
4045       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4046       break;
4047     }
4048   }
4049 
4050   Register LoadDstReg;
4051 
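  // Loads narrower than 32 bits (and scalar d16 loads) are emitted as 32-bit
  // results and truncated afterwards; packed d16 vectors on unpacked-D16
  // subtargets are loaded with 32-bit elements and repacked afterwards.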
4052   bool IsExtLoad =
4053       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4054   LLT UnpackedTy = Ty.changeElementSize(32);
4055 
4056   if (IsExtLoad)
4057     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4058   else if (Unpacked && IsD16 && Ty.isVector())
4059     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4060   else
4061     LoadDstReg = Dst;
4062 
4063   auto MIB = B.buildInstr(Opc)
4064     .addDef(LoadDstReg)         // vdata
4065     .addUse(RSrc)               // rsrc
4066     .addUse(VIndex)             // vindex
4067     .addUse(VOffset)            // voffset
4068     .addUse(SOffset)            // soffset
4069     .addImm(ImmOffset);         // offset(imm)
4070 
4071   if (IsTyped)
4072     MIB.addImm(Format);
4073 
4074   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4075      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4076      .addMemOperand(MMO);
4077 
4078   if (LoadDstReg != Dst) {
4079     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4080 
    // The result was widened for the extending load; truncate it back down.
4082     if (IsExtLoad)
4083       B.buildTrunc(Dst, LoadDstReg);
4084     else {
4085       // Repack to original 16-bit vector result
4086       // FIXME: G_TRUNC should work, but legalization currently fails
4087       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4088       SmallVector<Register, 4> Repack;
4089       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4090         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4091       B.buildMerge(Dst, Repack);
4092     }
4093   }
4094 
4095   MI.eraseFromParent();
4096   return true;
4097 }
4098 
4099 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4100                                                MachineIRBuilder &B,
4101                                                bool IsInc) const {
4102   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4103                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4104   B.buildInstr(Opc)
4105     .addDef(MI.getOperand(0).getReg())
4106     .addUse(MI.getOperand(2).getReg())
4107     .addUse(MI.getOperand(3).getReg())
4108     .cloneMemRefs(MI);
4109   MI.eraseFromParent();
4110   return true;
4111 }
4112 
4113 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4114   switch (IntrID) {
4115   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4116   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4117     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4118   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4119   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4120     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4121   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4122   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4123     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4124   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4125   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4126     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4127   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4128   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4130   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4131   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4132     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4133   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4134   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4135     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4136   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4137   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4138     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4139   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4140   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4141     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4142   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4143   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4144     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4145   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4146   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4147     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4148   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4149   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4150     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4151   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4152   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4153     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4154   case Intrinsic::amdgcn_buffer_atomic_fadd:
4155   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4156   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4157     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4158   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4159   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4160     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4161   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4162   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4163     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4164   default:
4165     llvm_unreachable("unhandled atomic opcode");
4166   }
4167 }
4168 
4169 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4170                                                MachineIRBuilder &B,
4171                                                Intrinsic::ID IID) const {
4172   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4173                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4174   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4175 
4176   Register Dst;
4177 
4178   int OpOffset = 0;
4179   if (HasReturn) {
4180     // A few FP atomics do not support return values.
4181     Dst = MI.getOperand(0).getReg();
4182   } else {
4183     OpOffset = -1;
4184   }
4185 
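  // Intrinsic operands after the ID: vdata, [cmp (cmpswap only)], rsrc,
  // [vindex (struct forms only)], voffset, soffset, aux. OpOffset accounts for
  // the optional return value and operands.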
4186   Register VData = MI.getOperand(2 + OpOffset).getReg();
4187   Register CmpVal;
4188 
4189   if (IsCmpSwap) {
4190     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4191     ++OpOffset;
4192   }
4193 
4194   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4195   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4196 
4197   // The struct intrinsic variants add one additional operand over raw.
4198   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4199   Register VIndex;
4200   if (HasVIndex) {
4201     VIndex = MI.getOperand(4 + OpOffset).getReg();
4202     ++OpOffset;
4203   } else {
4204     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4205   }
4206 
4207   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4208   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4209   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4210 
4211   MachineMemOperand *MMO = *MI.memoperands_begin();
4212 
4213   unsigned ImmOffset;
4214   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4215   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4216 
4217   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4218 
4219   if (HasReturn)
4220     MIB.addDef(Dst);
4221 
4222   MIB.addUse(VData); // vdata
4223 
4224   if (IsCmpSwap)
4225     MIB.addReg(CmpVal);
4226 
4227   MIB.addUse(RSrc)               // rsrc
4228      .addUse(VIndex)             // vindex
4229      .addUse(VOffset)            // voffset
4230      .addUse(SOffset)            // soffset
4231      .addImm(ImmOffset)          // offset(imm)
4232      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4233      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4234      .addMemOperand(MMO);
4235 
4236   MI.eraseFromParent();
4237   return true;
4238 }
4239 
/// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
/// values, appending them to \p PackedAddrs.
4242 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4243                                       SmallVectorImpl<Register> &PackedAddrs,
4244                                       unsigned ArgOffset,
4245                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4246                                       bool IsA16, bool IsG16) {
4247   const LLT S16 = LLT::scalar(16);
4248   const LLT V2S16 = LLT::fixed_vector(2, 16);
4249   auto EndIdx = Intr->VAddrEnd;
4250 
4251   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4252     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4253     if (!SrcOp.isReg())
4254       continue; // _L to _LZ may have eliminated this.
4255 
4256     Register AddrReg = SrcOp.getReg();
4257 
4258     if ((I < Intr->GradientStart) ||
4259         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4260         (I >= Intr->CoordStart && !IsA16)) {
4261       // Handle any gradient or coordinate operands that should not be packed
4262       if ((I < Intr->GradientStart) && IsA16 &&
4263           (B.getMRI()->getType(AddrReg) == S16)) {
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit dword.
4266         PackedAddrs.push_back(
4267             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4268                 .getReg(0));
4269       } else {
4270         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4271         PackedAddrs.push_back(AddrReg);
4272       }
4273     } else {
4274       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4275       // derivatives dx/dh and dx/dv are packed with undef.
4276       if (((I + 1) >= EndIdx) ||
4277           ((Intr->NumGradients / 2) % 2 == 1 &&
4278            (I == static_cast<unsigned>(Intr->GradientStart +
4279                                        (Intr->NumGradients / 2) - 1) ||
4280             I == static_cast<unsigned>(Intr->GradientStart +
4281                                        Intr->NumGradients - 1))) ||
4282           // Check for _L to _LZ optimization
4283           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4284         PackedAddrs.push_back(
4285             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4286                 .getReg(0));
4287       } else {
4288         PackedAddrs.push_back(
4289             B.buildBuildVector(
4290                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4291                 .getReg(0));
4292         ++I;
4293       }
4294     }
4295   }
4296 }
4297 
4298 /// Convert from separate vaddr components to a single vector address register,
4299 /// and replace the remaining operands with $noreg.
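/// For example, three s32 vaddr operands become one <3 x s32> G_BUILD_VECTOR
/// placed in the first vaddr slot, and the two remaining slots are set to
/// $noreg.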
4300 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4301                                      int DimIdx, int NumVAddrs) {
4302   const LLT S32 = LLT::scalar(32);
4303 
4304   SmallVector<Register, 8> AddrRegs;
4305   for (int I = 0; I != NumVAddrs; ++I) {
4306     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4307     if (SrcOp.isReg()) {
4308       AddrRegs.push_back(SrcOp.getReg());
4309       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4310     }
4311   }
4312 
4313   int NumAddrRegs = AddrRegs.size();
4314   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4316     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4317       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4318       auto Undef = B.buildUndef(S32);
4319       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4320       NumAddrRegs = RoundedNumRegs;
4321     }
4322 
4323     auto VAddr =
4324         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4325     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4326   }
4327 
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      SrcOp.setReg(AMDGPU::NoRegister);
  }
4333 }
4334 
4335 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4336 ///
4337 /// Depending on the subtarget, load/store with 16-bit element data need to be
4338 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4339 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4340 /// registers.
4341 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
4345 /// defining a multitude of intermediate image instructions, directly hack on
4346 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4347 /// padding now unnecessary arguments with $noreg.
4348 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4349     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4350     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4351 
4352   const unsigned NumDefs = MI.getNumExplicitDefs();
4353   const unsigned ArgOffset = NumDefs + 1;
4354   bool IsTFE = NumDefs == 2;
4355   // We are only processing the operands of d16 image operations on subtargets
4356   // that use the unpacked register layout, or need to repack the TFE result.
4357 
4358   // TODO: Do we need to guard against already legalized intrinsics?
4359   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4360       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4361 
4362   MachineRegisterInfo *MRI = B.getMRI();
4363   const LLT S32 = LLT::scalar(32);
4364   const LLT S16 = LLT::scalar(16);
4365   const LLT V2S16 = LLT::fixed_vector(2, 16);
4366 
4367   unsigned DMask = 0;
4368   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4369   LLT Ty = MRI->getType(VData);
4370 
  // Check for 16-bit addresses/gradients, which are packed below if present.
4372   LLT GradTy =
4373       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4374   LLT AddrTy =
4375       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4376   const bool IsG16 = GradTy == S16;
4377   const bool IsA16 = AddrTy == S16;
4378   const bool IsD16 = Ty.getScalarType() == S16;
4379 
4380   int DMaskLanes = 0;
4381   if (!BaseOpcode->Atomic) {
4382     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4383     if (BaseOpcode->Gather4) {
4384       DMaskLanes = 4;
4385     } else if (DMask != 0) {
4386       DMaskLanes = countPopulation(DMask);
4387     } else if (!IsTFE && !BaseOpcode->Store) {
4388       // If dmask is 0, this is a no-op load. This can be eliminated.
4389       B.buildUndef(MI.getOperand(0));
4390       MI.eraseFromParent();
4391       return true;
4392     }
4393   }
4394 
4395   Observer.changingInstr(MI);
4396   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4397 
4398   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4399                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4400   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4401                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4402   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4403 
4404   // Track that we legalized this
4405   MI.setDesc(B.getTII().get(NewOpcode));
4406 
  // Expecting to get an error flag since TFC is on - and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4409   if (IsTFE && DMask == 0) {
4410     DMask = 0x1;
4411     DMaskLanes = 1;
4412     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4413   }
4414 
4415   if (BaseOpcode->Atomic) {
4416     Register VData0 = MI.getOperand(2).getReg();
4417     LLT Ty = MRI->getType(VData0);
4418 
4419     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4420     if (Ty.isVector())
4421       return false;
4422 
4423     if (BaseOpcode->AtomicX2) {
4424       Register VData1 = MI.getOperand(3).getReg();
4425       // The two values are packed in one register.
4426       LLT PackedTy = LLT::fixed_vector(2, Ty);
4427       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4428       MI.getOperand(2).setReg(Concat.getReg(0));
4429       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4430     }
4431   }
4432 
4433   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4434 
4435   // Optimize _L to _LZ when _L is zero
4436   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4437           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4438     const ConstantFP *ConstantLod;
4439 
4440     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4441                  m_GFCst(ConstantLod))) {
4442       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4443         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4444         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4445             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4446                                                      Intr->Dim);
4447 
4448         // The starting indexes should remain in the same place.
4449         --CorrectedNumVAddrs;
4450 
4451         MI.getOperand(NumDefs).setIntrinsicID(
4452             static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4453         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4454         Intr = NewImageDimIntr;
4455       }
4456     }
4457   }
4458 
  // Optimize _mip away when 'lod' is zero
4460   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4461     int64_t ConstantLod;
4462     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4463                  m_ICst(ConstantLod))) {
4464       if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as is done for the _L to _LZ handling above.
4467         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4468         --CorrectedNumVAddrs;
4469       }
4470     }
4471   }
4472 
4473   // Rewrite the addressing register layout before doing anything else.
4474   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but are tied to the A16 control, so
    // both gradients and addresses must be 16-bit.
4477     return false;
4478   }
4479 
4480   if (IsA16 && !ST.hasA16()) {
4481     // A16 not supported
4482     return false;
4483   }
4484 
4485   if (IsA16 || IsG16) {
4486     if (Intr->NumVAddrs > 1) {
4487       SmallVector<Register, 4> PackedRegs;
4488 
4489       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4490                                 IsG16);
4491 
4492       // See also below in the non-a16 branch
4493       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4494                           PackedRegs.size() <= ST.getNSAMaxSize();
4495 
4496       if (!UseNSA && PackedRegs.size() > 1) {
4497         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4498         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4499         PackedRegs[0] = Concat.getReg(0);
4500         PackedRegs.resize(1);
4501       }
4502 
4503       const unsigned NumPacked = PackedRegs.size();
4504       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4505         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4506         if (!SrcOp.isReg()) {
4507           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4508           continue;
4509         }
4510 
4511         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4512 
4513         if (I - Intr->VAddrStart < NumPacked)
4514           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4515         else
4516           SrcOp.setReg(AMDGPU::NoRegister);
4517       }
4518     }
4519   } else {
4520     // If the register allocator cannot place the address registers contiguously
4521     // without introducing moves, then using the non-sequential address encoding
4522     // is always preferable, since it saves VALU instructions and is usually a
4523     // wash in terms of code size or even better.
4524     //
4525     // However, we currently have no way of hinting to the register allocator
4526     // that MIMG addresses should be placed contiguously when it is possible to
4527     // do so, so force non-NSA for the common 2-address case as a heuristic.
4528     //
4529     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4530     // allocation when possible.
4531     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4532                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4533 
4534     if (!UseNSA && Intr->NumVAddrs > 1)
4535       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4536                                Intr->NumVAddrs);
4537   }
4538 
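  // Record the a16/g16 state in an immediate flags operand for later
  // selection: bit 0 = a16, bit 1 = g16.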
4539   int Flags = 0;
4540   if (IsA16)
4541     Flags |= 1;
4542   if (IsG16)
4543     Flags |= 2;
4544   MI.addOperand(MachineOperand::CreateImm(Flags));
4545 
4546   if (BaseOpcode->Store) { // No TFE for stores?
4547     // TODO: Handle dmask trim
4548     if (!Ty.isVector() || !IsD16)
4549       return true;
4550 
4551     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4552     if (RepackedReg != VData) {
4553       MI.getOperand(1).setReg(RepackedReg);
4554     }
4555 
4556     return true;
4557   }
4558 
4559   Register DstReg = MI.getOperand(0).getReg();
4560   const LLT EltTy = Ty.getScalarType();
4561   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4562 
4563   // Confirm that the return type is large enough for the dmask specified
4564   if (NumElts < DMaskLanes)
4565     return false;
4566 
4567   if (NumElts > 4 || DMaskLanes > 4)
4568     return false;
4569 
4570   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4571   const LLT AdjustedTy =
4572       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4573 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4577   LLT RoundedTy;
4578 
  // S32 vector to cover all data, plus the TFE result element.
4580   LLT TFETy;
4581 
4582   // Register type to use for each loaded component. Will be S32 or V2S16.
4583   LLT RegTy;
4584 
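  // For example, an unpacked-d16 <3 x s16> load uses a <3 x s32> data type
  // (<4 x s32> with TFE) and repacks from s32 pieces, while a packed-d16
  // subtarget rounds the same load to <4 x s16> and repacks from <2 x s16>
  // pieces (s32 pieces when TFE is used).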
4585   if (IsD16 && ST.hasUnpackedD16VMem()) {
4586     RoundedTy =
4587         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4588     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4589     RegTy = S32;
4590   } else {
4591     unsigned EltSize = EltTy.getSizeInBits();
4592     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4593     unsigned RoundedSize = 32 * RoundedElts;
4594     RoundedTy = LLT::scalarOrVector(
4595         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4596     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4597     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4598   }
4599 
4600   // The return type does not need adjustment.
4601   // TODO: Should we change s16 case to s32 or <2 x s16>?
4602   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4603     return true;
4604 
4605   Register Dst1Reg;
4606 
4607   // Insert after the instruction.
4608   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4609 
4610   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4611   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4612   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4613   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4614 
4615   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4616 
4617   MI.getOperand(0).setReg(NewResultReg);
4618 
4619   // In the IR, TFE is supposed to be used with a 2 element struct return
4620   // type. The instruction really returns these two values in one contiguous
4621   // register, with one additional dword beyond the loaded data. Rewrite the
4622   // return type to use a single register result.
4623 
4624   if (IsTFE) {
4625     Dst1Reg = MI.getOperand(1).getReg();
4626     if (MRI->getType(Dst1Reg) != S32)
4627       return false;
4628 
4629     // TODO: Make sure the TFE operand bit is set.
4630     MI.RemoveOperand(1);
4631 
4632     // Handle the easy case that requires no repack instructions.
4633     if (Ty == S32) {
4634       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4635       return true;
4636     }
4637   }
4638 
4639   // Now figure out how to copy the new result register back into the old
4640   // result.
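  // The data slots are filled with fresh registers below; in the TFE case the
  // final slot keeps Dst1Reg so the unmerge writes the status dword directly
  // to its destination.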
4641   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4642 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4644 
4645   if (ResultNumRegs == 1) {
4646     assert(!IsTFE);
4647     ResultRegs[0] = NewResultReg;
4648   } else {
4649     // We have to repack into a new vector of some kind.
4650     for (int I = 0; I != NumDataRegs; ++I)
4651       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4652     B.buildUnmerge(ResultRegs, NewResultReg);
4653 
4654     // Drop the final TFE element to get the data part. The TFE result is
4655     // directly written to the right place already.
4656     if (IsTFE)
4657       ResultRegs.resize(NumDataRegs);
4658   }
4659 
4660   // For an s16 scalar result, we form an s32 result with a truncate regardless
4661   // of packed vs. unpacked.
4662   if (IsD16 && !Ty.isVector()) {
4663     B.buildTrunc(DstReg, ResultRegs[0]);
4664     return true;
4665   }
4666 
4667   // Avoid a build/concat_vector of 1 entry.
4668   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4669     B.buildBitcast(DstReg, ResultRegs[0]);
4670     return true;
4671   }
4672 
4673   assert(Ty.isVector());
4674 
4675   if (IsD16) {
4676     // For packed D16 results with TFE enabled, all the data components are
4677     // S32. Cast back to the expected type.
4678     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4681     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4682       for (Register &Reg : ResultRegs)
4683         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4684     } else if (ST.hasUnpackedD16VMem()) {
4685       for (Register &Reg : ResultRegs)
4686         Reg = B.buildTrunc(S16, Reg).getReg(0);
4687     }
4688   }
4689 
4690   auto padWithUndef = [&](LLT Ty, int NumElts) {
4691     if (NumElts == 0)
4692       return;
4693     Register Undef = B.buildUndef(Ty).getReg(0);
4694     for (int I = 0; I != NumElts; ++I)
4695       ResultRegs.push_back(Undef);
4696   };
4697 
4698   // Pad out any elements eliminated due to the dmask.
4699   LLT ResTy = MRI->getType(ResultRegs[0]);
4700   if (!ResTy.isVector()) {
4701     padWithUndef(ResTy, NumElts - ResultRegs.size());
4702     B.buildBuildVector(DstReg, ResultRegs);
4703     return true;
4704   }
4705 
4706   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4707   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4708 
4709   // Deal with the one annoying legal case.
4710   const LLT V3S16 = LLT::fixed_vector(3, 16);
4711   if (Ty == V3S16) {
4712     if (IsTFE) {
4713       if (ResultRegs.size() == 1) {
4714         NewResultReg = ResultRegs[0];
4715       } else if (ResultRegs.size() == 2) {
4716         LLT V4S16 = LLT::fixed_vector(4, 16);
4717         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4718       } else {
4719         return false;
4720       }
4721     }
4722 
4723     if (MRI->getType(DstReg).getNumElements() <
4724         MRI->getType(NewResultReg).getNumElements()) {
4725       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4726     } else {
4727       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4728     }
4729     return true;
4730   }
4731 
4732   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4733   B.buildConcatVectors(DstReg, ResultRegs);
4734   return true;
4735 }
4736 
4737 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4738   LegalizerHelper &Helper, MachineInstr &MI) const {
4739   MachineIRBuilder &B = Helper.MIRBuilder;
4740   GISelChangeObserver &Observer = Helper.Observer;
4741 
4742   Register Dst = MI.getOperand(0).getReg();
4743   LLT Ty = B.getMRI()->getType(Dst);
4744   unsigned Size = Ty.getSizeInBits();
4745   MachineFunction &MF = B.getMF();
4746 
4747   Observer.changingInstr(MI);
4748 
4749   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4750     Ty = getBitcastRegisterType(Ty);
4751     Helper.bitcastDst(MI, Ty, 0);
4752     Dst = MI.getOperand(0).getReg();
4753     B.setInsertPt(B.getMBB(), MI);
4754   }
4755 
4756   // FIXME: We don't really need this intermediate instruction. The intrinsic
4757   // should be fixed to have a memory operand. Since it's readnone, we're not
4758   // allowed to add one.
4759   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4760   MI.RemoveOperand(1); // Remove intrinsic ID
4761 
4762   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4763   // TODO: Should this use datalayout alignment?
4764   const unsigned MemSize = (Size + 7) / 8;
4765   const Align MemAlign(4);
4766   MachineMemOperand *MMO = MF.getMachineMemOperand(
4767       MachinePointerInfo(),
4768       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4769           MachineMemOperand::MOInvariant,
4770       MemSize, MemAlign);
4771   MI.addMemOperand(MF, MMO);
4772 
4773   // There are no 96-bit result scalar loads, but widening to 128-bit should
4774   // always be legal. We may need to restore this to a 96-bit result if it turns
4775   // out this needs to be converted to a vector load during RegBankSelect.
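  // e.g. an s96 result widens to s128, and <3 x s32> widens to <4 x s32>.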
4776   if (!isPowerOf2_32(Size)) {
4777     if (Ty.isVector())
4778       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4779     else
4780       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4781   }
4782 
4783   Observer.changedInstr(MI);
4784   return true;
4785 }
4786 
4787 // TODO: Move to selection
4788 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4789                                                 MachineRegisterInfo &MRI,
4790                                                 MachineIRBuilder &B) const {
4791   if (!ST.isTrapHandlerEnabled() ||
4792       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4793     return legalizeTrapEndpgm(MI, MRI, B);
4794 
4795   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4796     switch (*HsaAbiVer) {
4797     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4798     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4799       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4800     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4801       return ST.supportsGetDoorbellID() ?
4802           legalizeTrapHsa(MI, MRI, B) :
4803           legalizeTrapHsaQueuePtr(MI, MRI, B);
4804     }
4805   }
4806 
4807   llvm_unreachable("Unknown trap handler");
4808 }
4809 
4810 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4811     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4812   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4813   MI.eraseFromParent();
4814   return true;
4815 }
4816 
4817 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4818     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as input, and insert the trap
  // instruction.
4820   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4821   Register LiveIn =
4822     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4823   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4824     return false;
4825 
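  // The trap handler ABI takes the queue pointer in SGPR0-SGPR1, so copy it
  // there and attach it as an implicit use of S_TRAP.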
4826   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4827   B.buildCopy(SGPR01, LiveIn);
4828   B.buildInstr(AMDGPU::S_TRAP)
4829       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4830       .addReg(SGPR01, RegState::Implicit);
4831 
4832   MI.eraseFromParent();
4833   return true;
4834 }
4835 
4836 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4837     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4838   B.buildInstr(AMDGPU::S_TRAP)
4839       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4840   MI.eraseFromParent();
4841   return true;
4842 }
4843 
4844 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4845     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4848   if (!ST.isTrapHandlerEnabled() ||
4849       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4850     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4851                                      "debugtrap handler not supported",
4852                                      MI.getDebugLoc(), DS_Warning);
4853     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4854     Ctx.diagnose(NoTrap);
4855   } else {
4856     // Insert debug-trap instruction
4857     B.buildInstr(AMDGPU::S_TRAP)
4858         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4859   }
4860 
4861   MI.eraseFromParent();
4862   return true;
4863 }
4864 
4865 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4866                                                MachineIRBuilder &B) const {
4867   MachineRegisterInfo &MRI = *B.getMRI();
4868   const LLT S16 = LLT::scalar(16);
4869   const LLT S32 = LLT::scalar(32);
4870 
4871   Register DstReg = MI.getOperand(0).getReg();
4872   Register NodePtr = MI.getOperand(2).getReg();
4873   Register RayExtent = MI.getOperand(3).getReg();
4874   Register RayOrigin = MI.getOperand(4).getReg();
4875   Register RayDir = MI.getOperand(5).getReg();
4876   Register RayInvDir = MI.getOperand(6).getReg();
4877   Register TDescr = MI.getOperand(7).getReg();
4878 
4879   if (!ST.hasGFX10_AEncoding()) {
4880     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4881                                         "intrinsic not supported on subtarget",
4882                                         MI.getDebugLoc());
4883     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4884     return false;
4885   }
4886 
4887   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4888   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4889   const unsigned NumVDataDwords = 4;
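  // vaddr layout: node_ptr (1 or 2 dwords) + ray_extent (1) + ray_origin (3)
  // + ray_dir and ray_inv_dir (6 dwords, or 3 when packed as a16 halves).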
4890   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4891   const bool UseNSA =
4892       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4893   const unsigned BaseOpcodes[2][2] = {
4894       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4895       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4896        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4897   int Opcode;
4898   if (UseNSA) {
4899     Opcode =
4900         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4901                               NumVDataDwords, NumVAddrDwords);
4902   } else {
4903     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4904                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4905                                    PowerOf2Ceil(NumVAddrDwords));
4906   }
4907   assert(Opcode != -1);
4908 
4909   SmallVector<Register, 12> Ops;
4910   if (Is64) {
4911     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4912     Ops.push_back(Unmerge.getReg(0));
4913     Ops.push_back(Unmerge.getReg(1));
4914   } else {
4915     Ops.push_back(NodePtr);
4916   }
4917   Ops.push_back(RayExtent);
4918 
4919   auto packLanes = [&Ops, &S32, &B](Register Src) {
4920     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4921     Ops.push_back(Unmerge.getReg(0));
4922     Ops.push_back(Unmerge.getReg(1));
4923     Ops.push_back(Unmerge.getReg(2));
4924   };
4925 
4926   packLanes(RayOrigin);
4927   if (IsA16) {
4928     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4929     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4930     Register R1 = MRI.createGenericVirtualRegister(S32);
4931     Register R2 = MRI.createGenericVirtualRegister(S32);
4932     Register R3 = MRI.createGenericVirtualRegister(S32);
4933     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4934     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4935     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4936     Ops.push_back(R1);
4937     Ops.push_back(R2);
4938     Ops.push_back(R3);
4939   } else {
4940     packLanes(RayDir);
4941     packLanes(RayInvDir);
4942   }
4943 
4944   if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
4946     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4947     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4948     Ops.clear();
4949     Ops.push_back(MergedOps);
4950   }
4951 
4952   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4953     .addDef(DstReg)
4954     .addImm(Opcode);
4955 
4956   for (Register R : Ops) {
4957     MIB.addUse(R);
4958   }
4959 
4960   MIB.addUse(TDescr)
4961      .addImm(IsA16 ? 1 : 0)
4962      .cloneMemRefs(MI);
4963 
4964   MI.eraseFromParent();
4965   return true;
4966 }
4967 
4968 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4969                                             MachineInstr &MI) const {
4970   MachineIRBuilder &B = Helper.MIRBuilder;
4971   MachineRegisterInfo &MRI = *B.getMRI();
4972 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4974   auto IntrID = MI.getIntrinsicID();
4975   switch (IntrID) {
4976   case Intrinsic::amdgcn_if:
4977   case Intrinsic::amdgcn_else: {
4978     MachineInstr *Br = nullptr;
4979     MachineBasicBlock *UncondBrTarget = nullptr;
4980     bool Negated = false;
4981     if (MachineInstr *BrCond =
4982             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4983       const SIRegisterInfo *TRI
4984         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4985 
4986       Register Def = MI.getOperand(1).getReg();
4987       Register Use = MI.getOperand(3).getReg();
4988 
4989       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4990 
4991       if (Negated)
4992         std::swap(CondBrTarget, UncondBrTarget);
4993 
4994       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4995       if (IntrID == Intrinsic::amdgcn_if) {
4996         B.buildInstr(AMDGPU::SI_IF)
4997           .addDef(Def)
4998           .addUse(Use)
4999           .addMBB(UncondBrTarget);
5000       } else {
5001         B.buildInstr(AMDGPU::SI_ELSE)
5002             .addDef(Def)
5003             .addUse(Use)
5004             .addMBB(UncondBrTarget);
5005       }
5006 
5007       if (Br) {
5008         Br->getOperand(0).setMBB(CondBrTarget);
5009       } else {
5010         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5011         // since we're swapping branch targets it needs to be reinserted.
5012         // FIXME: IRTranslator should probably not do this
5013         B.buildBr(*CondBrTarget);
5014       }
5015 
5016       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5017       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5018       MI.eraseFromParent();
5019       BrCond->eraseFromParent();
5020       return true;
5021     }
5022 
5023     return false;
5024   }
5025   case Intrinsic::amdgcn_loop: {
5026     MachineInstr *Br = nullptr;
5027     MachineBasicBlock *UncondBrTarget = nullptr;
5028     bool Negated = false;
5029     if (MachineInstr *BrCond =
5030             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5031       const SIRegisterInfo *TRI
5032         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5033 
5034       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5035       Register Reg = MI.getOperand(2).getReg();
5036 
5037       if (Negated)
5038         std::swap(CondBrTarget, UncondBrTarget);
5039 
5040       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5041       B.buildInstr(AMDGPU::SI_LOOP)
5042         .addUse(Reg)
5043         .addMBB(UncondBrTarget);
5044 
5045       if (Br)
5046         Br->getOperand(0).setMBB(CondBrTarget);
5047       else
5048         B.buildBr(*CondBrTarget);
5049 
5050       MI.eraseFromParent();
5051       BrCond->eraseFromParent();
5052       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5053       return true;
5054     }
5055 
5056     return false;
5057   }
5058   case Intrinsic::amdgcn_kernarg_segment_ptr:
5059     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5060       // This only makes sense to call in a kernel, so just lower to null.
5061       B.buildConstant(MI.getOperand(0).getReg(), 0);
5062       MI.eraseFromParent();
5063       return true;
5064     }
5065 
5066     return legalizePreloadedArgIntrin(
5067       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5068   case Intrinsic::amdgcn_implicitarg_ptr:
5069     return legalizeImplicitArgPtr(MI, MRI, B);
5070   case Intrinsic::amdgcn_workitem_id_x:
5071     return legalizePreloadedArgIntrin(MI, MRI, B,
5072                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5073   case Intrinsic::amdgcn_workitem_id_y:
5074     return legalizePreloadedArgIntrin(MI, MRI, B,
5075                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5076   case Intrinsic::amdgcn_workitem_id_z:
5077     return legalizePreloadedArgIntrin(MI, MRI, B,
5078                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5079   case Intrinsic::amdgcn_workgroup_id_x:
5080     return legalizePreloadedArgIntrin(MI, MRI, B,
5081                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5082   case Intrinsic::amdgcn_workgroup_id_y:
5083     return legalizePreloadedArgIntrin(MI, MRI, B,
5084                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5085   case Intrinsic::amdgcn_workgroup_id_z:
5086     return legalizePreloadedArgIntrin(MI, MRI, B,
5087                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5088   case Intrinsic::amdgcn_dispatch_ptr:
5089     return legalizePreloadedArgIntrin(MI, MRI, B,
5090                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5091   case Intrinsic::amdgcn_queue_ptr:
5092     return legalizePreloadedArgIntrin(MI, MRI, B,
5093                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5094   case Intrinsic::amdgcn_implicit_buffer_ptr:
5095     return legalizePreloadedArgIntrin(
5096       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5097   case Intrinsic::amdgcn_dispatch_id:
5098     return legalizePreloadedArgIntrin(MI, MRI, B,
5099                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5100   case Intrinsic::amdgcn_fdiv_fast:
5101     return legalizeFDIVFastIntrin(MI, MRI, B);
5102   case Intrinsic::amdgcn_is_shared:
5103     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5104   case Intrinsic::amdgcn_is_private:
5105     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5106   case Intrinsic::amdgcn_wavefrontsize: {
5107     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5108     MI.eraseFromParent();
5109     return true;
5110   }
5111   case Intrinsic::amdgcn_s_buffer_load:
5112     return legalizeSBufferLoad(Helper, MI);
5113   case Intrinsic::amdgcn_raw_buffer_store:
5114   case Intrinsic::amdgcn_struct_buffer_store:
5115     return legalizeBufferStore(MI, MRI, B, false, false);
5116   case Intrinsic::amdgcn_raw_buffer_store_format:
5117   case Intrinsic::amdgcn_struct_buffer_store_format:
5118     return legalizeBufferStore(MI, MRI, B, false, true);
5119   case Intrinsic::amdgcn_raw_tbuffer_store:
5120   case Intrinsic::amdgcn_struct_tbuffer_store:
5121     return legalizeBufferStore(MI, MRI, B, true, true);
5122   case Intrinsic::amdgcn_raw_buffer_load:
5123   case Intrinsic::amdgcn_struct_buffer_load:
5124     return legalizeBufferLoad(MI, MRI, B, false, false);
5125   case Intrinsic::amdgcn_raw_buffer_load_format:
5126   case Intrinsic::amdgcn_struct_buffer_load_format:
5127     return legalizeBufferLoad(MI, MRI, B, true, false);
5128   case Intrinsic::amdgcn_raw_tbuffer_load:
5129   case Intrinsic::amdgcn_struct_tbuffer_load:
5130     return legalizeBufferLoad(MI, MRI, B, true, true);
5131   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5132   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5133   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5134   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5135   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5136   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5137   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5138   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5139   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5140   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5141   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5142   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5143   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5144   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5145   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5146   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5147   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5148   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5149   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5150   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5151   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5152   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5153   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5154   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5155   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5156   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5157   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5158   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5159   case Intrinsic::amdgcn_buffer_atomic_fadd:
5160   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5161   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5162   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5163   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5164     return legalizeBufferAtomic(MI, B, IntrID);
5165   case Intrinsic::amdgcn_atomic_inc:
5166     return legalizeAtomicIncDec(MI, B, true);
5167   case Intrinsic::amdgcn_atomic_dec:
5168     return legalizeAtomicIncDec(MI, B, false);
5169   case Intrinsic::trap:
5170     return legalizeTrapIntrinsic(MI, MRI, B);
5171   case Intrinsic::debugtrap:
5172     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5173   case Intrinsic::amdgcn_rsq_clamp:
5174     return legalizeRsqClampIntrinsic(MI, MRI, B);
5175   case Intrinsic::amdgcn_ds_fadd:
5176   case Intrinsic::amdgcn_ds_fmin:
5177   case Intrinsic::amdgcn_ds_fmax:
5178     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5179   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5180     return legalizeBVHIntrinsic(MI, B);
5181   default: {
5182     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5183             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5184       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5185     return true;
5186   }
5187   }
5188 
5189   return true;
5190 }
5191