1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of vector elements to the next power of two.
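// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.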
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits to the next power of two.
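// e.g. s24 becomes s32 and s65 becomes s128.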
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
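/// e.g. <3 x s16> is widened, but <3 x s32> is left alone since its size is
/// already a multiple of 32 bits.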
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
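// Reduce the number of vector elements so the result fits in 64 bits,
// e.g. <4 x s32> becomes <2 x s32> and <3 x s32> becomes <2 x s32>.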
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
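// e.g. <3 x s8> (24 bits) becomes <4 x s8> and <5 x s16> (80 bits) becomes
// <6 x s16>.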
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
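// Map a type to the equivalent scalar (for sizes up to 32 bits) or vector of
// 32-bit elements, e.g. <2 x s8> maps to s16 and <4 x s16> maps to <2 x s32>.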
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
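// e.g. s96 and <2 x s16> qualify, but <3 x s16> (48 bits) does not.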
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
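// e.g. a G_ZEXTLOAD of s32 memory into an s64 register.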
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
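// e.g. <4 x s16> (64 bits) is left alone, while <6 x s16> and <8 x s16> are
// bitcast.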
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
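/// e.g. a non-extending <6 x s16> access is coerced to <3 x s32>.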
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes size, not the size of the result register.
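/// e.g. a 96-bit global load that is 128-bit aligned may be widened to 128
/// bits on subtargets without 96-bit load/store instructions.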
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
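// Query-based wrapper: never widen atomic accesses; otherwise defer to the
// type/alignment check above.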
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
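  // Whether a load/store must be split: vector extloads, accesses wider than
  // the address space allows, 96-bit accesses without dwordx3 support, or
  // sizes that do not break into a power-of-2 number of 32-bit registers.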
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
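  // An alignment value of 0 in the memory descriptors below effectively drops
  // the alignment requirement when unaligned buffer access is enabled.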
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117                                       {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S64, ConstantPtr, V2S64, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175               unsigned Align = Query.MMODescrs[0].AlignInBits;
1176               return std::make_pair(0, LLT::scalar(Align));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
1298   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299   // demarshalling
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1430           // TODO: verify immediate offset here since lower only works with
1431           // whole elements.
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return BigTy.isVector();
1434         })
1435       // FIXME: Multiples of 16 should not be legal.
1436       .legalIf([=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (BigTy.getSizeInBits() % 32 == 0) &&
1440                  (LitTy.getSizeInBits() % 16 == 0);
1441         })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
1456 
1457   }
1458 
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472 
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1476     BuildVector.minScalarOrElt(0, S32);
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480 
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485 
1486   BuildVector.legalIf(isRegisterType(0));
1487 
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496   // pre-legalize.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520 
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525           const LLT BigTy = Query.Types[BigTyIdx];
1526           return BigTy.getSizeInBits() == 32;
1527         })
1528       // Try to widen to s16 first for small types.
1529       // TODO: Only do this on targets with legal s16 shifts
1530       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534                            elementTypeIs(1, S16)),
1535                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1539       .clampScalar(LitTyIdx, S32, S512)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541       // Break up vectors with weird elements into scalars
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544         scalarize(0))
1545       .fewerElementsIf(
1546         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547         scalarize(1))
1548       .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550     if (Op == G_MERGE_VALUES) {
1551       Builder.widenScalarIf(
1552         // TODO: Use 16-bit shifts if legal for 8-bit values?
1553         [=](const LegalityQuery &Query) {
1554           const LLT Ty = Query.Types[LitTyIdx];
1555           return Ty.getSizeInBits() < 32;
1556         },
1557         changeTo(LitTyIdx, S32));
1558     }
1559 
1560     Builder.widenScalarIf(
1561       [=](const LegalityQuery &Query) {
1562         const LLT Ty = Query.Types[BigTyIdx];
1563         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564           Ty.getSizeInBits() % 16 != 0;
1565       },
1566       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1569         const LLT &Ty = Query.Types[BigTyIdx];
1570         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571         if (NewSizeInBits >= 256) {
1572           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573           if (RoundedTo < NewSizeInBits)
1574             NewSizeInBits = RoundedTo;
1575         }
1576         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577       })
1578       // Any vectors left are the wrong size. Scalarize them.
1579       .scalarize(0)
1580       .scalarize(1);
1581   }
1582 
1583   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584   // RegBankSelect.
1585   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586     .legalFor({{S32}, {S64}});
1587 
1588   if (ST.hasVOP3PInsts()) {
1589     SextInReg.lowerFor({{V2S16}})
1590       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591       // get more vector shift opportunities, since we'll get those when
1592       // expanded.
1593       .clampMaxNumElementsStrict(0, S16, 2);
1594   } else if (ST.has16BitInsts()) {
1595     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1599     SextInReg.lowerFor({{S32}, {S64}});
1600   }
1601 
1602   SextInReg
1603     .scalarize(0)
1604     .clampScalar(0, S32, S64)
1605     .lower();
1606 
1607   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608     .scalarize(0)
1609     .lower();
1610 
  // TODO: Only try to form v2s16 with legal packed instructions.
1612   getActionDefinitionsBuilder(G_FSHR)
1613     .legalFor({{S32, S32}})
1614     .lowerFor({{V2S16, V2S16}})
1615     .clampMaxNumElementsStrict(0, S16, 2)
1616     .scalarize(0)
1617     .lower();
1618 
1619   if (ST.hasVOP3PInsts()) {
1620     getActionDefinitionsBuilder(G_FSHL)
1621       .lowerFor({{V2S16, V2S16}})
1622       .clampMaxNumElementsStrict(0, S16, 2)
1623       .scalarize(0)
1624       .lower();
1625   } else {
1626     getActionDefinitionsBuilder(G_FSHL)
1627       .scalarize(0)
1628       .lower();
1629   }
1630 
1631   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632     .legalFor({S64});
1633 
1634   getActionDefinitionsBuilder(G_FENCE)
1635     .alwaysLegal();
1636 
1637   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638       .scalarize(0)
1639       .minScalar(0, S32)
1640       .lower();
1641 
1642   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643       .legalFor({{S32, S32}, {S64, S32}})
1644       .clampScalar(1, S32, S32)
1645       .clampScalar(0, S32, S64)
1646       .widenScalarToNextPow2(0)
1647       .scalarize(0);
1648 
1649   getActionDefinitionsBuilder({
1650       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651       G_FCOPYSIGN,
1652 
1653       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654       G_ATOMICRMW_NAND,
1655       G_ATOMICRMW_FSUB,
1656       G_READ_REGISTER,
1657       G_WRITE_REGISTER,
1658 
1659       G_SADDO, G_SSUBO,
1660 
      // TODO: Implement
1662       G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   case TargetOpcode::G_CTLZ:
1747   case TargetOpcode::G_CTTZ:
1748     return legalizeCTLZ_CTTZ(MI, MRI, B);
1749   default:
1750     return false;
1751   }
1752 
1753   llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757   unsigned AS,
1758   MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762   const LLT S32 = LLT::scalar(32);
1763 
1764   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766   if (ST.hasApertureRegs()) {
1767     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768     // getreg.
1769     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775     unsigned Encoding =
1776         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782     B.buildInstr(AMDGPU::S_GETREG_B32)
1783       .addDef(GetReg)
1784       .addImm(Encoding);
1785     MRI.setType(GetReg, S32);
1786 
1787     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789   }
1790 
1791   Register QueuePtr = MRI.createGenericVirtualRegister(
1792     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795     return Register();
1796 
1797   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798   // private_segment_aperture_base_hi.
1799   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
1801   // TODO: can we be smarter about machine pointer info?
1802   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803   MachineMemOperand *MMO = MF.getMachineMemOperand(
1804       PtrInfo,
1805       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806           MachineMemOperand::MOInvariant,
1807       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809   Register LoadAddr;
1810 
1811   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   MachineFunction &MF = B.getMF();
1819 
1820   const LLT S32 = LLT::scalar(32);
1821   Register Dst = MI.getOperand(0).getReg();
1822   Register Src = MI.getOperand(1).getReg();
1823 
1824   LLT DstTy = MRI.getType(Dst);
1825   LLT SrcTy = MRI.getType(Src);
1826   unsigned DestAS = DstTy.getAddressSpace();
1827   unsigned SrcAS = SrcTy.getAddressSpace();
1828 
1829   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1830   // vector element.
1831   assert(!DstTy.isVector());
1832 
1833   const AMDGPUTargetMachine &TM
1834     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1835 
1836   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1837     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1838     return true;
1839   }
1840 
1841   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1842     // Truncate.
1843     B.buildExtract(Dst, Src, 0);
1844     MI.eraseFromParent();
1845     return true;
1846   }
1847 
1848   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1849     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1850     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1851 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1855     auto HighAddr = B.buildConstant(
1856       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1857     B.buildMerge(Dst, {Src, HighAddr});
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1863     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1864            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
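    // Casting flat -> local/private: the segment offset is the low 32 bits of
    // the flat pointer, except that the flat null value must map to the
    // segment null value.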
1865     unsigned NullVal = TM.getNullPointerValue(DestAS);
1866 
1867     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1868     auto FlatNull = B.buildConstant(SrcTy, 0);
1869 
1870     // Extract low 32-bits of the pointer.
1871     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1872 
1873     auto CmpRes =
1874         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1875     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1876 
1877     MI.eraseFromParent();
1878     return true;
1879   }
1880 
1881   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1882     return false;
1883 
1884   if (!ST.hasFlatAddressSpace())
1885     return false;
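  // Casting local/private -> flat: pair the 32-bit segment offset with the
  // aperture base in the high half of the 64-bit flat pointer, and map the
  // segment null value to the flat null pointer.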
1886 
1887   auto SegmentNull =
1888       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1889   auto FlatNull =
1890       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1891 
1892   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1893   if (!ApertureReg.isValid())
1894     return false;
1895 
1896   auto CmpRes =
1897       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1898 
1899   // Coerce the type of the low half of the result so we can use merge_values.
1900   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1901 
1902   // TODO: Should we allow mismatched types but matching sizes in merges to
1903   // avoid the ptrtoint?
1904   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1905   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1906 
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrint(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914   Register Src = MI.getOperand(1).getReg();
1915   LLT Ty = MRI.getType(Src);
1916   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
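  // Round to an integer by adding and subtracting 2^52 with the sign of the
  // source; at magnitude 2^52 an f64 has no fractional bits, so the addition
  // rounds away the fraction. Inputs whose magnitude exceeds C2 (the largest
  // double below 2^52) are already integral and are returned unchanged.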
1917 
1918   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1919   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1920 
1921   auto C1 = B.buildFConstant(Ty, C1Val);
1922   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1923 
1924   // TODO: Should this propagate fast-math-flags?
1925   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1926   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1927 
1928   auto C2 = B.buildFConstant(Ty, C2Val);
1929   auto Fabs = B.buildFAbs(Ty, Src);
1930 
1931   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1932   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1933   MI.eraseFromParent();
1934   return true;
1935 }
1936 
1937 bool AMDGPULegalizerInfo::legalizeFceil(
1938   MachineInstr &MI, MachineRegisterInfo &MRI,
1939   MachineIRBuilder &B) const {
1940 
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S64 = LLT::scalar(64);
1943 
1944   Register Src = MI.getOperand(1).getReg();
1945   assert(MRI.getType(Src) == S64);
1946 
1947   // result = trunc(src)
1948   // if (src > 0.0 && src != result)
1949   //   result += 1.0
1950 
1951   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1952 
1953   const auto Zero = B.buildFConstant(S64, 0.0);
1954   const auto One = B.buildFConstant(S64, 1.0);
1955   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1956   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1957   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1958   auto Add = B.buildSelect(S64, And, One, Zero);
1959 
1960   // TODO: Should this propagate fast-math-flags?
1961   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1962   return true;
1963 }
1964 
1965 bool AMDGPULegalizerInfo::legalizeFrem(
1966   MachineInstr &MI, MachineRegisterInfo &MRI,
1967   MachineIRBuilder &B) const {
1968     Register DstReg = MI.getOperand(0).getReg();
1969     Register Src0Reg = MI.getOperand(1).getReg();
1970     Register Src1Reg = MI.getOperand(2).getReg();
1971     auto Flags = MI.getFlags();
1972     LLT Ty = MRI.getType(DstReg);
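    // frem(x, y) == x - trunc(x / y) * y, implemented with a fused
    // multiply-add: fma(-trunc(x / y), y, x).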
1973 
1974     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1975     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1976     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1977     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1978     MI.eraseFromParent();
1979     return true;
1980 }
1981 
1982 static MachineInstrBuilder extractF64Exponent(Register Hi,
1983                                               MachineIRBuilder &B) {
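  // Bits [20, 30] of the high dword hold the biased 11-bit exponent of the
  // f64 value; extract them with ubfe and subtract the IEEE-754 bias of 1023.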
1984   const unsigned FractBits = 52;
1985   const unsigned ExpBits = 11;
1986   LLT S32 = LLT::scalar(32);
1987 
1988   auto Const0 = B.buildConstant(S32, FractBits - 32);
1989   auto Const1 = B.buildConstant(S32, ExpBits);
1990 
1991   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1992     .addUse(Hi)
1993     .addUse(Const0.getReg(0))
1994     .addUse(Const1.getReg(0));
1995 
1996   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002   const LLT S1 = LLT::scalar(1);
2003   const LLT S32 = LLT::scalar(32);
2004   const LLT S64 = LLT::scalar(64);
2005 
2006   Register Src = MI.getOperand(1).getReg();
2007   assert(MRI.getType(Src) == S64);
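  // Implement trunc with integer ops: build a mask of the fraction bits that
  // sit below the exponent and clear them. If the exponent is negative the
  // result is +/-0 (sign bit only); if it is greater than 51 there are no
  // fractional bits and the source is returned unchanged.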
2008 
2009   // TODO: Should this use extract since the low half is unused?
2010   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2011   Register Hi = Unmerge.getReg(1);
2012 
2013   // Extract the upper half, since this is where we will find the sign and
2014   // exponent.
2015   auto Exp = extractF64Exponent(Hi, B);
2016 
2017   const unsigned FractBits = 52;
2018 
2019   // Extract the sign bit.
2020   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2021   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2022 
2023   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2024 
2025   const auto Zero32 = B.buildConstant(S32, 0);
2026 
2027   // Extend back to 64-bits.
2028   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2029 
2030   auto Shr = B.buildAShr(S64, FractMask, Exp);
2031   auto Not = B.buildNot(S64, Shr);
2032   auto Tmp0 = B.buildAnd(S64, Src, Not);
2033   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2034 
2035   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2036   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2037 
2038   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2039   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2040   MI.eraseFromParent();
2041   return true;
2042 }
2043 
2044 bool AMDGPULegalizerInfo::legalizeITOFP(
2045   MachineInstr &MI, MachineRegisterInfo &MRI,
2046   MachineIRBuilder &B, bool Signed) const {
2047 
2048   Register Dst = MI.getOperand(0).getReg();
2049   Register Src = MI.getOperand(1).getReg();
2050 
2051   const LLT S64 = LLT::scalar(64);
2052   const LLT S32 = LLT::scalar(32);
2053 
2054   assert(MRI.getType(Src) == S64);
2055 
2056   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2057   auto ThirtyTwo = B.buildConstant(S32, 32);
2058 
2059   if (MRI.getType(Dst) == S64) {
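    // Convert the two halves separately: the high half is converted signed or
    // unsigned as required and scaled by 2^32 with ldexp, then the unsigned
    // low half is added in.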
2060     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2061                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2062 
2063     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2064     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2065                      .addUse(CvtHi.getReg(0))
2066                      .addUse(ThirtyTwo.getReg(0));
2067 
2068     // TODO: Should this propagate fast-math-flags?
2069     B.buildFAdd(Dst, LdExp, CvtLo);
2070     MI.eraseFromParent();
2071     return true;
2072   }
2073 
2074   assert(MRI.getType(Dst) == S32);
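  // For a 32-bit result, normalize the 64-bit source with a left shift so the
  // significant bits land in the high word, OR a sticky bit from the low word
  // into the result so rounding still observes the shifted-out bits, convert
  // the high word, and rescale with ldexp by (32 - shift).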
2075 
2076   auto One = B.buildConstant(S32, 1);
2077 
2078   MachineInstrBuilder ShAmt;
2079   if (Signed) {
2080     auto ThirtyOne = B.buildConstant(S32, 31);
2081     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2082     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2083     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2084     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2085                                /*HasSideEffects=*/false)
2086                   .addUse(Unmerge.getReg(1));
2087     auto LS2 = B.buildSub(S32, LS, One);
2088     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2089   } else
2090     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2091   auto Norm = B.buildShl(S64, Src, ShAmt);
2092   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2093   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2094   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2095   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2096   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2097   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2098                    /*HasSideEffects=*/false)
2099       .addUse(FVal.getReg(0))
2100       .addUse(Scale.getReg(0));
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 // TODO: Copied from DAG implementation. Verify logic and document how this
2106 // actually works.
2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2108                                         MachineRegisterInfo &MRI,
2109                                         MachineIRBuilder &B,
2110                                         bool Signed) const {
2111 
2112   Register Dst = MI.getOperand(0).getReg();
2113   Register Src = MI.getOperand(1).getReg();
2114 
2115   const LLT S64 = LLT::scalar(64);
2116   const LLT S32 = LLT::scalar(32);
2117 
2118   const LLT SrcLT = MRI.getType(Src);
2119   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2120 
2121   unsigned Flags = MI.getFlags();
2122 
2123   // The basic idea of converting a floating point number into a pair of 32-bit
2124   // integers is illustrated as follows:
2125   //
2126   //     tf := trunc(val);
2127   //    hif := floor(tf * 2^-32);
2128   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2129   //     hi := fptoi(hif);
2130   //     lo := fptoi(lof);
2131   //
2132   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2133   MachineInstrBuilder Sign;
2134   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
2140     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2141     Trunc = B.buildFAbs(S32, Trunc, Flags);
2142   }
2143   MachineInstrBuilder K0, K1;
2144   if (SrcLT == S64) {
2145     K0 = B.buildFConstant(S64,
2146                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2147     K1 = B.buildFConstant(S64,
2148                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2149   } else {
2150     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2151     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2152   }
2153 
2154   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2155   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2156   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2157 
2158   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2159                                      : B.buildFPTOUI(S32, FloorMul);
2160   auto Lo = B.buildFPTOUI(S32, Fma);
2161 
2162   if (Signed && SrcLT == S32) {
2163     // Flip the result based on the signedness, which is either all 0s or 1s.
2164     Sign = B.buildMerge(S64, {Sign, Sign});
2165     // r := xor({lo, hi}, sign) - sign;
2166     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2167   } else
2168     B.buildMerge(Dst, {Lo, Hi});
2169   MI.eraseFromParent();
2170 
2171   return true;
2172 }
2173 
2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2175                                                MachineInstr &MI) const {
2176   MachineFunction &MF = Helper.MIRBuilder.getMF();
2177   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2178 
2179   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2180                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2181 
2182   // With ieee_mode disabled, the instructions have the correct behavior
2183   // already for G_FMINNUM/G_FMAXNUM
2184   if (!MFI->getMode().IEEE)
2185     return !IsIEEEOp;
2186 
2187   if (IsIEEEOp)
2188     return true;
2189 
2190   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2194   MachineInstr &MI, MachineRegisterInfo &MRI,
2195   MachineIRBuilder &B) const {
2196   // TODO: Should move some of this into LegalizerHelper.
2197 
2198   // TODO: Promote dynamic indexing of s16 to s32
2199 
2200   // FIXME: Artifact combiner probably should have replaced the truncated
2201   // constant before this, so we shouldn't need
2202   // getIConstantVRegValWithLookThrough.
2203   Optional<ValueAndVReg> MaybeIdxVal =
2204       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2205   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2206     return true;
2207   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2208 
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Vec = MI.getOperand(1).getReg();
2211 
2212   LLT VecTy = MRI.getType(Vec);
2213   LLT EltTy = VecTy.getElementType();
2214   assert(EltTy == MRI.getType(Dst));
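  // With a known constant index, unmerge the vector and copy out the selected
  // element; a constant index that is out of bounds produces undef.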
2215 
2216   if (IdxVal < VecTy.getNumElements()) {
2217     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2218     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2219   } else {
2220     B.buildUndef(Dst);
2221   }
2222 
2223   MI.eraseFromParent();
2224   return true;
2225 }
2226 
2227 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2228   MachineInstr &MI, MachineRegisterInfo &MRI,
2229   MachineIRBuilder &B) const {
2230   // TODO: Should move some of this into LegalizerHelper.
2231 
2232   // TODO: Promote dynamic indexing of s16 to s32
2233 
2234   // FIXME: Artifact combiner probably should have replaced the truncated
2235   // constant before this, so we shouldn't need
2236   // getIConstantVRegValWithLookThrough.
2237   Optional<ValueAndVReg> MaybeIdxVal =
2238       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2239   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2240     return true;
2241 
2242   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2243   Register Dst = MI.getOperand(0).getReg();
2244   Register Vec = MI.getOperand(1).getReg();
2245   Register Ins = MI.getOperand(2).getReg();
2246 
2247   LLT VecTy = MRI.getType(Vec);
2248   LLT EltTy = VecTy.getElementType();
2249   assert(EltTy == MRI.getType(Ins));
2250   (void)Ins;
2251 
2252   unsigned NumElts = VecTy.getNumElements();
2253   if (IdxVal < NumElts) {
2254     SmallVector<Register, 8> SrcRegs;
2255     for (unsigned i = 0; i < NumElts; ++i)
2256       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2257     B.buildUnmerge(SrcRegs, Vec);
2258 
2259     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2260     B.buildMerge(Dst, SrcRegs);
2261   } else {
2262     B.buildUndef(Dst);
2263   }
2264 
2265   MI.eraseFromParent();
2266   return true;
2267 }
2268 
2269 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2270   MachineInstr &MI, MachineRegisterInfo &MRI,
2271   MachineIRBuilder &B) const {
2272   const LLT V2S16 = LLT::fixed_vector(2, 16);
2273 
2274   Register Dst = MI.getOperand(0).getReg();
2275   Register Src0 = MI.getOperand(1).getReg();
2276   LLT DstTy = MRI.getType(Dst);
2277   LLT SrcTy = MRI.getType(Src0);
2278 
2279   if (SrcTy == V2S16 && DstTy == V2S16 &&
2280       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2281     return true;
2282 
2283   MachineIRBuilder HelperBuilder(MI);
2284   GISelObserverWrapper DummyObserver;
2285   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2286   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2287 }
2288 
2289 bool AMDGPULegalizerInfo::legalizeSinCos(
2290   MachineInstr &MI, MachineRegisterInfo &MRI,
2291   MachineIRBuilder &B) const {
2292 
2293   Register DstReg = MI.getOperand(0).getReg();
2294   Register SrcReg = MI.getOperand(1).getReg();
2295   LLT Ty = MRI.getType(DstReg);
2296   unsigned Flags = MI.getFlags();
2297 
2298   Register TrigVal;
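  // The hardware sin/cos take an input scaled by 1/(2*pi); on subtargets with
  // a reduced trig input range the scaled value is also wrapped into [0, 1)
  // with fract first.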
2299   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2300   if (ST.hasTrigReducedRange()) {
2301     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2302     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2303       .addUse(MulVal.getReg(0))
2304       .setMIFlags(Flags).getReg(0);
2305   } else
2306     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2307 
2308   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2309     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2310   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2311     .addUse(TrigVal)
2312     .setMIFlags(Flags);
2313   MI.eraseFromParent();
2314   return true;
2315 }
2316 
2317 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2318                                                   MachineIRBuilder &B,
2319                                                   const GlobalValue *GV,
2320                                                   int64_t Offset,
2321                                                   unsigned GAFlags) const {
2322   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2323   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2324   // to the following code sequence:
2325   //
2326   // For constant address space:
2327   //   s_getpc_b64 s[0:1]
2328   //   s_add_u32 s0, s0, $symbol
2329   //   s_addc_u32 s1, s1, 0
2330   //
2331   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2332   //   a fixup or relocation is emitted to replace $symbol with a literal
2333   //   constant, which is a pc-relative offset from the encoding of the $symbol
2334   //   operand to the global variable.
2335   //
2336   // For global address space:
2337   //   s_getpc_b64 s[0:1]
2338   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2339   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2340   //
2341   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2342   //   fixups or relocations are emitted to replace $symbol@*@lo and
2343   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2344   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2345   //   operand to the global variable.
2346   //
2347   // What we want here is an offset from the value returned by s_getpc
2348   // (which is the address of the s_add_u32 instruction) to the global
2349   // variable, but since the encoding of $symbol starts 4 bytes after the start
2350   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2351   // small. This requires us to add 4 to the global variable offset in order to
2352   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2353   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2354   // instruction.
2355 
2356   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2357 
2358   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2359     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2360 
2361   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2362     .addDef(PCReg);
2363 
2364   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2365   if (GAFlags == SIInstrInfo::MO_NONE)
2366     MIB.addImm(0);
2367   else
2368     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2369 
2370   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2371 
2372   if (PtrTy.getSizeInBits() == 32)
2373     B.buildExtract(DstReg, PCReg, 0);
2374   return true;
2375  }
2376 
2377 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2378   MachineInstr &MI, MachineRegisterInfo &MRI,
2379   MachineIRBuilder &B) const {
2380   Register DstReg = MI.getOperand(0).getReg();
2381   LLT Ty = MRI.getType(DstReg);
2382   unsigned AS = Ty.getAddressSpace();
2383 
2384   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2385   MachineFunction &MF = B.getMF();
2386   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2387 
2388   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2389     if (!MFI->isModuleEntryFunction() &&
2390         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2391       const Function &Fn = MF.getFunction();
2392       DiagnosticInfoUnsupported BadLDSDecl(
2393         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2394         DS_Warning);
2395       Fn.getContext().diagnose(BadLDSDecl);
2396 
2397       // We currently don't have a way to correctly allocate LDS objects that
2398       // aren't directly associated with a kernel. We do force inlining of
2399       // functions that use local objects. However, if these dead functions are
2400       // not eliminated, we don't want a compile time error. Just emit a warning
2401       // and a trap, since there should be no callable path here.
2402       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2403       B.buildUndef(DstReg);
2404       MI.eraseFromParent();
2405       return true;
2406     }
2407 
2408     // TODO: We could emit code to handle the initialization somewhere.
2409     // We ignore the initializer for now and legalize it to allow selection.
2410     // The initializer will anyway get errored out during assembly emission.
2411     const SITargetLowering *TLI = ST.getTargetLowering();
2412     if (!TLI->shouldUseLDSConstAddress(GV)) {
2413       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2415     }
2416 
2417     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2418       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such variables are allocated
      // by the runtime and placed directly after the statically allocated
      // ones, and they all share the same offset.
2424       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2425         // Adjust alignment for that dynamic shared memory array.
2426         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2427         LLT S32 = LLT::scalar(32);
2428         auto Sz =
2429             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2430         B.buildIntToPtr(DstReg, Sz);
2431         MI.eraseFromParent();
2432         return true;
2433       }
2434     }
2435 
2436     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2437                                                    *cast<GlobalVariable>(GV)));
2438     MI.eraseFromParent();
2439     return true;
2440   }
2441 
2442   const SITargetLowering *TLI = ST.getTargetLowering();
2443 
2444   if (TLI->shouldEmitFixup(GV)) {
2445     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2446     MI.eraseFromParent();
2447     return true;
2448   }
2449 
2450   if (TLI->shouldEmitPCReloc(GV)) {
2451     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2452     MI.eraseFromParent();
2453     return true;
2454   }
2455 
2456   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2457   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2458 
2459   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2460   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2461       MachinePointerInfo::getGOT(MF),
2462       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2463           MachineMemOperand::MOInvariant,
2464       LoadTy, Align(8));
2465 
2466   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2467 
2468   if (Ty.getSizeInBits() == 32) {
2469     // Truncate if this is a 32-bit constant address.
2470     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2471     B.buildExtract(DstReg, Load, 0);
2472   } else
2473     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2474 
2475   MI.eraseFromParent();
2476   return true;
2477 }
2478 
2479 static LLT widenToNextPowerOf2(LLT Ty) {
2480   if (Ty.isVector())
2481     return Ty.changeElementCount(
2482         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2483   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2484 }
2485 
2486 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2487                                        MachineInstr &MI) const {
2488   MachineIRBuilder &B = Helper.MIRBuilder;
2489   MachineRegisterInfo &MRI = *B.getMRI();
2490   GISelChangeObserver &Observer = Helper.Observer;
2491 
2492   Register PtrReg = MI.getOperand(1).getReg();
2493   LLT PtrTy = MRI.getType(PtrReg);
2494   unsigned AddrSpace = PtrTy.getAddressSpace();
2495 
2496   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2497     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2498     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2499     Observer.changingInstr(MI);
2500     MI.getOperand(1).setReg(Cast.getReg(0));
2501     Observer.changedInstr(MI);
2502     return true;
2503   }
2504 
2505   if (MI.getOpcode() != AMDGPU::G_LOAD)
2506     return false;
2507 
2508   Register ValReg = MI.getOperand(0).getReg();
2509   LLT ValTy = MRI.getType(ValReg);
2510 
2511   MachineMemOperand *MMO = *MI.memoperands_begin();
2512   const unsigned ValSize = ValTy.getSizeInBits();
2513   const LLT MemTy = MMO->getMemoryType();
2514   const Align MemAlign = MMO->getAlign();
2515   const unsigned MemSize = MemTy.getSizeInBits();
2516   const unsigned AlignInBits = 8 * MemAlign.value();
2517 
2518   // Widen non-power-of-2 loads to the alignment if needed
2519   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2520     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2521 
2522     // This was already the correct extending load result type, so just adjust
2523     // the memory type.
2524     if (WideMemSize == ValSize) {
2525       MachineFunction &MF = B.getMF();
2526 
2527       MachineMemOperand *WideMMO =
2528           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2529       Observer.changingInstr(MI);
2530       MI.setMemRefs(MF, {WideMMO});
2531       Observer.changedInstr(MI);
2532       return true;
2533     }
2534 
    // Don't bother handling an edge case that should probably never be
    // produced.
2536     if (ValSize > WideMemSize)
2537       return false;
2538 
2539     LLT WideTy = widenToNextPowerOf2(ValTy);
2540 
2541     Register WideLoad;
2542     if (!WideTy.isVector()) {
2543       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2544       B.buildTrunc(ValReg, WideLoad).getReg(0);
2545     } else {
2546       // Extract the subvector.
2547 
2548       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2550         // (e.g. <3 x s32> -> <4 x s32>)
2551         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2552         B.buildExtract(ValReg, WideLoad, 0);
2553       } else {
        // For cases where the widened type isn't a nice register value, drop
        // the trailing elements of the wide load (e.g. <3 x s16> -> <4 x s16>)
2556         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2557         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2558       }
2559     }
2560 
2561     MI.eraseFromParent();
2562     return true;
2563   }
2564 
2565   return false;
2566 }
2567 
2568 bool AMDGPULegalizerInfo::legalizeFMad(
2569   MachineInstr &MI, MachineRegisterInfo &MRI,
2570   MachineIRBuilder &B) const {
2571   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2572   assert(Ty.isScalar());
2573 
2574   MachineFunction &MF = B.getMF();
2575   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2576 
2577   // TODO: Always legal with future ftz flag.
  // FIXME: Do we need to check just the output denormal mode?
2579   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2580     return true;
2581   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2582     return true;
2583 
2584   MachineIRBuilder HelperBuilder(MI);
2585   GISelObserverWrapper DummyObserver;
2586   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2587   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2591   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2592   Register DstReg = MI.getOperand(0).getReg();
2593   Register PtrReg = MI.getOperand(1).getReg();
2594   Register CmpVal = MI.getOperand(2).getReg();
2595   Register NewVal = MI.getOperand(3).getReg();
2596 
2597   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2598          "this should not have been custom lowered");
2599 
2600   LLT ValTy = MRI.getType(CmpVal);
2601   LLT VecTy = LLT::fixed_vector(2, ValTy);
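  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a two-element vector operand.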
2602 
2603   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2604 
2605   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2606     .addDef(DstReg)
2607     .addUse(PtrReg)
2608     .addUse(PackedVal)
2609     .setMemRefs(MI.memoperands());
2610 
2611   MI.eraseFromParent();
2612   return true;
2613 }
2614 
2615 bool AMDGPULegalizerInfo::legalizeFlog(
2616   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2617   Register Dst = MI.getOperand(0).getReg();
2618   Register Src = MI.getOperand(1).getReg();
2619   LLT Ty = B.getMRI()->getType(Dst);
2620   unsigned Flags = MI.getFlags();
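  // log_b(x) == log2(x) * log_b(2); the callers pass ln(2) for G_FLOG and
  // ln(2) / ln(10) (== log10(2)) for G_FLOG10 as Log2BaseInverted.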
2621 
2622   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2623   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2624 
2625   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2626   MI.eraseFromParent();
2627   return true;
2628 }
2629 
2630 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2631                                        MachineIRBuilder &B) const {
2632   Register Dst = MI.getOperand(0).getReg();
2633   Register Src = MI.getOperand(1).getReg();
2634   unsigned Flags = MI.getFlags();
2635   LLT Ty = B.getMRI()->getType(Dst);
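  // exp(x) == exp2(x * log2(e)).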
2636 
2637   auto K = B.buildFConstant(Ty, numbers::log2e);
2638   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2639   B.buildFExp2(Dst, Mul, Flags);
2640   MI.eraseFromParent();
2641   return true;
2642 }
2643 
2644 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2645                                        MachineIRBuilder &B) const {
2646   Register Dst = MI.getOperand(0).getReg();
2647   Register Src0 = MI.getOperand(1).getReg();
2648   Register Src1 = MI.getOperand(2).getReg();
2649   unsigned Flags = MI.getFlags();
2650   LLT Ty = B.getMRI()->getType(Dst);
2651   const LLT S16 = LLT::scalar(16);
2652   const LLT S32 = LLT::scalar(32);
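  // pow(x, y) is expanded as exp2(y * log2(x)), using the legacy multiply
  // (which treats 0 * x as 0) for the product.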
2653 
2654   if (Ty == S32) {
2655     auto Log = B.buildFLog2(S32, Src0, Flags);
2656     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2657       .addUse(Log.getReg(0))
2658       .addUse(Src1)
2659       .setMIFlags(Flags);
2660     B.buildFExp2(Dst, Mul, Flags);
2661   } else if (Ty == S16) {
2662     // There's no f16 fmul_legacy, so we need to convert for it.
2663     auto Log = B.buildFLog2(S16, Src0, Flags);
2664     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2665     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2666     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2667       .addUse(Ext0.getReg(0))
2668       .addUse(Ext1.getReg(0))
2669       .setMIFlags(Flags);
2670 
2671     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2672   } else
2673     return false;
2674 
2675   MI.eraseFromParent();
2676   return true;
2677 }
2678 
2679 // Find a source register, ignoring any possible source modifiers.
2680 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2681   Register ModSrc = OrigSrc;
2682   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2683     ModSrc = SrcFNeg->getOperand(1).getReg();
2684     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2685       ModSrc = SrcFAbs->getOperand(1).getReg();
2686   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2687     ModSrc = SrcFAbs->getOperand(1).getReg();
2688   return ModSrc;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2692                                          MachineRegisterInfo &MRI,
2693                                          MachineIRBuilder &B) const {
2694 
2695   const LLT S1 = LLT::scalar(1);
2696   const LLT S64 = LLT::scalar(64);
2697   Register Dst = MI.getOperand(0).getReg();
2698   Register OrigSrc = MI.getOperand(1).getReg();
2699   unsigned Flags = MI.getFlags();
2700   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2701          "this should not have been custom lowered");
2702 
2703   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2704   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2705   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2706   // V_FRACT bug is:
2707   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2708   //
2709   // Convert floor(x) to (x - fract(x))
2710 
2711   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2712     .addUse(OrigSrc)
2713     .setMIFlags(Flags);
2714 
2715   // Give source modifier matching some assistance before obscuring a foldable
2716   // pattern.
2717 
2718   // TODO: We can avoid the neg on the fract? The input sign to fract
2719   // shouldn't matter?
2720   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2721 
2722   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2723 
2724   Register Min = MRI.createGenericVirtualRegister(S64);
2725 
2726   // We don't need to concern ourselves with the snan handling difference, so
2727   // use the one which will directly select.
2728   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2729   if (MFI->getMode().IEEE)
2730     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2731   else
2732     B.buildFMinNum(Min, Fract, Const, Flags);
2733 
2734   Register CorrectedFract = Min;
2735   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2736     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2737     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2738   }
2739 
2740   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2741   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2742 
2743   MI.eraseFromParent();
2744   return true;
2745 }
2746 
2747 // Turn an illegal packed v2s16 build vector into bit operations.
2748 // TODO: This should probably be a bitcast action in LegalizerHelper.
2749 bool AMDGPULegalizerInfo::legalizeBuildVector(
2750   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2751   Register Dst = MI.getOperand(0).getReg();
2752   const LLT S32 = LLT::scalar(32);
2753   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2754 
2755   Register Src0 = MI.getOperand(1).getReg();
2756   Register Src1 = MI.getOperand(2).getReg();
2757   assert(MRI.getType(Src0) == LLT::scalar(16));
2758 
2759   auto Merge = B.buildMerge(S32, {Src0, Src1});
2760   B.buildBitcast(Dst, Merge);
2761 
2762   MI.eraseFromParent();
2763   return true;
2764 }
2765 
2766 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2767 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2768 // case with a single min instruction instead of a compare+select.
2769 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2770                                             MachineRegisterInfo &MRI,
2771                                             MachineIRBuilder &B) const {
2772   Register Dst = MI.getOperand(0).getReg();
2773   Register Src = MI.getOperand(1).getReg();
2774   LLT DstTy = MRI.getType(Dst);
2775   LLT SrcTy = MRI.getType(Src);
2776 
2777   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2778                         ? AMDGPU::G_AMDGPU_FFBH_U32
2779                         : AMDGPU::G_AMDGPU_FFBL_B32;
2780   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2781   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2782 
2783   MI.eraseFromParent();
2784   return true;
2785 }
2786 
2787 // Check that this is a G_XOR x, -1
2788 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2789   if (MI.getOpcode() != TargetOpcode::G_XOR)
2790     return false;
2791   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2792   return ConstVal && *ConstVal == -1;
2793 }
2794 
2795 // Return the use branch instruction, otherwise null if the usage is invalid.
2796 static MachineInstr *
2797 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2798                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2799   Register CondDef = MI.getOperand(0).getReg();
2800   if (!MRI.hasOneNonDBGUse(CondDef))
2801     return nullptr;
2802 
2803   MachineBasicBlock *Parent = MI.getParent();
2804   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2805 
2806   if (isNot(MRI, *UseMI)) {
2807     Register NegatedCond = UseMI->getOperand(0).getReg();
2808     if (!MRI.hasOneNonDBGUse(NegatedCond))
2809       return nullptr;
2810 
2811     // We're deleting the def of this value, so we need to remove it.
2812     eraseInstr(*UseMI, MRI);
2813 
2814     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2815     Negated = true;
2816   }
2817 
2818   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2819     return nullptr;
2820 
2821   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2822   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2823   if (Next == Parent->end()) {
2824     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2825     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2826       return nullptr;
2827     UncondBrTarget = &*NextMBB;
2828   } else {
2829     if (Next->getOpcode() != AMDGPU::G_BR)
2830       return nullptr;
2831     Br = &*Next;
2832     UncondBrTarget = Br->getOperand(0).getMBB();
2833   }
2834 
2835   return UseMI;
2836 }
2837 
2838 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2839                                          const ArgDescriptor *Arg,
2840                                          const TargetRegisterClass *ArgRC,
2841                                          LLT ArgTy) const {
2842   MCRegister SrcReg = Arg->getRegister();
2843   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2844   assert(DstReg.isVirtual() && "Virtual register expected");
2845 
2846   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2847                                              *ArgRC, ArgTy);
2848   if (Arg->isMasked()) {
2849     // TODO: Should we try to emit this once in the entry block?
2850     const LLT S32 = LLT::scalar(32);
2851     const unsigned Mask = Arg->getMask();
2852     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2853 
2854     Register AndMaskSrc = LiveIn;
2855 
2856     if (Shift != 0) {
2857       auto ShiftAmt = B.buildConstant(S32, Shift);
2858       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2859     }
2860 
2861     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2862   } else {
2863     B.buildCopy(DstReg, LiveIn);
2864   }
2865 
2866   return true;
2867 }
2868 
2869 bool AMDGPULegalizerInfo::loadInputValue(
2870     Register DstReg, MachineIRBuilder &B,
2871     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2872   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2873   const ArgDescriptor *Arg;
2874   const TargetRegisterClass *ArgRC;
2875   LLT ArgTy;
2876   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2877 
2878   if (!Arg) {
2879     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2880       // The intrinsic may appear when we have a zero-sized kernarg segment, in
2881       // which case the pointer argument may be missing and we use null.
2882       B.buildConstant(DstReg, 0);
2883       return true;
2884     }
2885 
2886     // It's undefined behavior if a function marked with the amdgpu-no-*
2887     // attributes uses the corresponding intrinsic.
2888     B.buildUndef(DstReg);
2889     return true;
2890   }
2891 
2892   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2893     return false; // TODO: Handle these
2894   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2895 }
2896 
2897 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2898     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2899     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2900   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2901     return false;
2902 
2903   MI.eraseFromParent();
2904   return true;
2905 }
2906 
2907 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2908                                        MachineRegisterInfo &MRI,
2909                                        MachineIRBuilder &B) const {
2910   Register Dst = MI.getOperand(0).getReg();
2911   LLT DstTy = MRI.getType(Dst);
2912   LLT S16 = LLT::scalar(16);
2913   LLT S32 = LLT::scalar(32);
2914   LLT S64 = LLT::scalar(64);
2915 
2916   if (DstTy == S16)
2917     return legalizeFDIV16(MI, MRI, B);
2918   if (DstTy == S32)
2919     return legalizeFDIV32(MI, MRI, B);
2920   if (DstTy == S64)
2921     return legalizeFDIV64(MI, MRI, B);
2922 
2923   return false;
2924 }
2925 
2926 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2927                                                         Register DstDivReg,
2928                                                         Register DstRemReg,
2929                                                         Register X,
2930                                                         Register Y) const {
2931   const LLT S1 = LLT::scalar(1);
2932   const LLT S32 = LLT::scalar(32);
2933 
2934   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2935   // algorithm used here.
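  //
  // Roughly, the sequence built below is:
  //   z = fptoui(0x4f7ffffe * rcp(uitofp(y)))   ; estimate just below 2^32 / y
  //   z += umulh(z, -y * z)                     ; one Newton-Raphson step
  //   q = umulh(x, z);  r = x - q * y           ; quotient/remainder estimate
  //   correct q and r at most twice while r >= y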
2936 
2937   // Initial estimate of inv(y).
2938   auto FloatY = B.buildUITOFP(S32, Y);
2939   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2940   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2941   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2942   auto Z = B.buildFPTOUI(S32, ScaledY);
2943 
2944   // One round of UNR (Newton-Raphson) refinement of the reciprocal estimate.
2945   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2946   auto NegYZ = B.buildMul(S32, NegY, Z);
2947   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2948 
2949   // Quotient/remainder estimate.
2950   auto Q = B.buildUMulH(S32, X, Z);
2951   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2952 
2953   // First quotient/remainder refinement.
2954   auto One = B.buildConstant(S32, 1);
2955   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2956   if (DstDivReg)
2957     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2958   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2959 
2960   // Second quotient/remainder refinement.
2961   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2962   if (DstDivReg)
2963     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2964 
2965   if (DstRemReg)
2966     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2967 }
2968 
2969 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2970 //
2971 // Return lo, hi of result
2972 //
2973 // %cvt.lo = G_UITOFP Val.lo
2974 // %cvt.hi = G_UITOFP Val.hi
2975 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2976 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2977 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2978 // %mul2 = G_FMUL %mul1, 2**(-32)
2979 // %trunc = G_INTRINSIC_TRUNC %mul2
2980 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2981 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
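//
// The float bit patterns used below are 0x4f800000 = 2**32, 0x2f800000 =
// 2**(-32), 0xcf800000 = -(2**32), and 0x5f7ffffc, which is just below 2**64.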
2982 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2983                                                        Register Val) {
2984   const LLT S32 = LLT::scalar(32);
2985   auto Unmerge = B.buildUnmerge(S32, Val);
2986 
2987   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2988   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2989 
2990   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2991                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2992 
2993   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2994   auto Mul1 =
2995       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2996 
2997   // 2**(-32)
2998   auto Mul2 =
2999       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3000   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3001 
3002   // -(2**32)
3003   auto Mad2 = B.buildFMAD(S32, Trunc,
3004                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3005 
3006   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3007   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3008 
3009   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3010 }
3011 
3012 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3013                                                         Register DstDivReg,
3014                                                         Register DstRemReg,
3015                                                         Register Numer,
3016                                                         Register Denom) const {
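  // Sketch: start from a fixed-point estimate of roughly 2**64 / Denom
  // (emitReciprocalU64), refine it with two mulhi/add rounds, compute the
  // quotient as umulh(Numer, Rcp) plus its remainder, then conditionally
  // correct both at most twice using selects instead of real control flow.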
3017   const LLT S32 = LLT::scalar(32);
3018   const LLT S64 = LLT::scalar(64);
3019   const LLT S1 = LLT::scalar(1);
3020   Register RcpLo, RcpHi;
3021 
3022   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3023 
3024   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3025 
3026   auto Zero64 = B.buildConstant(S64, 0);
3027   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3028 
3029   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3030   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3031 
3032   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3033   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3034   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3035 
3036   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3037   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3038   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3039 
3040   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3041   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3042   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3043   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3044   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3045 
3046   auto Zero32 = B.buildConstant(S32, 0);
3047   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3048   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3049   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3050 
3051   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3052   Register NumerLo = UnmergeNumer.getReg(0);
3053   Register NumerHi = UnmergeNumer.getReg(1);
3054 
3055   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3056   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3057   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3058   Register Mul3_Lo = UnmergeMul3.getReg(0);
3059   Register Mul3_Hi = UnmergeMul3.getReg(1);
3060   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3061   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3062   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3063   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3064 
3065   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3066   Register DenomLo = UnmergeDenom.getReg(0);
3067   Register DenomHi = UnmergeDenom.getReg(1);
3068 
3069   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3070   auto C1 = B.buildSExt(S32, CmpHi);
3071 
3072   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3073   auto C2 = B.buildSExt(S32, CmpLo);
3074 
3075   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3076   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3077 
3078   // TODO: The code here and below could be enclosed in if/endif blocks. At the
3079   // moment the control flow is unconditional, and the four selects after the
3080   // would-be endif stand in for PHIs.
3081 
3082   // if C3 != 0 ...
3083   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3084   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3085   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3086   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3087 
3088   auto One64 = B.buildConstant(S64, 1);
3089   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3090 
3091   auto C4 =
3092       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3093   auto C5 =
3094       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3095   auto C6 = B.buildSelect(
3096       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3097 
3098   // if (C6 != 0)
3099   auto Add4 = B.buildAdd(S64, Add3, One64);
3100   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3101 
3102   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3103   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3104   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3105 
3106   // endif C6
3107   // endif C3
3108 
3109   if (DstDivReg) {
3110     auto Sel1 = B.buildSelect(
3111         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3112     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3113                   Sel1, MulHi3);
3114   }
3115 
3116   if (DstRemReg) {
3117     auto Sel2 = B.buildSelect(
3118         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3119     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3120                   Sel2, Sub1);
3121   }
3122 }
3123 
3124 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3125                                                   MachineRegisterInfo &MRI,
3126                                                   MachineIRBuilder &B) const {
3127   Register DstDivReg, DstRemReg;
3128   switch (MI.getOpcode()) {
3129   default:
3130     llvm_unreachable("Unexpected opcode!");
3131   case AMDGPU::G_UDIV: {
3132     DstDivReg = MI.getOperand(0).getReg();
3133     break;
3134   }
3135   case AMDGPU::G_UREM: {
3136     DstRemReg = MI.getOperand(0).getReg();
3137     break;
3138   }
3139   case AMDGPU::G_UDIVREM: {
3140     DstDivReg = MI.getOperand(0).getReg();
3141     DstRemReg = MI.getOperand(1).getReg();
3142     break;
3143   }
3144   }
3145 
3146   const LLT S64 = LLT::scalar(64);
3147   const LLT S32 = LLT::scalar(32);
3148   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3149   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3150   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3151   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3152 
3153   if (Ty == S32)
3154     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3155   else if (Ty == S64)
3156     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3157   else
3158     return false;
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3165                                                 MachineRegisterInfo &MRI,
3166                                                 MachineIRBuilder &B) const {
3167   const LLT S64 = LLT::scalar(64);
3168   const LLT S32 = LLT::scalar(32);
3169 
3170   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3171   if (Ty != S32 && Ty != S64)
3172     return false;
3173 
3174   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3175   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3176   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3177 
3178   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3179   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3180   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3181 
3182   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3183   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3184 
3185   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3186   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3187 
3188   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3189   switch (MI.getOpcode()) {
3190   default:
3191     llvm_unreachable("Unexpected opcode!");
3192   case AMDGPU::G_SDIV: {
3193     DstDivReg = MI.getOperand(0).getReg();
3194     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3195     break;
3196   }
3197   case AMDGPU::G_SREM: {
3198     DstRemReg = MI.getOperand(0).getReg();
3199     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3200     break;
3201   }
3202   case AMDGPU::G_SDIVREM: {
3203     DstDivReg = MI.getOperand(0).getReg();
3204     DstRemReg = MI.getOperand(1).getReg();
3205     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3206     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3207     break;
3208   }
3209   }
3210 
3211   if (Ty == S32)
3212     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3213   else
3214     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3215 
3216   if (DstDivReg) {
3217     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3218     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3219     B.buildSub(DstDivReg, SignXor, Sign);
3220   }
3221 
3222   if (DstRemReg) {
3223     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3224     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3225     B.buildSub(DstRemReg, SignXor, Sign);
3226   }
3227 
3228   MI.eraseFromParent();
3229   return true;
3230 }
3231 
3232 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3233                                                  MachineRegisterInfo &MRI,
3234                                                  MachineIRBuilder &B) const {
3235   Register Res = MI.getOperand(0).getReg();
3236   Register LHS = MI.getOperand(1).getReg();
3237   Register RHS = MI.getOperand(2).getReg();
3238   uint16_t Flags = MI.getFlags();
3239   LLT ResTy = MRI.getType(Res);
3240 
3241   const MachineFunction &MF = B.getMF();
3242   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3243                             MI.getFlag(MachineInstr::FmAfn);
3244 
3245   if (!AllowInaccurateRcp)
3246     return false;
3247 
3248   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3249     // 1 / x -> RCP(x)
3250     if (CLHS->isExactlyValue(1.0)) {
3251       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3252         .addUse(RHS)
3253         .setMIFlags(Flags);
3254 
3255       MI.eraseFromParent();
3256       return true;
3257     }
3258 
3259     // -1 / x -> RCP( FNEG(x) )
3260     if (CLHS->isExactlyValue(-1.0)) {
3261       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3262       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3263         .addUse(FNeg.getReg(0))
3264         .setMIFlags(Flags);
3265 
3266       MI.eraseFromParent();
3267       return true;
3268     }
3269   }
3270 
3271   // x / y -> x * (1.0 / y)
3272   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3273     .addUse(RHS)
3274     .setMIFlags(Flags);
3275   B.buildFMul(Res, LHS, RCP, Flags);
3276 
3277   MI.eraseFromParent();
3278   return true;
3279 }
3280 
3281 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3282                                                    MachineRegisterInfo &MRI,
3283                                                    MachineIRBuilder &B) const {
3284   Register Res = MI.getOperand(0).getReg();
3285   Register X = MI.getOperand(1).getReg();
3286   Register Y = MI.getOperand(2).getReg();
3287   uint16_t Flags = MI.getFlags();
3288   LLT ResTy = MRI.getType(Res);
3289 
3290   const MachineFunction &MF = B.getMF();
3291   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3292                             MI.getFlag(MachineInstr::FmAfn);
3293 
3294   if (!AllowInaccurateRcp)
3295     return false;
3296 
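  // A sketch of the refinement below, with r ~= 1/y from the rcp intrinsic:
  //   r <- fma(fma(-y, r, 1), r, r)    (applied twice)
  //   q <- x * r
  //   result <- fma(fma(-y, q, x), r, q)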
3297   auto NegY = B.buildFNeg(ResTy, Y);
3298   auto One = B.buildFConstant(ResTy, 1.0);
3299 
3300   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3301     .addUse(Y)
3302     .setMIFlags(Flags);
3303 
3304   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3305   R = B.buildFMA(ResTy, Tmp0, R, R);
3306 
3307   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3308   R = B.buildFMA(ResTy, Tmp1, R, R);
3309 
3310   auto Ret = B.buildFMul(ResTy, X, R);
3311   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3312 
3313   B.buildFMA(Res, Tmp2, R, Ret);
3314   MI.eraseFromParent();
3315   return true;
3316 }
3317 
3318 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3319                                          MachineRegisterInfo &MRI,
3320                                          MachineIRBuilder &B) const {
3321   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3322     return true;
3323 
3324   Register Res = MI.getOperand(0).getReg();
3325   Register LHS = MI.getOperand(1).getReg();
3326   Register RHS = MI.getOperand(2).getReg();
3327 
3328   uint16_t Flags = MI.getFlags();
3329 
3330   LLT S16 = LLT::scalar(16);
3331   LLT S32 = LLT::scalar(32);
3332 
3333   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3334   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3335 
3336   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3337     .addUse(RHSExt.getReg(0))
3338     .setMIFlags(Flags);
3339 
3340   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3341   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3342 
3343   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3344     .addUse(RDst.getReg(0))
3345     .addUse(RHS)
3346     .addUse(LHS)
3347     .setMIFlags(Flags);
3348 
3349   MI.eraseFromParent();
3350   return true;
3351 }
3352 
3353 // Toggle the FP32 denormal mode: when 'Enable' is true, switch FP32 denormals
3354 // on; when false, restore the function's default FP32 denormal mode.
3355 static void toggleSPDenormMode(bool Enable,
3356                                MachineIRBuilder &B,
3357                                const GCNSubtarget &ST,
3358                                AMDGPU::SIModeRegisterDefaults Mode) {
3359   // Set SP denorm mode to this value.
3360   unsigned SPDenormMode =
3361     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3362 
3363   if (ST.hasDenormModeInst()) {
3364     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3365     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3366 
3367     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3368     B.buildInstr(AMDGPU::S_DENORM_MODE)
3369       .addImm(NewDenormModeValue);
3370 
3371   } else {
3372     // Select FP32 bit field in mode register.
3373     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3374                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3375                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
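    // This encodes hwreg(HW_REG_MODE, 4, 2): the two FP32 denormal control
    // bits sit at offset 4 of the MODE register (width is stored as WIDTH_M1).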
3376 
3377     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3378       .addImm(SPDenormMode)
3379       .addImm(SPDenormModeBitField);
3380   }
3381 }
3382 
3383 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3384                                          MachineRegisterInfo &MRI,
3385                                          MachineIRBuilder &B) const {
3386   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3387     return true;
3388 
3389   Register Res = MI.getOperand(0).getReg();
3390   Register LHS = MI.getOperand(1).getReg();
3391   Register RHS = MI.getOperand(2).getReg();
3392   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3393   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3394 
3395   uint16_t Flags = MI.getFlags();
3396 
3397   LLT S32 = LLT::scalar(32);
3398   LLT S1 = LLT::scalar(1);
3399 
3400   auto One = B.buildFConstant(S32, 1.0f);
3401 
3402   auto DenominatorScaled =
3403     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3404       .addUse(LHS)
3405       .addUse(RHS)
3406       .addImm(0)
3407       .setMIFlags(Flags);
3408   auto NumeratorScaled =
3409     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3410       .addUse(LHS)
3411       .addUse(RHS)
3412       .addImm(1)
3413       .setMIFlags(Flags);
3414 
3415   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3416     .addUse(DenominatorScaled.getReg(0))
3417     .setMIFlags(Flags);
3418   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3419 
3420   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3421   // aren't modeled as reading it.
3422   if (!Mode.allFP32Denormals())
3423     toggleSPDenormMode(true, B, ST, Mode);
3424 
3425   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3426   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3427   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3428   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3429   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3430   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3431 
3432   if (!Mode.allFP32Denormals())
3433     toggleSPDenormMode(false, B, ST, Mode);
3434 
3435   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3436     .addUse(Fma4.getReg(0))
3437     .addUse(Fma1.getReg(0))
3438     .addUse(Fma3.getReg(0))
3439     .addUse(NumeratorScaled.getReg(1))
3440     .setMIFlags(Flags);
3441 
3442   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3443     .addUse(Fmas.getReg(0))
3444     .addUse(RHS)
3445     .addUse(LHS)
3446     .setMIFlags(Flags);
3447 
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3453                                          MachineRegisterInfo &MRI,
3454                                          MachineIRBuilder &B) const {
3455   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3456     return true;
3457 
3458   Register Res = MI.getOperand(0).getReg();
3459   Register LHS = MI.getOperand(1).getReg();
3460   Register RHS = MI.getOperand(2).getReg();
3461 
3462   uint16_t Flags = MI.getFlags();
3463 
3464   LLT S64 = LLT::scalar(64);
3465   LLT S1 = LLT::scalar(1);
3466 
3467   auto One = B.buildFConstant(S64, 1.0);
3468 
3469   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3470     .addUse(LHS)
3471     .addUse(RHS)
3472     .addImm(0)
3473     .setMIFlags(Flags);
3474 
3475   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3476 
3477   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3478     .addUse(DivScale0.getReg(0))
3479     .setMIFlags(Flags);
3480 
3481   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3482   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3483   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3484 
3485   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3486     .addUse(LHS)
3487     .addUse(RHS)
3488     .addImm(1)
3489     .setMIFlags(Flags);
3490 
3491   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3492   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3493   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3494 
3495   Register Scale;
3496   if (!ST.hasUsableDivScaleConditionOutput()) {
3497     // Work around a hardware bug on SI where the condition output from
3498     // div_scale is not usable.
3499 
3500     LLT S32 = LLT::scalar(32);
3501 
3502     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3503     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3504     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3505     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3506 
3507     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3508                               Scale1Unmerge.getReg(1));
3509     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3510                               Scale0Unmerge.getReg(1));
3511     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3512   } else {
3513     Scale = DivScale1.getReg(1);
3514   }
3515 
3516   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3517     .addUse(Fma4.getReg(0))
3518     .addUse(Fma3.getReg(0))
3519     .addUse(Mul.getReg(0))
3520     .addUse(Scale)
3521     .setMIFlags(Flags);
3522 
3523   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3524     .addUse(Fmas.getReg(0))
3525     .addUse(RHS)
3526     .addUse(LHS)
3527     .setMIFlags(Flags);
3528 
3529   MI.eraseFromParent();
3530   return true;
3531 }
3532 
3533 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3534                                                  MachineRegisterInfo &MRI,
3535                                                  MachineIRBuilder &B) const {
3536   Register Res = MI.getOperand(0).getReg();
3537   Register LHS = MI.getOperand(2).getReg();
3538   Register RHS = MI.getOperand(3).getReg();
3539   uint16_t Flags = MI.getFlags();
3540 
3541   LLT S32 = LLT::scalar(32);
3542   LLT S1 = LLT::scalar(1);
3543 
3544   auto Abs = B.buildFAbs(S32, RHS, Flags);
3545   const APFloat C0Val(1.0f);
3546 
3547   auto C0 = B.buildConstant(S32, 0x6f800000);
3548   auto C1 = B.buildConstant(S32, 0x2f800000);
3549   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
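  // 0x6f800000 is 2**96 and 0x2f800000 is 2**(-32) as float bit patterns: if
  // |rhs| exceeds 2**96, pre-scale it by 2**(-32) so the rcp stays in range,
  // then apply the same scale to the final product to compensate.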
3550 
3551   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3552   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3553 
3554   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3555 
3556   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3557     .addUse(Mul0.getReg(0))
3558     .setMIFlags(Flags);
3559 
3560   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3561 
3562   B.buildFMul(Res, Sel, Mul1, Flags);
3563 
3564   MI.eraseFromParent();
3565   return true;
3566 }
3567 
3568 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3569 // FIXME: Why do we handle this one but not other removed instructions?
3570 //
3571 // Reciprocal square root.  The clamp prevents infinite results, clamping
3572 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3573 // +-max_float.
3574 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3575                                                     MachineRegisterInfo &MRI,
3576                                                     MachineIRBuilder &B) const {
3577   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3578     return true;
3579 
3580   Register Dst = MI.getOperand(0).getReg();
3581   Register Src = MI.getOperand(2).getReg();
3582   auto Flags = MI.getFlags();
3583 
3584   LLT Ty = MRI.getType(Dst);
3585 
3586   const fltSemantics *FltSemantics;
3587   if (Ty == LLT::scalar(32))
3588     FltSemantics = &APFloat::IEEEsingle();
3589   else if (Ty == LLT::scalar(64))
3590     FltSemantics = &APFloat::IEEEdouble();
3591   else
3592     return false;
3593 
3594   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3595     .addUse(Src)
3596     .setMIFlags(Flags);
3597 
3598   // We don't need to concern ourselves with the snan handling difference: the
3599   // rsq already quieted (or not) any nan, so use whichever form selects directly.
3600   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3601   const bool UseIEEE = MFI->getMode().IEEE;
3602 
3603   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3604   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3605                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3606 
3607   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3608 
3609   if (UseIEEE)
3610     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3611   else
3612     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3613   MI.eraseFromParent();
3614   return true;
3615 }
3616 
3617 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3618   switch (IID) {
3619   case Intrinsic::amdgcn_ds_fadd:
3620     return AMDGPU::G_ATOMICRMW_FADD;
3621   case Intrinsic::amdgcn_ds_fmin:
3622     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3623   case Intrinsic::amdgcn_ds_fmax:
3624     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3625   default:
3626     llvm_unreachable("not a DS FP intrinsic");
3627   }
3628 }
3629 
3630 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3631                                                       MachineInstr &MI,
3632                                                       Intrinsic::ID IID) const {
3633   GISelChangeObserver &Observer = Helper.Observer;
3634   Observer.changingInstr(MI);
3635 
3636   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3637 
3638   // The remaining operands were used to set fields in the MemOperand on
3639   // construction.
3640   for (int I = 6; I > 3; --I)
3641     MI.RemoveOperand(I);
3642 
3643   MI.RemoveOperand(1); // Remove the intrinsic ID.
3644   Observer.changedInstr(MI);
3645   return true;
3646 }
3647 
3648 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3649                                             MachineRegisterInfo &MRI,
3650                                             MachineIRBuilder &B) const {
3651   uint64_t Offset =
3652     ST.getTargetLowering()->getImplicitParameterOffset(
3653       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3654   LLT DstTy = MRI.getType(DstReg);
3655   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3656 
3657   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3658   if (!loadInputValue(KernargPtrReg, B,
3659                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3660     return false;
3661 
3662   // FIXME: This should be nuw
3663   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3664   return true;
3665 }
3666 
3667 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3668                                                  MachineRegisterInfo &MRI,
3669                                                  MachineIRBuilder &B) const {
3670   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3671   if (!MFI->isEntryFunction()) {
3672     return legalizePreloadedArgIntrin(MI, MRI, B,
3673                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3674   }
3675 
3676   Register DstReg = MI.getOperand(0).getReg();
3677   if (!getImplicitArgPtr(DstReg, MRI, B))
3678     return false;
3679 
3680   MI.eraseFromParent();
3681   return true;
3682 }
3683 
3684 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3685                                               MachineRegisterInfo &MRI,
3686                                               MachineIRBuilder &B,
3687                                               unsigned AddrSpace) const {
3688   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3689   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3690   Register Hi32 = Unmerge.getReg(1);
3691 
3692   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3693   MI.eraseFromParent();
3694   return true;
3695 }
3696 
3697 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3698 // offset (the offset that is included in bounds checking and swizzling, to be
3699 // split between the instruction's voffset and immoffset fields) and soffset
3700 // (the offset that is excluded from bounds checking and swizzling, to go in
3701 // the instruction's soffset field).  This function takes the first kind of
3702 // offset and figures out how to split it between voffset and immoffset.
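//
// For example, a constant offset of 8292 is split into ImmOffset = 100 plus a
// voffset add of 8192, since 8292 does not fit in the 12-bit immediate field.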
3703 std::pair<Register, unsigned>
3704 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3705                                         Register OrigOffset) const {
3706   const unsigned MaxImm = 4095;
3707   Register BaseReg;
3708   unsigned ImmOffset;
3709   const LLT S32 = LLT::scalar(32);
3710   MachineRegisterInfo &MRI = *B.getMRI();
3711 
3712   std::tie(BaseReg, ImmOffset) =
3713       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3714 
3715   // If BaseReg is a pointer, convert it to int.
3716   if (MRI.getType(BaseReg).isPointer())
3717     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3718 
3719   // If the immediate value is too big for the immoffset field, keep only its
3720   // low 12 bits (value & 4095) there, so that the value copied/added for the
3721   // voffset field is a multiple of 4096 and stands a better chance of being
3722   // CSEd with the copy/add for another similar load/store.
3723   // However, do not do that rounding down to a multiple of 4096 if that is a
3724   // negative number, as it appears to be illegal to have a negative offset
3725   // in the vgpr, even if adding the immediate offset makes it positive.
3726   unsigned Overflow = ImmOffset & ~MaxImm;
3727   ImmOffset -= Overflow;
3728   if ((int32_t)Overflow < 0) {
3729     Overflow += ImmOffset;
3730     ImmOffset = 0;
3731   }
3732 
3733   if (Overflow != 0) {
3734     if (!BaseReg) {
3735       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3736     } else {
3737       auto OverflowVal = B.buildConstant(S32, Overflow);
3738       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3739     }
3740   }
3741 
3742   if (!BaseReg)
3743     BaseReg = B.buildConstant(S32, 0).getReg(0);
3744 
3745   return std::make_pair(BaseReg, ImmOffset);
3746 }
3747 
3748 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3749 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3750                                           Register VOffset, Register SOffset,
3751                                           unsigned ImmOffset, Register VIndex,
3752                                           MachineRegisterInfo &MRI) const {
3753   Optional<ValueAndVReg> MaybeVOffsetVal =
3754       getIConstantVRegValWithLookThrough(VOffset, MRI);
3755   Optional<ValueAndVReg> MaybeSOffsetVal =
3756       getIConstantVRegValWithLookThrough(SOffset, MRI);
3757   Optional<ValueAndVReg> MaybeVIndexVal =
3758       getIConstantVRegValWithLookThrough(VIndex, MRI);
3759   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3760   // update the MMO with that offset. The stride is unknown so we can only do
3761   // this if VIndex is constant 0.
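  // For example, constant voffset = 16, soffset = 0, ImmOffset = 4 and
  // VIndex = 0 set the MMO offset to 20.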
3762   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3763       MaybeVIndexVal->Value == 0) {
3764     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3765                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3766     MMO->setOffset(TotalOffset);
3767   } else {
3768     // We don't have a constant combined offset to use in the MMO. Give up.
3769     MMO->setValue((Value *)nullptr);
3770   }
3771 }
3772 
3773 /// Handle register layout difference for f16 images for some subtargets.
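///
/// For example, with unpacked d16 VMEM a <4 x s16> value is widened to
/// <4 x s32>, any-extending each element into its own 32-bit register, while
/// packed subtargets keep the 16-bit vector layout (padding <3 x s16> out to
/// <4 x s16>).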
3774 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3775                                              MachineRegisterInfo &MRI,
3776                                              Register Reg,
3777                                              bool ImageStore) const {
3778   const LLT S16 = LLT::scalar(16);
3779   const LLT S32 = LLT::scalar(32);
3780   LLT StoreVT = MRI.getType(Reg);
3781   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3782 
3783   if (ST.hasUnpackedD16VMem()) {
3784     auto Unmerge = B.buildUnmerge(S16, Reg);
3785 
3786     SmallVector<Register, 4> WideRegs;
3787     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3788       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3789 
3790     int NumElts = StoreVT.getNumElements();
3791 
3792     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3793         .getReg(0);
3794   }
3795 
3796   if (ImageStore && ST.hasImageStoreD16Bug()) {
3797     if (StoreVT.getNumElements() == 2) {
3798       SmallVector<Register, 4> PackedRegs;
3799       Reg = B.buildBitcast(S32, Reg).getReg(0);
3800       PackedRegs.push_back(Reg);
3801       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3802       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3803           .getReg(0);
3804     }
3805 
3806     if (StoreVT.getNumElements() == 3) {
3807       SmallVector<Register, 4> PackedRegs;
3808       auto Unmerge = B.buildUnmerge(S16, Reg);
3809       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3810         PackedRegs.push_back(Unmerge.getReg(I));
3811       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3812       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3813       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3814     }
3815 
3816     if (StoreVT.getNumElements() == 4) {
3817       SmallVector<Register, 4> PackedRegs;
3818       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3819       auto Unmerge = B.buildUnmerge(S32, Reg);
3820       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3821         PackedRegs.push_back(Unmerge.getReg(I));
3822       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3823       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3824           .getReg(0);
3825     }
3826 
3827     llvm_unreachable("invalid data type");
3828   }
3829 
3830   if (StoreVT == LLT::fixed_vector(3, S16)) {
3831     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3832               .getReg(0);
3833   }
3834   return Reg;
3835 }
3836 
3837 Register AMDGPULegalizerInfo::fixStoreSourceType(
3838   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3839   MachineRegisterInfo *MRI = B.getMRI();
3840   LLT Ty = MRI->getType(VData);
3841 
3842   const LLT S16 = LLT::scalar(16);
3843 
3844   // Fixup illegal register types for i8 stores.
3845   if (Ty == LLT::scalar(8) || Ty == S16) {
3846     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3847     return AnyExt;
3848   }
3849 
3850   if (Ty.isVector()) {
3851     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3852       if (IsFormat)
3853         return handleD16VData(B, *MRI, VData);
3854     }
3855   }
3856 
3857   return VData;
3858 }
3859 
3860 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3861                                               MachineRegisterInfo &MRI,
3862                                               MachineIRBuilder &B,
3863                                               bool IsTyped,
3864                                               bool IsFormat) const {
3865   Register VData = MI.getOperand(1).getReg();
3866   LLT Ty = MRI.getType(VData);
3867   LLT EltTy = Ty.getScalarType();
3868   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3869   const LLT S32 = LLT::scalar(32);
3870 
3871   VData = fixStoreSourceType(B, VData, IsFormat);
3872   Register RSrc = MI.getOperand(2).getReg();
3873 
3874   MachineMemOperand *MMO = *MI.memoperands_begin();
3875   const int MemSize = MMO->getSize();
3876 
3877   unsigned ImmOffset;
3878 
3879   // The typed intrinsics add an immediate after the registers.
3880   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3881 
3882   // The struct intrinsic variants add one additional operand over raw.
3883   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3884   Register VIndex;
3885   int OpOffset = 0;
3886   if (HasVIndex) {
3887     VIndex = MI.getOperand(3).getReg();
3888     OpOffset = 1;
3889   } else {
3890     VIndex = B.buildConstant(S32, 0).getReg(0);
3891   }
3892 
3893   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3894   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3895 
3896   unsigned Format = 0;
3897   if (IsTyped) {
3898     Format = MI.getOperand(5 + OpOffset).getImm();
3899     ++OpOffset;
3900   }
3901 
3902   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3903 
3904   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3905   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3906 
3907   unsigned Opc;
3908   if (IsTyped) {
3909     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3910                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3911   } else if (IsFormat) {
3912     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3913                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3914   } else {
3915     switch (MemSize) {
3916     case 1:
3917       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3918       break;
3919     case 2:
3920       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3921       break;
3922     default:
3923       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3924       break;
3925     }
3926   }
3927 
3928   auto MIB = B.buildInstr(Opc)
3929     .addUse(VData)              // vdata
3930     .addUse(RSrc)               // rsrc
3931     .addUse(VIndex)             // vindex
3932     .addUse(VOffset)            // voffset
3933     .addUse(SOffset)            // soffset
3934     .addImm(ImmOffset);         // offset(imm)
3935 
3936   if (IsTyped)
3937     MIB.addImm(Format);
3938 
3939   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3940      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3941      .addMemOperand(MMO);
3942 
3943   MI.eraseFromParent();
3944   return true;
3945 }
3946 
3947 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3948                                              MachineRegisterInfo &MRI,
3949                                              MachineIRBuilder &B,
3950                                              bool IsFormat,
3951                                              bool IsTyped) const {
3952   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3953   MachineMemOperand *MMO = *MI.memoperands_begin();
3954   const LLT MemTy = MMO->getMemoryType();
3955   const LLT S32 = LLT::scalar(32);
3956 
3957   Register Dst = MI.getOperand(0).getReg();
3958   Register RSrc = MI.getOperand(2).getReg();
3959 
3960   // The typed intrinsics add an immediate after the registers.
3961   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3962 
3963   // The struct intrinsic variants add one additional operand over raw.
3964   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3965   Register VIndex;
3966   int OpOffset = 0;
3967   if (HasVIndex) {
3968     VIndex = MI.getOperand(3).getReg();
3969     OpOffset = 1;
3970   } else {
3971     VIndex = B.buildConstant(S32, 0).getReg(0);
3972   }
3973 
3974   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3975   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3976 
3977   unsigned Format = 0;
3978   if (IsTyped) {
3979     Format = MI.getOperand(5 + OpOffset).getImm();
3980     ++OpOffset;
3981   }
3982 
3983   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3984   unsigned ImmOffset;
3985 
3986   LLT Ty = MRI.getType(Dst);
3987   LLT EltTy = Ty.getScalarType();
3988   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3989   const bool Unpacked = ST.hasUnpackedD16VMem();
3990 
3991   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3992   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3993 
3994   unsigned Opc;
3995 
3996   if (IsTyped) {
3997     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3998                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3999   } else if (IsFormat) {
4000     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4001                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4002   } else {
4003     switch (MemTy.getSizeInBits()) {
4004     case 8:
4005       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4006       break;
4007     case 16:
4008       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4009       break;
4010     default:
4011       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4012       break;
4013     }
4014   }
4015 
4016   Register LoadDstReg;
4017 
4018   bool IsExtLoad =
4019       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4020   LLT UnpackedTy = Ty.changeElementSize(32);
4021 
4022   if (IsExtLoad)
4023     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4024   else if (Unpacked && IsD16 && Ty.isVector())
4025     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4026   else
4027     LoadDstReg = Dst;
4028 
4029   auto MIB = B.buildInstr(Opc)
4030     .addDef(LoadDstReg)         // vdata
4031     .addUse(RSrc)               // rsrc
4032     .addUse(VIndex)             // vindex
4033     .addUse(VOffset)            // voffset
4034     .addUse(SOffset)            // soffset
4035     .addImm(ImmOffset);         // offset(imm)
4036 
4037   if (IsTyped)
4038     MIB.addImm(Format);
4039 
4040   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4041      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4042      .addMemOperand(MMO);
4043 
4044   if (LoadDstReg != Dst) {
4045     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4046 
4047     // The result was widened for an extending load; truncate back to Dst.
4048     if (IsExtLoad)
4049       B.buildTrunc(Dst, LoadDstReg);
4050     else {
4051       // Repack to original 16-bit vector result
4052       // FIXME: G_TRUNC should work, but legalization currently fails
4053       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4054       SmallVector<Register, 4> Repack;
4055       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4056         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4057       B.buildMerge(Dst, Repack);
4058     }
4059   }
4060 
4061   MI.eraseFromParent();
4062   return true;
4063 }
4064 
4065 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4066                                                MachineIRBuilder &B,
4067                                                bool IsInc) const {
4068   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4069                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4070   B.buildInstr(Opc)
4071     .addDef(MI.getOperand(0).getReg())
4072     .addUse(MI.getOperand(2).getReg())
4073     .addUse(MI.getOperand(3).getReg())
4074     .cloneMemRefs(MI);
4075   MI.eraseFromParent();
4076   return true;
4077 }
4078 
4079 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4080   switch (IntrID) {
4081   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4082   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4083     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4084   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4085   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4086     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4087   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4088   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4089     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4090   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4091   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4092     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4093   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4094   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4095     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4096   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4097   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4098     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4099   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4100   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4101     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4102   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4103   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4104     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4105   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4106   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4107     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4108   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4109   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4110     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4111   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4112   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4113     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4114   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4115   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4116     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4117   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4118   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4119     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4120   case Intrinsic::amdgcn_buffer_atomic_fadd:
4121   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4122   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4123     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4124   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4125   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4126     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4127   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4128   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4130   default:
4131     llvm_unreachable("unhandled atomic opcode");
4132   }
4133 }
4134 
4135 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4136                                                MachineIRBuilder &B,
4137                                                Intrinsic::ID IID) const {
4138   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4139                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4140   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4141 
4142   Register Dst;
4143 
4144   int OpOffset = 0;
4145   if (HasReturn) {
4146     // A few FP atomics do not support return values.
4147     Dst = MI.getOperand(0).getReg();
4148   } else {
4149     OpOffset = -1;
4150   }
4151 
4152   Register VData = MI.getOperand(2 + OpOffset).getReg();
4153   Register CmpVal;
4154 
4155   if (IsCmpSwap) {
4156     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4157     ++OpOffset;
4158   }
4159 
4160   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4161   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4162 
4163   // The struct intrinsic variants add one additional operand over raw.
4164   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4165   Register VIndex;
4166   if (HasVIndex) {
4167     VIndex = MI.getOperand(4 + OpOffset).getReg();
4168     ++OpOffset;
4169   } else {
4170     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4171   }
4172 
4173   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4174   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4175   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4176 
4177   MachineMemOperand *MMO = *MI.memoperands_begin();
4178 
4179   unsigned ImmOffset;
4180   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4181   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4182 
4183   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4184 
4185   if (HasReturn)
4186     MIB.addDef(Dst);
4187 
4188   MIB.addUse(VData); // vdata
4189 
4190   if (IsCmpSwap)
4191     MIB.addReg(CmpVal);
4192 
4193   MIB.addUse(RSrc)               // rsrc
4194      .addUse(VIndex)             // vindex
4195      .addUse(VOffset)            // voffset
4196      .addUse(SOffset)            // soffset
4197      .addImm(ImmOffset)          // offset(imm)
4198      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4199      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4200      .addMemOperand(MMO);
4201 
4202   MI.eraseFromParent();
4203   return true;
4204 }
4205 
4206 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4207 /// vector with s16 typed elements.
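///
/// For example, two s16 coordinates (u, v) are combined into one <2 x s16>
/// register, and a trailing unpaired coordinate is packed with an undef s16.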
4208 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4209                                       SmallVectorImpl<Register> &PackedAddrs,
4210                                       unsigned ArgOffset,
4211                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4212                                       bool IsA16, bool IsG16) {
4213   const LLT S16 = LLT::scalar(16);
4214   const LLT V2S16 = LLT::fixed_vector(2, 16);
4215   auto EndIdx = Intr->VAddrEnd;
4216 
4217   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4218     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4219     if (!SrcOp.isReg())
4220       continue; // _L to _LZ may have eliminated this.
4221 
4222     Register AddrReg = SrcOp.getReg();
4223 
4224     if ((I < Intr->GradientStart) ||
4225         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4226         (I >= Intr->CoordStart && !IsA16)) {
4227       // Handle any gradient or coordinate operands that should not be packed
4228       if ((I < Intr->GradientStart) && IsA16 &&
4229           (B.getMRI()->getType(AddrReg) == S16)) {
4230         // Special handling of bias when A16 is on. Bias is of type half but
4231         // occupies a full 32 bits.
4232         PackedAddrs.push_back(
4233             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4234                 .getReg(0));
4235       } else {
4236         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4237         PackedAddrs.push_back(AddrReg);
4238       }
4239     } else {
4240       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4241       // derivatives dx/dh and dx/dv are packed with undef.
4242       if (((I + 1) >= EndIdx) ||
4243           ((Intr->NumGradients / 2) % 2 == 1 &&
4244            (I == static_cast<unsigned>(Intr->GradientStart +
4245                                        (Intr->NumGradients / 2) - 1) ||
4246             I == static_cast<unsigned>(Intr->GradientStart +
4247                                        Intr->NumGradients - 1))) ||
4248           // Check for _L to _LZ optimization
4249           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4250         PackedAddrs.push_back(
4251             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4252                 .getReg(0));
4253       } else {
4254         PackedAddrs.push_back(
4255             B.buildBuildVector(
4256                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4257                 .getReg(0));
4258         ++I;
4259       }
4260     }
4261   }
4262 }
4263 
4264 /// Convert from separate vaddr components to a single vector address register,
4265 /// and replace the remaining operands with $noreg.
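///
/// For example (illustrative): three s32 vaddr operands become a single
/// <3 x s32> build_vector placed in the first vaddr slot, and the remaining
/// two operands are replaced with $noreg.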
4266 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4267                                      int DimIdx, int NumVAddrs) {
4268   const LLT S32 = LLT::scalar(32);
4269 
4270   SmallVector<Register, 8> AddrRegs;
4271   for (int I = 0; I != NumVAddrs; ++I) {
4272     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4273     if (SrcOp.isReg()) {
4274       AddrRegs.push_back(SrcOp.getReg());
4275       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4276     }
4277   }
4278 
4279   int NumAddrRegs = AddrRegs.size();
4280   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4282     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4283       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4284       auto Undef = B.buildUndef(S32);
4285       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4286       NumAddrRegs = RoundedNumRegs;
4287     }
4288 
4289     auto VAddr =
4290         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4291     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4292   }
4293 
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      SrcOp.setReg(AMDGPU::NoRegister);
  }
4299 }
4300 
4301 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4302 ///
4303 /// Depending on the subtarget, load/store with 16-bit element data need to be
4304 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4305 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4306 /// registers.
4307 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
4311 /// defining a multitude of intermediate image instructions, directly hack on
4312 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4313 /// padding now unnecessary arguments with $noreg.
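///
/// For example (illustrative): on a subtarget with unpacked d16 VMEM, a
/// <4 x s16> d16 load result is produced as four s32 components, which are
/// then truncated back to s16 and rebuilt into the original vector type.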
4314 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4315     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4316     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4317 
4318   const unsigned NumDefs = MI.getNumExplicitDefs();
4319   const unsigned ArgOffset = NumDefs + 1;
4320   bool IsTFE = NumDefs == 2;
4321   // We are only processing the operands of d16 image operations on subtargets
4322   // that use the unpacked register layout, or need to repack the TFE result.
4323 
4324   // TODO: Do we need to guard against already legalized intrinsics?
4325   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4326       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4327 
4328   MachineRegisterInfo *MRI = B.getMRI();
4329   const LLT S32 = LLT::scalar(32);
4330   const LLT S16 = LLT::scalar(16);
4331   const LLT V2S16 = LLT::fixed_vector(2, 16);
4332 
4333   unsigned DMask = 0;
4334 
4335   // Check for 16 bit addresses and pack if true.
4336   LLT GradTy =
4337       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4338   LLT AddrTy =
4339       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4340   const bool IsG16 = GradTy == S16;
4341   const bool IsA16 = AddrTy == S16;
4342 
4343   int DMaskLanes = 0;
4344   if (!BaseOpcode->Atomic) {
4345     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4346     if (BaseOpcode->Gather4) {
4347       DMaskLanes = 4;
4348     } else if (DMask != 0) {
4349       DMaskLanes = countPopulation(DMask);
4350     } else if (!IsTFE && !BaseOpcode->Store) {
4351       // If dmask is 0, this is a no-op load. This can be eliminated.
4352       B.buildUndef(MI.getOperand(0));
4353       MI.eraseFromParent();
4354       return true;
4355     }
4356   }
4357 
4358   Observer.changingInstr(MI);
4359   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4360 
4361   unsigned NewOpcode = NumDefs == 0 ?
4362     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4363 
4364   // Track that we legalized this
4365   MI.setDesc(B.getTII().get(NewOpcode));
4366 
  // We expect to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4369   if (IsTFE && DMask == 0) {
4370     DMask = 0x1;
4371     DMaskLanes = 1;
4372     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4373   }
4374 
4375   if (BaseOpcode->Atomic) {
4376     Register VData0 = MI.getOperand(2).getReg();
4377     LLT Ty = MRI->getType(VData0);
4378 
4379     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4380     if (Ty.isVector())
4381       return false;
4382 
4383     if (BaseOpcode->AtomicX2) {
4384       Register VData1 = MI.getOperand(3).getReg();
4385       // The two values are packed in one register.
4386       LLT PackedTy = LLT::fixed_vector(2, Ty);
4387       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4388       MI.getOperand(2).setReg(Concat.getReg(0));
4389       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4390     }
4391   }
4392 
4393   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4394 
4395   // Optimize _L to _LZ when _L is zero
4396   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4397           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4398     const ConstantFP *ConstantLod;
4399 
4400     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4401                  m_GFCst(ConstantLod))) {
4402       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4403         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4404         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4405             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4406                                                      Intr->Dim);
4407 
4408         // The starting indexes should remain in the same place.
4409         --CorrectedNumVAddrs;
4410 
4411         MI.getOperand(MI.getNumExplicitDefs())
4412             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4413         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4414         Intr = NewImageDimIntr;
4415       }
4416     }
4417   }
4418 
4419   // Optimize _mip away, when 'lod' is zero
4420   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4421     int64_t ConstantLod;
4422     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4423                  m_ICst(ConstantLod))) {
4424       if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
4427         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4428         --CorrectedNumVAddrs;
4429       }
4430     }
4431   }
4432 
4433   // Rewrite the addressing register layout before doing anything else.
4434   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control, so both
    // gradients and addresses must be 16 bit.
4437     return false;
4438   }
4439 
4440   if (IsA16 && !ST.hasA16()) {
4441     // A16 not supported
4442     return false;
4443   }
4444 
4445   if (IsA16 || IsG16) {
4446     if (Intr->NumVAddrs > 1) {
4447       SmallVector<Register, 4> PackedRegs;
4448 
4449       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4450                                 IsG16);
4451 
4452       // See also below in the non-a16 branch
4453       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4454                           PackedRegs.size() <= ST.getNSAMaxSize();
4455 
4456       if (!UseNSA && PackedRegs.size() > 1) {
4457         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4458         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4459         PackedRegs[0] = Concat.getReg(0);
4460         PackedRegs.resize(1);
4461       }
4462 
4463       const unsigned NumPacked = PackedRegs.size();
4464       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4465         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4466         if (!SrcOp.isReg()) {
4467           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4468           continue;
4469         }
4470 
4471         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4472 
4473         if (I - Intr->VAddrStart < NumPacked)
4474           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4475         else
4476           SrcOp.setReg(AMDGPU::NoRegister);
4477       }
4478     }
4479   } else {
4480     // If the register allocator cannot place the address registers contiguously
4481     // without introducing moves, then using the non-sequential address encoding
4482     // is always preferable, since it saves VALU instructions and is usually a
4483     // wash in terms of code size or even better.
4484     //
4485     // However, we currently have no way of hinting to the register allocator
4486     // that MIMG addresses should be placed contiguously when it is possible to
4487     // do so, so force non-NSA for the common 2-address case as a heuristic.
4488     //
4489     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4490     // allocation when possible.
4491     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4492                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4493 
4494     if (!UseNSA && Intr->NumVAddrs > 1)
4495       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4496                                Intr->NumVAddrs);
4497   }
4498 
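  // Record how the addresses were legalized for the selector: bit 0 of the
  // flags operand indicates A16 addresses, bit 1 indicates G16 gradients.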
4499   int Flags = 0;
4500   if (IsA16)
4501     Flags |= 1;
4502   if (IsG16)
4503     Flags |= 2;
4504   MI.addOperand(MachineOperand::CreateImm(Flags));
4505 
4506   if (BaseOpcode->Store) { // No TFE for stores?
4507     // TODO: Handle dmask trim
4508     Register VData = MI.getOperand(1).getReg();
4509     LLT Ty = MRI->getType(VData);
4510     if (!Ty.isVector() || Ty.getElementType() != S16)
4511       return true;
4512 
4513     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4514     if (RepackedReg != VData) {
4515       MI.getOperand(1).setReg(RepackedReg);
4516     }
4517 
4518     return true;
4519   }
4520 
4521   Register DstReg = MI.getOperand(0).getReg();
4522   LLT Ty = MRI->getType(DstReg);
4523   const LLT EltTy = Ty.getScalarType();
4524   const bool IsD16 = Ty.getScalarType() == S16;
4525   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4526 
4527   // Confirm that the return type is large enough for the dmask specified
4528   if (NumElts < DMaskLanes)
4529     return false;
4530 
4531   if (NumElts > 4 || DMaskLanes > 4)
4532     return false;
4533 
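  // E.g. (illustrative): a <4 x s16> d16 load with dmask 0b0011 only defines
  // two lanes, so the adjusted type becomes <2 x s16>.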
4534   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4535   const LLT AdjustedTy =
4536       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4537 
  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4541   LLT RoundedTy;
4542 
  // S32 vector to cover all data, plus the TFE result element.
4544   LLT TFETy;
4545 
4546   // Register type to use for each loaded component. Will be S32 or V2S16.
4547   LLT RegTy;
4548 
4549   if (IsD16 && ST.hasUnpackedD16VMem()) {
4550     RoundedTy =
4551         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4552     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4553     RegTy = S32;
4554   } else {
4555     unsigned EltSize = EltTy.getSizeInBits();
4556     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4557     unsigned RoundedSize = 32 * RoundedElts;
4558     RoundedTy = LLT::scalarOrVector(
4559         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4560     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4561     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4562   }
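  // E.g. (illustrative, packed d16 without TFE): a <3 x s16> result occupies
  // 48 bits, which rounds up to two dwords, so RoundedTy is <4 x s16> and the
  // load result is unmerged into <2 x s16> pieces.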
4563 
4564   // The return type does not need adjustment.
4565   // TODO: Should we change s16 case to s32 or <2 x s16>?
4566   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4567     return true;
4568 
4569   Register Dst1Reg;
4570 
4571   // Insert after the instruction.
4572   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4573 
4574   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4575   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4576   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4577   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4578 
4579   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4580 
4581   MI.getOperand(0).setReg(NewResultReg);
4582 
4583   // In the IR, TFE is supposed to be used with a 2 element struct return
4584   // type. The instruction really returns these two values in one contiguous
4585   // register, with one additional dword beyond the loaded data. Rewrite the
4586   // return type to use a single register result.
4587 
4588   if (IsTFE) {
4589     Dst1Reg = MI.getOperand(1).getReg();
4590     if (MRI->getType(Dst1Reg) != S32)
4591       return false;
4592 
4593     // TODO: Make sure the TFE operand bit is set.
4594     MI.RemoveOperand(1);
4595 
4596     // Handle the easy case that requires no repack instructions.
4597     if (Ty == S32) {
4598       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4599       return true;
4600     }
4601   }
4602 
4603   // Now figure out how to copy the new result register back into the old
4604   // result.
4605   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4606 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4608 
4609   if (ResultNumRegs == 1) {
4610     assert(!IsTFE);
4611     ResultRegs[0] = NewResultReg;
4612   } else {
4613     // We have to repack into a new vector of some kind.
4614     for (int I = 0; I != NumDataRegs; ++I)
4615       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4616     B.buildUnmerge(ResultRegs, NewResultReg);
4617 
4618     // Drop the final TFE element to get the data part. The TFE result is
4619     // directly written to the right place already.
4620     if (IsTFE)
4621       ResultRegs.resize(NumDataRegs);
4622   }
4623 
4624   // For an s16 scalar result, we form an s32 result with a truncate regardless
4625   // of packed vs. unpacked.
4626   if (IsD16 && !Ty.isVector()) {
4627     B.buildTrunc(DstReg, ResultRegs[0]);
4628     return true;
4629   }
4630 
4631   // Avoid a build/concat_vector of 1 entry.
4632   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4633     B.buildBitcast(DstReg, ResultRegs[0]);
4634     return true;
4635   }
4636 
4637   assert(Ty.isVector());
4638 
4639   if (IsD16) {
4640     // For packed D16 results with TFE enabled, all the data components are
4641     // S32. Cast back to the expected type.
4642     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4645     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4646       for (Register &Reg : ResultRegs)
4647         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4648     } else if (ST.hasUnpackedD16VMem()) {
4649       for (Register &Reg : ResultRegs)
4650         Reg = B.buildTrunc(S16, Reg).getReg(0);
4651     }
4652   }
4653 
4654   auto padWithUndef = [&](LLT Ty, int NumElts) {
4655     if (NumElts == 0)
4656       return;
4657     Register Undef = B.buildUndef(Ty).getReg(0);
4658     for (int I = 0; I != NumElts; ++I)
4659       ResultRegs.push_back(Undef);
4660   };
4661 
4662   // Pad out any elements eliminated due to the dmask.
4663   LLT ResTy = MRI->getType(ResultRegs[0]);
4664   if (!ResTy.isVector()) {
4665     padWithUndef(ResTy, NumElts - ResultRegs.size());
4666     B.buildBuildVector(DstReg, ResultRegs);
4667     return true;
4668   }
4669 
4670   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4671   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4672 
4673   // Deal with the one annoying legal case.
4674   const LLT V3S16 = LLT::fixed_vector(3, 16);
4675   if (Ty == V3S16) {
4676     if (IsTFE) {
4677       if (ResultRegs.size() == 1) {
4678         NewResultReg = ResultRegs[0];
4679       } else if (ResultRegs.size() == 2) {
4680         LLT V4S16 = LLT::fixed_vector(4, 16);
4681         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4682       } else {
4683         return false;
4684       }
4685     }
4686 
4687     if (MRI->getType(DstReg).getNumElements() <
4688         MRI->getType(NewResultReg).getNumElements()) {
4689       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4690     } else {
4691       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4692     }
4693     return true;
4694   }
4695 
4696   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4697   B.buildConcatVectors(DstReg, ResultRegs);
4698   return true;
4699 }
4700 
4701 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4702   LegalizerHelper &Helper, MachineInstr &MI) const {
4703   MachineIRBuilder &B = Helper.MIRBuilder;
4704   GISelChangeObserver &Observer = Helper.Observer;
4705 
4706   Register Dst = MI.getOperand(0).getReg();
4707   LLT Ty = B.getMRI()->getType(Dst);
4708   unsigned Size = Ty.getSizeInBits();
4709   MachineFunction &MF = B.getMF();
4710 
4711   Observer.changingInstr(MI);
4712 
4713   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4714     Ty = getBitcastRegisterType(Ty);
4715     Helper.bitcastDst(MI, Ty, 0);
4716     Dst = MI.getOperand(0).getReg();
4717     B.setInsertPt(B.getMBB(), MI);
4718   }
4719 
4720   // FIXME: We don't really need this intermediate instruction. The intrinsic
4721   // should be fixed to have a memory operand. Since it's readnone, we're not
4722   // allowed to add one.
4723   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4724   MI.RemoveOperand(1); // Remove intrinsic ID
4725 
4726   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4727   // TODO: Should this use datalayout alignment?
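  // Size is in bits, so e.g. an s96 result gets a 12-byte memory operand with
  // 4-byte alignment.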
4728   const unsigned MemSize = (Size + 7) / 8;
4729   const Align MemAlign(4);
4730   MachineMemOperand *MMO = MF.getMachineMemOperand(
4731       MachinePointerInfo(),
4732       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4733           MachineMemOperand::MOInvariant,
4734       MemSize, MemAlign);
4735   MI.addMemOperand(MF, MMO);
4736 
4737   // There are no 96-bit result scalar loads, but widening to 128-bit should
4738   // always be legal. We may need to restore this to a 96-bit result if it turns
4739   // out this needs to be converted to a vector load during RegBankSelect.
4740   if (!isPowerOf2_32(Size)) {
4741     if (Ty.isVector())
4742       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4743     else
4744       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4745   }
4746 
4747   Observer.changedInstr(MI);
4748   return true;
4749 }
4750 
4751 // TODO: Move to selection
4752 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4753                                                 MachineRegisterInfo &MRI,
4754                                                 MachineIRBuilder &B) const {
4755   if (!ST.isTrapHandlerEnabled() ||
4756       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4757     return legalizeTrapEndpgm(MI, MRI, B);
4758 
4759   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4760     switch (*HsaAbiVer) {
4761     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4762     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4763       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4764     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4765       return ST.supportsGetDoorbellID() ?
4766           legalizeTrapHsa(MI, MRI, B) :
4767           legalizeTrapHsaQueuePtr(MI, MRI, B);
4768     }
4769   }
4770 
4771   llvm_unreachable("Unknown trap handler");
4772 }
4773 
4774 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4775     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4776   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4777   MI.eraseFromParent();
4778   return true;
4779 }
4780 
4781 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4782     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4783   // Pass queue pointer to trap handler as input, and insert trap instruction
4784   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
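  // The AMDHSA trap handler reads the queue pointer from SGPR0:SGPR1, so copy
  // the preloaded value there and mark the pair as an implicit use of S_TRAP.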
4785   Register LiveIn =
4786     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4787   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4788     return false;
4789 
4790   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4791   B.buildCopy(SGPR01, LiveIn);
4792   B.buildInstr(AMDGPU::S_TRAP)
4793       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4794       .addReg(SGPR01, RegState::Implicit);
4795 
4796   MI.eraseFromParent();
4797   return true;
4798 }
4799 
4800 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4801     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4802   B.buildInstr(AMDGPU::S_TRAP)
4803       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4804   MI.eraseFromParent();
4805   return true;
4806 }
4807 
4808 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4809     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4812   if (!ST.isTrapHandlerEnabled() ||
4813       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4814     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4815                                      "debugtrap handler not supported",
4816                                      MI.getDebugLoc(), DS_Warning);
4817     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4818     Ctx.diagnose(NoTrap);
4819   } else {
4820     // Insert debug-trap instruction
4821     B.buildInstr(AMDGPU::S_TRAP)
4822         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4823   }
4824 
4825   MI.eraseFromParent();
4826   return true;
4827 }
4828 
4829 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4830                                                MachineIRBuilder &B) const {
4831   MachineRegisterInfo &MRI = *B.getMRI();
4832   const LLT S16 = LLT::scalar(16);
4833   const LLT S32 = LLT::scalar(32);
4834 
4835   Register DstReg = MI.getOperand(0).getReg();
4836   Register NodePtr = MI.getOperand(2).getReg();
4837   Register RayExtent = MI.getOperand(3).getReg();
4838   Register RayOrigin = MI.getOperand(4).getReg();
4839   Register RayDir = MI.getOperand(5).getReg();
4840   Register RayInvDir = MI.getOperand(6).getReg();
4841   Register TDescr = MI.getOperand(7).getReg();
4842 
4843   if (!ST.hasGFX10_AEncoding()) {
4844     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4845                                         "intrinsic not supported on subtarget",
4846                                         MI.getDebugLoc());
4847     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4848     return false;
4849   }
4850 
4851   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4852   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4853   const unsigned NumVDataDwords = 4;
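  // vaddr dwords (illustrative breakdown): node pointer (1 or 2), ray extent
  // (1), ray origin (3), then either three dwords of packed 16-bit dir/inv_dir
  // components (A16) or three dwords each for dir and inv_dir.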
4854   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4855   const bool UseNSA =
4856       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4857   const unsigned BaseOpcodes[2][2] = {
4858       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4859       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4860        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4861   int Opcode;
4862   if (UseNSA) {
4863     Opcode =
4864         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4865                               NumVDataDwords, NumVAddrDwords);
4866   } else {
4867     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4868                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4869                                    PowerOf2Ceil(NumVAddrDwords));
4870   }
4871   assert(Opcode != -1);
4872 
4873   SmallVector<Register, 12> Ops;
4874   if (Is64) {
4875     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4876     Ops.push_back(Unmerge.getReg(0));
4877     Ops.push_back(Unmerge.getReg(1));
4878   } else {
4879     Ops.push_back(NodePtr);
4880   }
4881   Ops.push_back(RayExtent);
4882 
4883   auto packLanes = [&Ops, &S32, &B](Register Src) {
4884     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4885     Ops.push_back(Unmerge.getReg(0));
4886     Ops.push_back(Unmerge.getReg(1));
4887     Ops.push_back(Unmerge.getReg(2));
4888   };
4889 
4890   packLanes(RayOrigin);
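    // With A16, the six 16-bit dir/inv_dir components are packed into three
    // dwords: (dir.x, dir.y), (dir.z, inv_dir.x) and (inv_dir.y, inv_dir.z).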
4891   if (IsA16) {
4892     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4893     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4894     Register R1 = MRI.createGenericVirtualRegister(S32);
4895     Register R2 = MRI.createGenericVirtualRegister(S32);
4896     Register R3 = MRI.createGenericVirtualRegister(S32);
4897     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4898     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4899     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4900     Ops.push_back(R1);
4901     Ops.push_back(R2);
4902     Ops.push_back(R3);
4903   } else {
4904     packLanes(RayDir);
4905     packLanes(RayInvDir);
4906   }
4907 
4908   if (!UseNSA) {
    // Build a single vector containing all of the operands prepared so far.
4910     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4911     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4912     Ops.clear();
4913     Ops.push_back(MergedOps);
4914   }
4915 
4916   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4917     .addDef(DstReg)
4918     .addImm(Opcode);
4919 
4920   for (Register R : Ops) {
4921     MIB.addUse(R);
4922   }
4923 
4924   MIB.addUse(TDescr)
4925      .addImm(IsA16 ? 1 : 0)
4926      .cloneMemRefs(MI);
4927 
4928   MI.eraseFromParent();
4929   return true;
4930 }
4931 
4932 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4933                                             MachineInstr &MI) const {
4934   MachineIRBuilder &B = Helper.MIRBuilder;
4935   MachineRegisterInfo &MRI = *B.getMRI();
4936 
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4938   auto IntrID = MI.getIntrinsicID();
4939   switch (IntrID) {
4940   case Intrinsic::amdgcn_if:
4941   case Intrinsic::amdgcn_else: {
4942     MachineInstr *Br = nullptr;
4943     MachineBasicBlock *UncondBrTarget = nullptr;
4944     bool Negated = false;
4945     if (MachineInstr *BrCond =
4946             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4947       const SIRegisterInfo *TRI
4948         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4949 
4950       Register Def = MI.getOperand(1).getReg();
4951       Register Use = MI.getOperand(3).getReg();
4952 
4953       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4954 
4955       if (Negated)
4956         std::swap(CondBrTarget, UncondBrTarget);
4957 
4958       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4959       if (IntrID == Intrinsic::amdgcn_if) {
4960         B.buildInstr(AMDGPU::SI_IF)
4961           .addDef(Def)
4962           .addUse(Use)
4963           .addMBB(UncondBrTarget);
4964       } else {
4965         B.buildInstr(AMDGPU::SI_ELSE)
4966             .addDef(Def)
4967             .addUse(Use)
4968             .addMBB(UncondBrTarget);
4969       }
4970 
4971       if (Br) {
4972         Br->getOperand(0).setMBB(CondBrTarget);
4973       } else {
4974         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4975         // since we're swapping branch targets it needs to be reinserted.
4976         // FIXME: IRTranslator should probably not do this
4977         B.buildBr(*CondBrTarget);
4978       }
4979 
4980       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4981       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4982       MI.eraseFromParent();
4983       BrCond->eraseFromParent();
4984       return true;
4985     }
4986 
4987     return false;
4988   }
4989   case Intrinsic::amdgcn_loop: {
4990     MachineInstr *Br = nullptr;
4991     MachineBasicBlock *UncondBrTarget = nullptr;
4992     bool Negated = false;
4993     if (MachineInstr *BrCond =
4994             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4995       const SIRegisterInfo *TRI
4996         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4997 
4998       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4999       Register Reg = MI.getOperand(2).getReg();
5000 
5001       if (Negated)
5002         std::swap(CondBrTarget, UncondBrTarget);
5003 
5004       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5005       B.buildInstr(AMDGPU::SI_LOOP)
5006         .addUse(Reg)
5007         .addMBB(UncondBrTarget);
5008 
5009       if (Br)
5010         Br->getOperand(0).setMBB(CondBrTarget);
5011       else
5012         B.buildBr(*CondBrTarget);
5013 
5014       MI.eraseFromParent();
5015       BrCond->eraseFromParent();
5016       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5017       return true;
5018     }
5019 
5020     return false;
5021   }
5022   case Intrinsic::amdgcn_kernarg_segment_ptr:
5023     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5024       // This only makes sense to call in a kernel, so just lower to null.
5025       B.buildConstant(MI.getOperand(0).getReg(), 0);
5026       MI.eraseFromParent();
5027       return true;
5028     }
5029 
5030     return legalizePreloadedArgIntrin(
5031       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5032   case Intrinsic::amdgcn_implicitarg_ptr:
5033     return legalizeImplicitArgPtr(MI, MRI, B);
5034   case Intrinsic::amdgcn_workitem_id_x:
5035     return legalizePreloadedArgIntrin(MI, MRI, B,
5036                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5037   case Intrinsic::amdgcn_workitem_id_y:
5038     return legalizePreloadedArgIntrin(MI, MRI, B,
5039                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5040   case Intrinsic::amdgcn_workitem_id_z:
5041     return legalizePreloadedArgIntrin(MI, MRI, B,
5042                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5043   case Intrinsic::amdgcn_workgroup_id_x:
5044     return legalizePreloadedArgIntrin(MI, MRI, B,
5045                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5046   case Intrinsic::amdgcn_workgroup_id_y:
5047     return legalizePreloadedArgIntrin(MI, MRI, B,
5048                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5049   case Intrinsic::amdgcn_workgroup_id_z:
5050     return legalizePreloadedArgIntrin(MI, MRI, B,
5051                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5052   case Intrinsic::amdgcn_dispatch_ptr:
5053     return legalizePreloadedArgIntrin(MI, MRI, B,
5054                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5055   case Intrinsic::amdgcn_queue_ptr:
5056     return legalizePreloadedArgIntrin(MI, MRI, B,
5057                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5058   case Intrinsic::amdgcn_implicit_buffer_ptr:
5059     return legalizePreloadedArgIntrin(
5060       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5061   case Intrinsic::amdgcn_dispatch_id:
5062     return legalizePreloadedArgIntrin(MI, MRI, B,
5063                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5064   case Intrinsic::amdgcn_fdiv_fast:
5065     return legalizeFDIVFastIntrin(MI, MRI, B);
5066   case Intrinsic::amdgcn_is_shared:
5067     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5068   case Intrinsic::amdgcn_is_private:
5069     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5070   case Intrinsic::amdgcn_wavefrontsize: {
5071     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5072     MI.eraseFromParent();
5073     return true;
5074   }
5075   case Intrinsic::amdgcn_s_buffer_load:
5076     return legalizeSBufferLoad(Helper, MI);
5077   case Intrinsic::amdgcn_raw_buffer_store:
5078   case Intrinsic::amdgcn_struct_buffer_store:
5079     return legalizeBufferStore(MI, MRI, B, false, false);
5080   case Intrinsic::amdgcn_raw_buffer_store_format:
5081   case Intrinsic::amdgcn_struct_buffer_store_format:
5082     return legalizeBufferStore(MI, MRI, B, false, true);
5083   case Intrinsic::amdgcn_raw_tbuffer_store:
5084   case Intrinsic::amdgcn_struct_tbuffer_store:
5085     return legalizeBufferStore(MI, MRI, B, true, true);
5086   case Intrinsic::amdgcn_raw_buffer_load:
5087   case Intrinsic::amdgcn_struct_buffer_load:
5088     return legalizeBufferLoad(MI, MRI, B, false, false);
5089   case Intrinsic::amdgcn_raw_buffer_load_format:
5090   case Intrinsic::amdgcn_struct_buffer_load_format:
5091     return legalizeBufferLoad(MI, MRI, B, true, false);
5092   case Intrinsic::amdgcn_raw_tbuffer_load:
5093   case Intrinsic::amdgcn_struct_tbuffer_load:
5094     return legalizeBufferLoad(MI, MRI, B, true, true);
5095   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5096   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5097   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5098   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5099   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5100   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5101   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5102   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5103   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5104   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5105   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5106   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5107   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5108   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5109   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5110   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5111   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5112   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5113   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5114   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5115   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5116   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5117   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5118   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5119   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5120   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5121   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5122   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5123   case Intrinsic::amdgcn_buffer_atomic_fadd:
5124   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5125   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5126   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5127   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5128     return legalizeBufferAtomic(MI, B, IntrID);
5129   case Intrinsic::amdgcn_atomic_inc:
5130     return legalizeAtomicIncDec(MI, B, true);
5131   case Intrinsic::amdgcn_atomic_dec:
5132     return legalizeAtomicIncDec(MI, B, false);
5133   case Intrinsic::trap:
5134     return legalizeTrapIntrinsic(MI, MRI, B);
5135   case Intrinsic::debugtrap:
5136     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5137   case Intrinsic::amdgcn_rsq_clamp:
5138     return legalizeRsqClampIntrinsic(MI, MRI, B);
5139   case Intrinsic::amdgcn_ds_fadd:
5140   case Intrinsic::amdgcn_ds_fmin:
5141   case Intrinsic::amdgcn_ds_fmax:
5142     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5143   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5144     return legalizeBVHIntrinsic(MI, B);
5145   default: {
5146     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5147             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5148       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5149     return true;
5150   }
5151   }
5152 
5153   return true;
5154 }
5155