1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements to the next power of two.
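// e.g. <3 x s16> -> <4 x s16>.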
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits to the next power of two.
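// e.g. s48 -> s64, s65 -> s128.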
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
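// True for vectors of 16-bit elements with more than two elements, e.g.
// <4 x s16> or <6 x s16>.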
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
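// Mutation that pads a vector with one additional element, e.g.
// <3 x s16> -> <4 x s16>.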
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
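// Reduce the number of vector elements so that each resulting piece fits in
// 64 bits, e.g. a <4 x s32> (128-bit) operation is broken into <2 x s32>
// pieces.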
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
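// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).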
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
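  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>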
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
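// Bitcast to a vector of 32-bit elements covering the same total size, e.g.
// <4 x s16> -> <2 x s32> and <8 x s16> -> <4 x s32>.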
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
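/// For example, a <2 x s8> load or store is expected to be bitcast to an s16
/// access (see getBitcastRegisterType).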
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
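/// e.g. an s24 (3-byte) load known to be 32-bit aligned may be widened to an
/// s32 load, subject to the subtarget's unaligned access checks below.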
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElementsStrict(0, S16, 2)
969       // FIXME: Fixing non-power-of-2 before clamp is workaround for
970       // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
997       // FIXME: Fixing non-power-of-2 before clamp is workaround for
998       // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117                                       {S64, ConstantPtr, S64, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175               unsigned Align = Query.MMODescrs[0].AlignInBits;
1176               return std::make_pair(0, LLT::scalar(Align));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387                  bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1430           // TODO: verify immediate offset here since lower only works with
1431           // whole elements.
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return BigTy.isVector();
1434         })
1435       // FIXME: Multiples of 16 should not be legal.
1436       .legalIf([=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (BigTy.getSizeInBits() % 32 == 0) &&
1440                  (LitTy.getSizeInBits() % 16 == 0);
1441         })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
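    // Rough illustration of how these rules compose (assumed example, not a
    // specific test): a G_EXTRACT of s8 from s24 is first widened to s16/s32
    // by the widenScalarIf/widenScalarToNextPow2 rules, at which point the
    // earlier lowerIf/legalIf rules apply to the re-queried (s16, s32) form.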
1456 
1457   }
1458 
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472 
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480 
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485 
1486   BuildVector.legalIf(isRegisterType(0));
1487 
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496   // pre-legalize.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520 
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525           const LLT BigTy = Query.Types[BigTyIdx];
1526           return BigTy.getSizeInBits() == 32;
1527         })
1528       // Try to widen to s16 first for small types.
1529       // TODO: Only do this on targets with legal s16 shifts
1530       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534                            elementTypeIs(1, S16)),
1535                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1539       .clampScalar(LitTyIdx, S32, S512)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541       // Break up vectors with weird elements into scalars
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544         scalarize(0))
1545       .fewerElementsIf(
1546         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547         scalarize(1))
1548       .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550     if (Op == G_MERGE_VALUES) {
1551       Builder.widenScalarIf(
1552         // TODO: Use 16-bit shifts if legal for 8-bit values?
1553         [=](const LegalityQuery &Query) {
1554           const LLT Ty = Query.Types[LitTyIdx];
1555           return Ty.getSizeInBits() < 32;
1556         },
1557         changeTo(LitTyIdx, S32));
1558     }
1559 
1560     Builder.widenScalarIf(
1561       [=](const LegalityQuery &Query) {
1562         const LLT Ty = Query.Types[BigTyIdx];
1563         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564           Ty.getSizeInBits() % 16 != 0;
1565       },
1566       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1569         const LLT &Ty = Query.Types[BigTyIdx];
1570         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571         if (NewSizeInBits >= 256) {
1572           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573           if (RoundedTo < NewSizeInBits)
1574             NewSizeInBits = RoundedTo;
1575         }
1576         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577       })
1578       // Any vectors left are the wrong size. Scalarize them.
1579       .scalarize(0)
1580       .scalarize(1);
1581   }
1582 
1583   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584   // RegBankSelect.
1585   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586     .legalFor({{S32}, {S64}});
1587 
1588   if (ST.hasVOP3PInsts()) {
1589     SextInReg.lowerFor({{V2S16}})
1590       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591       // get more vector shift opportunities, since we'll get those when
1592       // expanded.
1593       .clampMaxNumElementsStrict(0, S16, 2);
1594   } else if (ST.has16BitInsts()) {
1595     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596   } else {
1597     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1599     SextInReg.lowerFor({{S32}, {S64}});
1600   }
1601 
1602   SextInReg
1603     .scalarize(0)
1604     .clampScalar(0, S32, S64)
1605     .lower();
1606 
1607   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608     .scalarize(0)
1609     .lower();
1610 
  // TODO: Only try to form v2s16 with legal packed instructions.
1612   getActionDefinitionsBuilder(G_FSHR)
1613     .legalFor({{S32, S32}})
1614     .lowerFor({{V2S16, V2S16}})
1615     .clampMaxNumElementsStrict(0, S16, 2)
1616     .scalarize(0)
1617     .lower();
1618 
1619   if (ST.hasVOP3PInsts()) {
1620     getActionDefinitionsBuilder(G_FSHL)
1621       .lowerFor({{V2S16, V2S16}})
1622       .clampMaxNumElementsStrict(0, S16, 2)
1623       .scalarize(0)
1624       .lower();
1625   } else {
1626     getActionDefinitionsBuilder(G_FSHL)
1627       .scalarize(0)
1628       .lower();
1629   }
1630 
1631   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632     .legalFor({S64});
1633 
1634   getActionDefinitionsBuilder(G_FENCE)
1635     .alwaysLegal();
1636 
1637   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638       .scalarize(0)
1639       .minScalar(0, S32)
1640       .lower();
1641 
1642   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643       .legalFor({{S32, S32}, {S64, S32}})
1644       .clampScalar(1, S32, S32)
1645       .clampScalar(0, S32, S64)
1646       .widenScalarToNextPow2(0)
1647       .scalarize(0);
1648 
1649   getActionDefinitionsBuilder({
1650       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651       G_FCOPYSIGN,
1652 
1653       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654       G_ATOMICRMW_NAND,
1655       G_ATOMICRMW_FSUB,
1656       G_READ_REGISTER,
1657       G_WRITE_REGISTER,
1658 
1659       G_SADDO, G_SSUBO,
1660 
      // TODO: Implement
1662       G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   case TargetOpcode::G_CTLZ:
1747   case TargetOpcode::G_CTTZ:
1748     return legalizeCTLZ_CTTZ(MI, MRI, B);
1749   default:
1750     return false;
1751   }
1752 
1753   llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757   unsigned AS,
1758   MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762   const LLT S32 = LLT::scalar(32);
1763 
1764   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766   if (ST.hasApertureRegs()) {
1767     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768     // getreg.
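    // Summary of the sequence below (not a hardware spec reference): the
    // aperture base is a 16-bit field of the MEM_BASES hardware register, so
    // reading it with s_getreg_b32 and shifting left by the field width
    // (WidthM1 + 1 == 16) reconstructs the 32-bit aperture value.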
1769     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775     unsigned Encoding =
1776         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782     B.buildInstr(AMDGPU::S_GETREG_B32)
1783       .addDef(GetReg)
1784       .addImm(Encoding);
1785     MRI.setType(GetReg, S32);
1786 
1787     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789   }
1790 
1791   Register QueuePtr = MRI.createGenericVirtualRegister(
1792     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795     return Register();
1796 
1797   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798   // private_segment_aperture_base_hi.
1799   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
1801   // TODO: can we be smarter about machine pointer info?
1802   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803   MachineMemOperand *MMO = MF.getMachineMemOperand(
1804       PtrInfo,
1805       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806           MachineMemOperand::MOInvariant,
1807       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809   Register LoadAddr;
1810 
1811   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   MachineFunction &MF = B.getMF();
1819 
1820   const LLT S32 = LLT::scalar(32);
1821   Register Dst = MI.getOperand(0).getReg();
1822   Register Src = MI.getOperand(1).getReg();
1823 
1824   LLT DstTy = MRI.getType(Dst);
1825   LLT SrcTy = MRI.getType(Src);
1826   unsigned DestAS = DstTy.getAddressSpace();
1827   unsigned SrcAS = SrcTy.getAddressSpace();
1828 
1829   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1830   // vector element.
1831   assert(!DstTy.isVector());
1832 
1833   const AMDGPUTargetMachine &TM
1834     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1835 
1836   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1837     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1838     return true;
1839   }
1840 
1841   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1842     // Truncate.
1843     B.buildExtract(Dst, Src, 0);
1844     MI.eraseFromParent();
1845     return true;
1846   }
1847 
1848   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1849     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1850     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1851 
1852     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1853     // another. Merge operands are required to be the same type, but creating an
1854     // extra ptrtoint would be kind of pointless.
1855     auto HighAddr = B.buildConstant(
1856       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1857     B.buildMerge(Dst, {Src, HighAddr});
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1863     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1864            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1865     unsigned NullVal = TM.getNullPointerValue(DestAS);
1866 
1867     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1868     auto FlatNull = B.buildConstant(SrcTy, 0);
1869 
1870     // Extract low 32-bits of the pointer.
1871     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1872 
1873     auto CmpRes =
1874         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1875     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1876 
1877     MI.eraseFromParent();
1878     return true;
1879   }
1880 
1881   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1882     return false;
1883 
1884   if (!ST.hasFlatAddressSpace())
1885     return false;
1886 
1887   auto SegmentNull =
1888       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1889   auto FlatNull =
1890       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1891 
1892   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1893   if (!ApertureReg.isValid())
1894     return false;
1895 
1896   auto CmpRes =
1897       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1898 
1899   // Coerce the type of the low half of the result so we can use merge_values.
1900   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1901 
1902   // TODO: Should we allow mismatched types but matching sizes in merges to
1903   // avoid the ptrtoint?
1904   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1905   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1906 
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrint(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914   Register Src = MI.getOperand(1).getReg();
1915   LLT Ty = MRI.getType(Src);
1916   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1917 
1918   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1919   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1920 
1921   auto C1 = B.buildFConstant(Ty, C1Val);
1922   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1923 
1924   // TODO: Should this propagate fast-math-flags?
1925   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1926   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1927 
1928   auto C2 = B.buildFConstant(Ty, C2Val);
1929   auto Fabs = B.buildFAbs(Ty, Src);
1930 
1931   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1932   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1933   MI.eraseFromParent();
1934   return true;
1935 }
1936 
1937 bool AMDGPULegalizerInfo::legalizeFceil(
1938   MachineInstr &MI, MachineRegisterInfo &MRI,
1939   MachineIRBuilder &B) const {
1940 
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S64 = LLT::scalar(64);
1943 
1944   Register Src = MI.getOperand(1).getReg();
1945   assert(MRI.getType(Src) == S64);
1946 
1947   // result = trunc(src)
1948   // if (src > 0.0 && src != result)
1949   //   result += 1.0
1950 
1951   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1952 
1953   const auto Zero = B.buildFConstant(S64, 0.0);
1954   const auto One = B.buildFConstant(S64, 1.0);
1955   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1956   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1957   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1958   auto Add = B.buildSelect(S64, And, One, Zero);
1959 
1960   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1963 }
1964 
1965 bool AMDGPULegalizerInfo::legalizeFrem(
1966   MachineInstr &MI, MachineRegisterInfo &MRI,
1967   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1980 }
1981 
1982 static MachineInstrBuilder extractF64Exponent(Register Hi,
1983                                               MachineIRBuilder &B) {
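  // Worked example (illustrative): for 1.0 (0x3FF0000000000000) the high word
  // is 0x3FF00000; the 11-bit field starting at bit 20 is 0x3FF = 1023, and
  // the returned unbiased exponent is 1023 - 1023 = 0.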
1984   const unsigned FractBits = 52;
1985   const unsigned ExpBits = 11;
1986   LLT S32 = LLT::scalar(32);
1987 
1988   auto Const0 = B.buildConstant(S32, FractBits - 32);
1989   auto Const1 = B.buildConstant(S32, ExpBits);
1990 
1991   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1992     .addUse(Hi)
1993     .addUse(Const0.getReg(0))
1994     .addUse(Const1.getReg(0));
1995 
1996   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002   const LLT S1 = LLT::scalar(1);
2003   const LLT S32 = LLT::scalar(32);
2004   const LLT S64 = LLT::scalar(64);
2005 
2006   Register Src = MI.getOperand(1).getReg();
2007   assert(MRI.getType(Src) == S64);
2008 
2009   // TODO: Should this use extract since the low half is unused?
2010   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2011   Register Hi = Unmerge.getReg(1);
2012 
2013   // Extract the upper half, since this is where we will find the sign and
2014   // exponent.
2015   auto Exp = extractF64Exponent(Hi, B);
2016 
2017   const unsigned FractBits = 52;
2018 
2019   // Extract the sign bit.
2020   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2021   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2022 
2023   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2024 
2025   const auto Zero32 = B.buildConstant(S32, 0);
2026 
2027   // Extend back to 64-bits.
2028   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2029 
2030   auto Shr = B.buildAShr(S64, FractMask, Exp);
2031   auto Not = B.buildNot(S64, Shr);
2032   auto Tmp0 = B.buildAnd(S64, Src, Not);
2033   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2034 
2035   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2036   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2037 
2038   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2039   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2040   MI.eraseFromParent();
2041   return true;
2042 }
2043 
2044 bool AMDGPULegalizerInfo::legalizeITOFP(
2045   MachineInstr &MI, MachineRegisterInfo &MRI,
2046   MachineIRBuilder &B, bool Signed) const {
2047 
2048   Register Dst = MI.getOperand(0).getReg();
2049   Register Src = MI.getOperand(1).getReg();
2050 
2051   const LLT S64 = LLT::scalar(64);
2052   const LLT S32 = LLT::scalar(32);
2053 
2054   assert(MRI.getType(Src) == S64);
2055 
2056   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2057   auto ThirtyTwo = B.buildConstant(S32, 32);
2058 
2059   if (MRI.getType(Dst) == S64) {
2060     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2061                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2062 
2063     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2064     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2065                      .addUse(CvtHi.getReg(0))
2066                      .addUse(ThirtyTwo.getReg(0));
2067 
2068     // TODO: Should this propagate fast-math-flags?
2069     B.buildFAdd(Dst, LdExp, CvtLo);
2070     MI.eraseFromParent();
2071     return true;
2072   }
2073 
2074   assert(MRI.getType(Dst) == S32);
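  // Sketch of the s64 -> f32 path below (a summary of this code, not a spec):
  // shift the input left so its topmost set bit lands in the high word,
  // convert that word, and scale the result back with ldexp(FVal, 32 - ShAmt).
  // Any nonzero bits left in the low word are OR'd in as a sticky bit so the
  // conversion still rounds correctly.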
2075 
2076   auto One = B.buildConstant(S32, 1);
2077 
2078   MachineInstrBuilder ShAmt;
2079   if (Signed) {
2080     auto ThirtyOne = B.buildConstant(S32, 31);
2081     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2082     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2083     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2084     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2085                                /*HasSideEffects=*/false)
2086                   .addUse(Unmerge.getReg(1));
2087     auto LS2 = B.buildSub(S32, LS, One);
2088     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2089   } else
2090     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2091   auto Norm = B.buildShl(S64, Src, ShAmt);
2092   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2093   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2094   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2095   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2096   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2097   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2098                    /*HasSideEffects=*/false)
2099       .addUse(FVal.getReg(0))
2100       .addUse(Scale.getReg(0));
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 // TODO: Copied from DAG implementation. Verify logic and document how this
2106 // actually works.
2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2108                                         MachineRegisterInfo &MRI,
2109                                         MachineIRBuilder &B,
2110                                         bool Signed) const {
2111 
2112   Register Dst = MI.getOperand(0).getReg();
2113   Register Src = MI.getOperand(1).getReg();
2114 
2115   const LLT S64 = LLT::scalar(64);
2116   const LLT S32 = LLT::scalar(32);
2117 
2118   const LLT SrcLT = MRI.getType(Src);
2119   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2120 
2121   unsigned Flags = MI.getFlags();
2122 
2123   // The basic idea of converting a floating point number into a pair of 32-bit
2124   // integers is illustrated as follows:
2125   //
2126   //     tf := trunc(val);
2127   //    hif := floor(tf * 2^-32);
2128   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2129   //     hi := fptoi(hif);
2130   //     lo := fptoi(lof);
2131   //
2132   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2133   MachineInstrBuilder Sign;
2134   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2140     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2141     Trunc = B.buildFAbs(S32, Trunc, Flags);
2142   }
2143   MachineInstrBuilder K0, K1;
2144   if (SrcLT == S64) {
2145     K0 = B.buildFConstant(S64,
2146                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2147     K1 = B.buildFConstant(S64,
2148                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2149   } else {
2150     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2151     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2152   }
2153 
2154   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2155   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2156   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2157 
2158   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2159                                      : B.buildFPTOUI(S32, FloorMul);
2160   auto Lo = B.buildFPTOUI(S32, Fma);
2161 
2162   if (Signed && SrcLT == S32) {
2163     // Flip the result based on the signedness, which is either all 0s or 1s.
2164     Sign = B.buildMerge(S64, {Sign, Sign});
2165     // r := xor({lo, hi}, sign) - sign;
2166     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2167   } else
2168     B.buildMerge(Dst, {Lo, Hi});
2169   MI.eraseFromParent();
2170 
2171   return true;
2172 }
2173 
2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2175                                                MachineInstr &MI) const {
2176   MachineFunction &MF = Helper.MIRBuilder.getMF();
2177   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2178 
2179   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2180                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2181 
2182   // With ieee_mode disabled, the instructions have the correct behavior
2183   // already for G_FMINNUM/G_FMAXNUM
2184   if (!MFI->getMode().IEEE)
2185     return !IsIEEEOp;
2186 
2187   if (IsIEEEOp)
2188     return true;
2189 
2190   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2194   MachineInstr &MI, MachineRegisterInfo &MRI,
2195   MachineIRBuilder &B) const {
2196   // TODO: Should move some of this into LegalizerHelper.
2197 
2198   // TODO: Promote dynamic indexing of s16 to s32
2199 
2200   // FIXME: Artifact combiner probably should have replaced the truncated
2201   // constant before this, so we shouldn't need
2202   // getIConstantVRegValWithLookThrough.
2203   Optional<ValueAndVReg> MaybeIdxVal =
2204       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2205   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2206     return true;
2207   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2208 
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Vec = MI.getOperand(1).getReg();
2211 
2212   LLT VecTy = MRI.getType(Vec);
2213   LLT EltTy = VecTy.getElementType();
2214   assert(EltTy == MRI.getType(Dst));
2215 
2216   if (IdxVal < VecTy.getNumElements()) {
2217     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2218     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2219   } else {
2220     B.buildUndef(Dst);
2221   }
2222 
2223   MI.eraseFromParent();
2224   return true;
2225 }
2226 
2227 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2228   MachineInstr &MI, MachineRegisterInfo &MRI,
2229   MachineIRBuilder &B) const {
2230   // TODO: Should move some of this into LegalizerHelper.
2231 
2232   // TODO: Promote dynamic indexing of s16 to s32
2233 
2234   // FIXME: Artifact combiner probably should have replaced the truncated
2235   // constant before this, so we shouldn't need
2236   // getIConstantVRegValWithLookThrough.
2237   Optional<ValueAndVReg> MaybeIdxVal =
2238       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2239   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2240     return true;
2241 
2242   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2243   Register Dst = MI.getOperand(0).getReg();
2244   Register Vec = MI.getOperand(1).getReg();
2245   Register Ins = MI.getOperand(2).getReg();
2246 
2247   LLT VecTy = MRI.getType(Vec);
2248   LLT EltTy = VecTy.getElementType();
2249   assert(EltTy == MRI.getType(Ins));
2250 
2251   unsigned NumElts = VecTy.getNumElements();
2252   if (IdxVal < NumElts) {
2253     SmallVector<Register, 8> SrcRegs;
2254     for (unsigned i = 0; i < NumElts; ++i)
2255       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2256     B.buildUnmerge(SrcRegs, Vec);
2257 
    SrcRegs[IdxVal] = Ins;
2259     B.buildMerge(Dst, SrcRegs);
2260   } else {
2261     B.buildUndef(Dst);
2262   }
2263 
2264   MI.eraseFromParent();
2265   return true;
2266 }
2267 
2268 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2269   MachineInstr &MI, MachineRegisterInfo &MRI,
2270   MachineIRBuilder &B) const {
2271   const LLT V2S16 = LLT::fixed_vector(2, 16);
2272 
2273   Register Dst = MI.getOperand(0).getReg();
2274   Register Src0 = MI.getOperand(1).getReg();
2275   LLT DstTy = MRI.getType(Dst);
2276   LLT SrcTy = MRI.getType(Src0);
2277 
2278   if (SrcTy == V2S16 && DstTy == V2S16 &&
2279       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2280     return true;
2281 
2282   MachineIRBuilder HelperBuilder(MI);
2283   GISelObserverWrapper DummyObserver;
2284   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2285   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2286 }
2287 
2288 bool AMDGPULegalizerInfo::legalizeSinCos(
2289   MachineInstr &MI, MachineRegisterInfo &MRI,
2290   MachineIRBuilder &B) const {
2291 
2292   Register DstReg = MI.getOperand(0).getReg();
2293   Register SrcReg = MI.getOperand(1).getReg();
2294   LLT Ty = MRI.getType(DstReg);
2295   unsigned Flags = MI.getFlags();
2296 
2297   Register TrigVal;
2298   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2299   if (ST.hasTrigReducedRange()) {
2300     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2301     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2302       .addUse(MulVal.getReg(0))
2303       .setMIFlags(Flags).getReg(0);
2304   } else
2305     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2306 
2307   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2308     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2309   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2310     .addUse(TrigVal)
2311     .setMIFlags(Flags);
2312   MI.eraseFromParent();
2313   return true;
2314 }
2315 
2316 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2317                                                   MachineIRBuilder &B,
2318                                                   const GlobalValue *GV,
2319                                                   int64_t Offset,
2320                                                   unsigned GAFlags) const {
2321   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2322   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2323   // to the following code sequence:
2324   //
2325   // For constant address space:
2326   //   s_getpc_b64 s[0:1]
2327   //   s_add_u32 s0, s0, $symbol
2328   //   s_addc_u32 s1, s1, 0
2329   //
2330   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2331   //   a fixup or relocation is emitted to replace $symbol with a literal
2332   //   constant, which is a pc-relative offset from the encoding of the $symbol
2333   //   operand to the global variable.
2334   //
2335   // For global address space:
2336   //   s_getpc_b64 s[0:1]
2337   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2338   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2339   //
2340   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2341   //   fixups or relocations are emitted to replace $symbol@*@lo and
2342   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2343   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2344   //   operand to the global variable.
2345   //
2346   // What we want here is an offset from the value returned by s_getpc
2347   // (which is the address of the s_add_u32 instruction) to the global
2348   // variable, but since the encoding of $symbol starts 4 bytes after the start
2349   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2350   // small. This requires us to add 4 to the global variable offset in order to
2351   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2352   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2353   // instruction.
2354 
2355   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2356 
2357   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2358     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2359 
2360   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2361     .addDef(PCReg);
2362 
2363   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2364   if (GAFlags == SIInstrInfo::MO_NONE)
2365     MIB.addImm(0);
2366   else
2367     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2368 
2369   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2370 
2371   if (PtrTy.getSizeInBits() == 32)
2372     B.buildExtract(DstReg, PCReg, 0);
2373   return true;
2374  }
2375 
2376 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2377   MachineInstr &MI, MachineRegisterInfo &MRI,
2378   MachineIRBuilder &B) const {
2379   Register DstReg = MI.getOperand(0).getReg();
2380   LLT Ty = MRI.getType(DstReg);
2381   unsigned AS = Ty.getAddressSpace();
2382 
2383   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2384   MachineFunction &MF = B.getMF();
2385   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2386 
2387   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2388     if (!MFI->isModuleEntryFunction() &&
2389         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2390       const Function &Fn = MF.getFunction();
2391       DiagnosticInfoUnsupported BadLDSDecl(
2392         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2393         DS_Warning);
2394       Fn.getContext().diagnose(BadLDSDecl);
2395 
2396       // We currently don't have a way to correctly allocate LDS objects that
2397       // aren't directly associated with a kernel. We do force inlining of
2398       // functions that use local objects. However, if these dead functions are
2399       // not eliminated, we don't want a compile time error. Just emit a warning
2400       // and a trap, since there should be no callable path here.
2401       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2402       B.buildUndef(DstReg);
2403       MI.eraseFromParent();
2404       return true;
2405     }
2406 
2407     // TODO: We could emit code to handle the initialization somewhere.
2408     // We ignore the initializer for now and legalize it to allow selection.
2409     // The initializer will anyway get errored out during assembly emission.
2410     const SITargetLowering *TLI = ST.getTargetLowering();
2411     if (!TLI->shouldUseLDSConstAddress(GV)) {
2412       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2414     }
2415 
2416     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2417       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. It is allocated by the
      // runtime and placed directly after the statically allocated objects;
      // all such declarations share the same offset.
2423       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2424         // Adjust alignment for that dynamic shared memory array.
2425         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2426         LLT S32 = LLT::scalar(32);
2427         auto Sz =
2428             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2429         B.buildIntToPtr(DstReg, Sz);
2430         MI.eraseFromParent();
2431         return true;
2432       }
2433     }
2434 
2435     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2436                                                    *cast<GlobalVariable>(GV)));
2437     MI.eraseFromParent();
2438     return true;
2439   }
2440 
2441   const SITargetLowering *TLI = ST.getTargetLowering();
2442 
2443   if (TLI->shouldEmitFixup(GV)) {
2444     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2445     MI.eraseFromParent();
2446     return true;
2447   }
2448 
2449   if (TLI->shouldEmitPCReloc(GV)) {
2450     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2451     MI.eraseFromParent();
2452     return true;
2453   }
2454 
2455   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2456   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2457 
2458   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2459   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2460       MachinePointerInfo::getGOT(MF),
2461       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2462           MachineMemOperand::MOInvariant,
2463       LoadTy, Align(8));
2464 
2465   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2466 
2467   if (Ty.getSizeInBits() == 32) {
2468     // Truncate if this is a 32-bit constant address.
2469     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2470     B.buildExtract(DstReg, Load, 0);
2471   } else
2472     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2473 
2474   MI.eraseFromParent();
2475   return true;
2476 }
2477 
2478 static LLT widenToNextPowerOf2(LLT Ty) {
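  // For example (illustrative): <3 x s32> widens to <4 x s32> and s48 widens
  // to s64; only the element count or the scalar width is rounded, not both.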
2479   if (Ty.isVector())
2480     return Ty.changeElementCount(
2481         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2482   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2483 }
2484 
2485 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2486                                        MachineInstr &MI) const {
2487   MachineIRBuilder &B = Helper.MIRBuilder;
2488   MachineRegisterInfo &MRI = *B.getMRI();
2489   GISelChangeObserver &Observer = Helper.Observer;
2490 
2491   Register PtrReg = MI.getOperand(1).getReg();
2492   LLT PtrTy = MRI.getType(PtrReg);
2493   unsigned AddrSpace = PtrTy.getAddressSpace();
2494 
2495   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2496     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2497     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2498     Observer.changingInstr(MI);
2499     MI.getOperand(1).setReg(Cast.getReg(0));
2500     Observer.changedInstr(MI);
2501     return true;
2502   }
2503 
2504   if (MI.getOpcode() != AMDGPU::G_LOAD)
2505     return false;
2506 
2507   Register ValReg = MI.getOperand(0).getReg();
2508   LLT ValTy = MRI.getType(ValReg);
2509 
2510   MachineMemOperand *MMO = *MI.memoperands_begin();
2511   const unsigned ValSize = ValTy.getSizeInBits();
2512   const LLT MemTy = MMO->getMemoryType();
2513   const Align MemAlign = MMO->getAlign();
2514   const unsigned MemSize = MemTy.getSizeInBits();
2515   const unsigned AlignInBits = 8 * MemAlign.value();
2516 
2517   // Widen non-power-of-2 loads to the alignment if needed
2518   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2519     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2520 
2521     // This was already the correct extending load result type, so just adjust
2522     // the memory type.
2523     if (WideMemSize == ValSize) {
2524       MachineFunction &MF = B.getMF();
2525 
2526       MachineMemOperand *WideMMO =
2527           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2528       Observer.changingInstr(MI);
2529       MI.setMemRefs(MF, {WideMMO});
2530       Observer.changedInstr(MI);
2531       return true;
2532     }
2533 
    // Don't bother handling an edge case that should probably never be
    // produced.
2535     if (ValSize > WideMemSize)
2536       return false;
2537 
2538     LLT WideTy = widenToNextPowerOf2(ValTy);
2539 
2540     Register WideLoad;
2541     if (!WideTy.isVector()) {
2542       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2543       B.buildTrunc(ValReg, WideLoad).getReg(0);
2544     } else {
2545       // Extract the subvector.
2546 
2547       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2549         // (e.g. <3 x s32> -> <4 x s32>)
2550         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2551         B.buildExtract(ValReg, WideLoad, 0);
2552       } else {
2553         // For cases where the widened type isn't a nice register value, unmerge
2554         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2555         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2556         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2557       }
2558     }
2559 
2560     MI.eraseFromParent();
2561     return true;
2562   }
2563 
2564   return false;
2565 }
2566 
2567 bool AMDGPULegalizerInfo::legalizeFMad(
2568   MachineInstr &MI, MachineRegisterInfo &MRI,
2569   MachineIRBuilder &B) const {
2570   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2571   assert(Ty.isScalar());
2572 
2573   MachineFunction &MF = B.getMF();
2574   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2575 
2576   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
2578   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2579     return true;
2580   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2581     return true;
2582 
2583   MachineIRBuilder HelperBuilder(MI);
2584   GISelObserverWrapper DummyObserver;
2585   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2586   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2587 }
2588 
2589 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2590   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2591   Register DstReg = MI.getOperand(0).getReg();
2592   Register PtrReg = MI.getOperand(1).getReg();
2593   Register CmpVal = MI.getOperand(2).getReg();
2594   Register NewVal = MI.getOperand(3).getReg();
2595 
2596   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2597          "this should not have been custom lowered");
2598 
2599   LLT ValTy = MRI.getType(CmpVal);
2600   LLT VecTy = LLT::fixed_vector(2, ValTy);
2601 
2602   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2603 
2604   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2605     .addDef(DstReg)
2606     .addUse(PtrReg)
2607     .addUse(PackedVal)
2608     .setMemRefs(MI.memoperands());
2609 
2610   MI.eraseFromParent();
2611   return true;
2612 }
2613 
2614 bool AMDGPULegalizerInfo::legalizeFlog(
2615   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
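  // This lowering uses the identity log_b(x) = log2(x) * (1 / log2(b)). The
  // callers in this file pass ln(2) for G_FLOG (1 / log2(e) == ln 2) and
  // ln(2) / ln(10) for G_FLOG10; this note summarizes those callers only.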
2616   Register Dst = MI.getOperand(0).getReg();
2617   Register Src = MI.getOperand(1).getReg();
2618   LLT Ty = B.getMRI()->getType(Dst);
2619   unsigned Flags = MI.getFlags();
2620 
2621   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2622   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2623 
2624   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2625   MI.eraseFromParent();
2626   return true;
2627 }
2628 
2629 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2630                                        MachineIRBuilder &B) const {
2631   Register Dst = MI.getOperand(0).getReg();
2632   Register Src = MI.getOperand(1).getReg();
2633   unsigned Flags = MI.getFlags();
2634   LLT Ty = B.getMRI()->getType(Dst);
2635 
2636   auto K = B.buildFConstant(Ty, numbers::log2e);
2637   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2638   B.buildFExp2(Dst, Mul, Flags);
2639   MI.eraseFromParent();
2640   return true;
2641 }
2642 
2643 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2644                                        MachineIRBuilder &B) const {
2645   Register Dst = MI.getOperand(0).getReg();
2646   Register Src0 = MI.getOperand(1).getReg();
2647   Register Src1 = MI.getOperand(2).getReg();
2648   unsigned Flags = MI.getFlags();
2649   LLT Ty = B.getMRI()->getType(Dst);
2650   const LLT S16 = LLT::scalar(16);
2651   const LLT S32 = LLT::scalar(32);
2652 
2653   if (Ty == S32) {
2654     auto Log = B.buildFLog2(S32, Src0, Flags);
2655     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2656       .addUse(Log.getReg(0))
2657       .addUse(Src1)
2658       .setMIFlags(Flags);
2659     B.buildFExp2(Dst, Mul, Flags);
2660   } else if (Ty == S16) {
2661     // There's no f16 fmul_legacy, so we need to convert for it.
2662     auto Log = B.buildFLog2(S16, Src0, Flags);
2663     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2664     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2665     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2666       .addUse(Ext0.getReg(0))
2667       .addUse(Ext1.getReg(0))
2668       .setMIFlags(Flags);
2669 
2670     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2671   } else
2672     return false;
2673 
2674   MI.eraseFromParent();
2675   return true;
2676 }
2677 
2678 // Find a source register, ignoring any possible source modifiers.
2679 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2680   Register ModSrc = OrigSrc;
2681   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2682     ModSrc = SrcFNeg->getOperand(1).getReg();
2683     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2684       ModSrc = SrcFAbs->getOperand(1).getReg();
2685   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2686     ModSrc = SrcFAbs->getOperand(1).getReg();
2687   return ModSrc;
2688 }
2689 
2690 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2691                                          MachineRegisterInfo &MRI,
2692                                          MachineIRBuilder &B) const {
2693 
2694   const LLT S1 = LLT::scalar(1);
2695   const LLT S64 = LLT::scalar(64);
2696   Register Dst = MI.getOperand(0).getReg();
2697   Register OrigSrc = MI.getOperand(1).getReg();
2698   unsigned Flags = MI.getFlags();
2699   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2700          "this should not have been custom lowered");
2701 
2702   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2703   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2704   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2705   // V_FRACT bug is:
2706   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2707   //
2708   // Convert floor(x) to (x - fract(x))
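  // Quick sanity check of the identity (illustrative): for x = -0.5,
  // fract(-0.5) = 0.5 and -0.5 - 0.5 = -1.0, which matches floor(-0.5).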
2709 
2710   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2711     .addUse(OrigSrc)
2712     .setMIFlags(Flags);
2713 
2714   // Give source modifier matching some assistance before obscuring a foldable
2715   // pattern.
2716 
2717   // TODO: We can avoid the neg on the fract? The input sign to fract
2718   // shouldn't matter?
2719   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2720 
2721   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2722 
2723   Register Min = MRI.createGenericVirtualRegister(S64);
2724 
2725   // We don't need to concern ourselves with the snan handling difference, so
2726   // use the one which will directly select.
2727   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2728   if (MFI->getMode().IEEE)
2729     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2730   else
2731     B.buildFMinNum(Min, Fract, Const, Flags);
2732 
2733   Register CorrectedFract = Min;
2734   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2735     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2736     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2737   }
2738 
2739   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2740   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 // Turn an illegal packed v2s16 build vector into bit operations.
2747 // TODO: This should probably be a bitcast action in LegalizerHelper.
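// For reference, the rewrite below looks roughly like this (illustrative MIR,
// register names made up):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)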
2748 bool AMDGPULegalizerInfo::legalizeBuildVector(
2749   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2750   Register Dst = MI.getOperand(0).getReg();
2751   const LLT S32 = LLT::scalar(32);
2752   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2753 
2754   Register Src0 = MI.getOperand(1).getReg();
2755   Register Src1 = MI.getOperand(2).getReg();
2756   assert(MRI.getType(Src0) == LLT::scalar(16));
2757 
2758   auto Merge = B.buildMerge(S32, {Src0, Src1});
2759   B.buildBitcast(Dst, Merge);
2760 
2761   MI.eraseFromParent();
2762   return true;
2763 }
2764 
2765 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2766 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2767 // case with a single min instruction instead of a compare+select.
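// For example (illustrative): G_CTLZ of an s32 zero becomes
// G_AMDGPU_FFBH_U32, which returns -1 (all ones) for a zero input, and
// umin(0xffffffff, 32) then produces the defined result of 32.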
2768 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2769                                             MachineRegisterInfo &MRI,
2770                                             MachineIRBuilder &B) const {
2771   Register Dst = MI.getOperand(0).getReg();
2772   Register Src = MI.getOperand(1).getReg();
2773   LLT DstTy = MRI.getType(Dst);
2774   LLT SrcTy = MRI.getType(Src);
2775 
2776   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2777                         ? AMDGPU::G_AMDGPU_FFBH_U32
2778                         : AMDGPU::G_AMDGPU_FFBL_B32;
2779   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2780   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2781 
2782   MI.eraseFromParent();
2783   return true;
2784 }
2785 
2786 // Check that this is a G_XOR x, -1
2787 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2788   if (MI.getOpcode() != TargetOpcode::G_XOR)
2789     return false;
2790   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2791   return ConstVal && *ConstVal == -1;
2792 }
2793 
2794 // Return the use branch instruction, otherwise null if the usage is invalid.
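// The expected shape is roughly (illustrative, block names made up):
//   %c:_(s1), ... = G_INTRINSIC ...
//   G_BRCOND %c(s1), %bb.then
//   G_BR %bb.else
// possibly with a single G_XOR %c, -1 between the def and the branch when the
// condition is negated; the trailing G_BR may be absent if the conditional
// branch simply falls through to the next block.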
2795 static MachineInstr *
2796 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2797                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2798   Register CondDef = MI.getOperand(0).getReg();
2799   if (!MRI.hasOneNonDBGUse(CondDef))
2800     return nullptr;
2801 
2802   MachineBasicBlock *Parent = MI.getParent();
2803   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2804 
2805   if (isNot(MRI, *UseMI)) {
2806     Register NegatedCond = UseMI->getOperand(0).getReg();
2807     if (!MRI.hasOneNonDBGUse(NegatedCond))
2808       return nullptr;
2809 
2810     // We're deleting the def of this value, so we need to remove it.
2811     eraseInstr(*UseMI, MRI);
2812 
2813     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2814     Negated = true;
2815   }
2816 
2817   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2818     return nullptr;
2819 
2820   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2821   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2822   if (Next == Parent->end()) {
2823     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2824     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2825       return nullptr;
2826     UncondBrTarget = &*NextMBB;
2827   } else {
2828     if (Next->getOpcode() != AMDGPU::G_BR)
2829       return nullptr;
2830     Br = &*Next;
2831     UncondBrTarget = Br->getOperand(0).getMBB();
2832   }
2833 
2834   return UseMI;
2835 }
2836 
2837 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2838                                          const ArgDescriptor *Arg,
2839                                          const TargetRegisterClass *ArgRC,
2840                                          LLT ArgTy) const {
2841   MCRegister SrcReg = Arg->getRegister();
2842   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2843   assert(DstReg.isVirtual() && "Virtual register expected");
2844 
2845   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2846                                              ArgTy);
2847   if (Arg->isMasked()) {
2848     // TODO: Should we try to emit this once in the entry block?
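    // Masked arguments pack several values into one physical register (e.g.
    // the packed workitem IDs), so extract the requested field with a shift
    // and mask.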
2849     const LLT S32 = LLT::scalar(32);
2850     const unsigned Mask = Arg->getMask();
2851     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2852 
2853     Register AndMaskSrc = LiveIn;
2854 
2855     if (Shift != 0) {
2856       auto ShiftAmt = B.buildConstant(S32, Shift);
2857       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2858     }
2859 
2860     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2861   } else {
2862     B.buildCopy(DstReg, LiveIn);
2863   }
2864 
2865   return true;
2866 }
2867 
2868 bool AMDGPULegalizerInfo::loadInputValue(
2869     Register DstReg, MachineIRBuilder &B,
2870     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2871   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2872   const ArgDescriptor *Arg;
2873   const TargetRegisterClass *ArgRC;
2874   LLT ArgTy;
2875   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2876 
2877   if (!Arg) {
2878     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2879       // The intrinsic may appear when we have a zero-sized kernarg segment, in
2880       // which case the pointer argument may be missing and we use null.
2881       B.buildConstant(DstReg, 0);
2882       return true;
2883     }
2884 
2885     // It's undefined behavior if a function marked with the amdgpu-no-*
2886     // attributes uses the corresponding intrinsic.
2887     B.buildUndef(DstReg);
2888     return true;
2889   }
2890 
2891   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2892     return false; // TODO: Handle these
2893   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2894 }
2895 
2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2897     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2898     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2899   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2900     return false;
2901 
2902   MI.eraseFromParent();
2903   return true;
2904 }
2905 
2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2907                                        MachineRegisterInfo &MRI,
2908                                        MachineIRBuilder &B) const {
2909   Register Dst = MI.getOperand(0).getReg();
2910   LLT DstTy = MRI.getType(Dst);
2911   LLT S16 = LLT::scalar(16);
2912   LLT S32 = LLT::scalar(32);
2913   LLT S64 = LLT::scalar(64);
2914 
2915   if (DstTy == S16)
2916     return legalizeFDIV16(MI, MRI, B);
2917   if (DstTy == S32)
2918     return legalizeFDIV32(MI, MRI, B);
2919   if (DstTy == S64)
2920     return legalizeFDIV64(MI, MRI, B);
2921 
2922   return false;
2923 }
2924 
2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2926                                                         Register DstDivReg,
2927                                                         Register DstRemReg,
2928                                                         Register X,
2929                                                         Register Y) const {
2930   const LLT S1 = LLT::scalar(1);
2931   const LLT S32 = LLT::scalar(32);
2932 
2933   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2934   // algorithm used here.
2935 
2936   // Initial estimate of inv(y).
2937   auto FloatY = B.buildUITOFP(S32, Y);
2938   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2939   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2940   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2941   auto Z = B.buildFPTOUI(S32, ScaledY);
2942 
2943   // One round of UNR (unsigned Newton-Raphson) refinement.
2944   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2945   auto NegYZ = B.buildMul(S32, NegY, Z);
2946   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2947 
2948   // Quotient/remainder estimate.
2949   auto Q = B.buildUMulH(S32, X, Z);
2950   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2951 
2952   // First quotient/remainder refinement.
2953   auto One = B.buildConstant(S32, 1);
2954   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2955   if (DstDivReg)
2956     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2957   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2958 
2959   // Second quotient/remainder refinement.
2960   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2961   if (DstDivReg)
2962     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2963 
2964   if (DstRemReg)
2965     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2966 }
2967 
2968 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2969 //
2970 // Return lo, hi of result
2971 //
2972 // %cvt.lo = G_UITOFP Val.lo
2973 // %cvt.hi = G_UITOFP Val.hi
2974 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2975 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2976 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2977 // %mul2 = G_FMUL %mul1, 2**(-32)
2978 // %trunc = G_INTRINSIC_TRUNC %mul2
2979 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2980 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2981 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2982                                                        Register Val) {
2983   const LLT S32 = LLT::scalar(32);
2984   auto Unmerge = B.buildUnmerge(S32, Val);
2985 
2986   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2987   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2988 
2989   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2990                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2991 
2992   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2993   auto Mul1 =
2994       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2995 
2996   // 2**(-32)
2997   auto Mul2 =
2998       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2999   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3000 
3001   // -(2**32)
3002   auto Mad2 = B.buildFMAD(S32, Trunc,
3003                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3004 
3005   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3006   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3007 
3008   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3009 }
3010 
3011 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3012                                                         Register DstDivReg,
3013                                                         Register DstRemReg,
3014                                                         Register Numer,
3015                                                         Register Denom) const {
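  // Expand 64-bit unsigned division/remainder: start from an approximate
  // reciprocal of the denominator (emitReciprocalU64), refine it with two
  // rounds of Newton-Raphson carried out on 32-bit halves, form quotient and
  // remainder estimates, and then apply up to two conditional corrections
  // (the C3/C6 selects below).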
3016   const LLT S32 = LLT::scalar(32);
3017   const LLT S64 = LLT::scalar(64);
3018   const LLT S1 = LLT::scalar(1);
3019   Register RcpLo, RcpHi;
3020 
3021   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3022 
3023   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3024 
3025   auto Zero64 = B.buildConstant(S64, 0);
3026   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3027 
3028   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3029   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3030 
3031   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3032   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3033   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3034 
3035   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3036   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3037   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3038 
3039   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3040   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3041   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3042   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3043   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3044 
3045   auto Zero32 = B.buildConstant(S32, 0);
3046   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3047   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3048   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3049 
3050   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3051   Register NumerLo = UnmergeNumer.getReg(0);
3052   Register NumerHi = UnmergeNumer.getReg(1);
3053 
3054   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3055   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3056   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3057   Register Mul3_Lo = UnmergeMul3.getReg(0);
3058   Register Mul3_Hi = UnmergeMul3.getReg(1);
3059   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3060   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3061   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3062   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3063 
3064   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3065   Register DenomLo = UnmergeDenom.getReg(0);
3066   Register DenomHi = UnmergeDenom.getReg(1);
3067 
3068   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3069   auto C1 = B.buildSExt(S32, CmpHi);
3070 
3071   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3072   auto C2 = B.buildSExt(S32, CmpLo);
3073 
3074   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3075   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3076 
3077   // TODO: Here and below, portions of the code could be enclosed in if/endif
3078   // blocks. Currently the control flow is unconditional, and we use four selects
3079   // after the would-be endif to substitute for PHIs.
3080 
3081   // if C3 != 0 ...
3082   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3083   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3084   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3085   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3086 
3087   auto One64 = B.buildConstant(S64, 1);
3088   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3089 
3090   auto C4 =
3091       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3092   auto C5 =
3093       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3094   auto C6 = B.buildSelect(
3095       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3096 
3097   // if (C6 != 0)
3098   auto Add4 = B.buildAdd(S64, Add3, One64);
3099   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3100 
3101   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3102   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3103   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3104 
3105   // endif C6
3106   // endif C3
3107 
3108   if (DstDivReg) {
3109     auto Sel1 = B.buildSelect(
3110         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3111     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3112                   Sel1, MulHi3);
3113   }
3114 
3115   if (DstRemReg) {
3116     auto Sel2 = B.buildSelect(
3117         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3118     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3119                   Sel2, Sub1);
3120   }
3121 }
3122 
3123 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3124                                                   MachineRegisterInfo &MRI,
3125                                                   MachineIRBuilder &B) const {
3126   Register DstDivReg, DstRemReg;
3127   switch (MI.getOpcode()) {
3128   default:
3129     llvm_unreachable("Unexpected opcode!");
3130   case AMDGPU::G_UDIV: {
3131     DstDivReg = MI.getOperand(0).getReg();
3132     break;
3133   }
3134   case AMDGPU::G_UREM: {
3135     DstRemReg = MI.getOperand(0).getReg();
3136     break;
3137   }
3138   case AMDGPU::G_UDIVREM: {
3139     DstDivReg = MI.getOperand(0).getReg();
3140     DstRemReg = MI.getOperand(1).getReg();
3141     break;
3142   }
3143   }
3144 
3145   const LLT S64 = LLT::scalar(64);
3146   const LLT S32 = LLT::scalar(32);
3147   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3148   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3149   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3150   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3151 
3152   if (Ty == S32)
3153     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3154   else if (Ty == S64)
3155     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3156   else
3157     return false;
3158 
3159   MI.eraseFromParent();
3160   return true;
3161 }
3162 
3163 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3164                                                 MachineRegisterInfo &MRI,
3165                                                 MachineIRBuilder &B) const {
3166   const LLT S64 = LLT::scalar(64);
3167   const LLT S32 = LLT::scalar(32);
3168 
3169   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3170   if (Ty != S32 && Ty != S64)
3171     return false;
3172 
3173   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3174   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3175   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3176 
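  // Take absolute values of both operands using abs(x) = (x + sign) ^ sign,
  // where sign is x arithmetically shifted right by (bitwidth - 1). The signs
  // are reapplied to the unsigned results below.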
3177   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3178   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3179   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3180 
3181   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3182   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3183 
3184   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3185   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3186 
3187   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3188   switch (MI.getOpcode()) {
3189   default:
3190     llvm_unreachable("Unexpected opcode!");
3191   case AMDGPU::G_SDIV: {
3192     DstDivReg = MI.getOperand(0).getReg();
3193     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3194     break;
3195   }
3196   case AMDGPU::G_SREM: {
3197     DstRemReg = MI.getOperand(0).getReg();
3198     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3199     break;
3200   }
3201   case AMDGPU::G_SDIVREM: {
3202     DstDivReg = MI.getOperand(0).getReg();
3203     DstRemReg = MI.getOperand(1).getReg();
3204     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3205     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3206     break;
3207   }
3208   }
3209 
3210   if (Ty == S32)
3211     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3212   else
3213     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3214 
3215   if (DstDivReg) {
3216     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3217     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3218     B.buildSub(DstDivReg, SignXor, Sign);
3219   }
3220 
3221   if (DstRemReg) {
3222     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3223     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3224     B.buildSub(DstRemReg, SignXor, Sign);
3225   }
3226 
3227   MI.eraseFromParent();
3228   return true;
3229 }
3230 
3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3232                                                  MachineRegisterInfo &MRI,
3233                                                  MachineIRBuilder &B) const {
3234   Register Res = MI.getOperand(0).getReg();
3235   Register LHS = MI.getOperand(1).getReg();
3236   Register RHS = MI.getOperand(2).getReg();
3237   uint16_t Flags = MI.getFlags();
3238   LLT ResTy = MRI.getType(Res);
3239 
3240   const MachineFunction &MF = B.getMF();
3241   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3242                             MI.getFlag(MachineInstr::FmAfn);
3243 
3244   if (!AllowInaccurateRcp)
3245     return false;
3246 
3247   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3248     // 1 / x -> RCP(x)
3249     if (CLHS->isExactlyValue(1.0)) {
3250       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3251         .addUse(RHS)
3252         .setMIFlags(Flags);
3253 
3254       MI.eraseFromParent();
3255       return true;
3256     }
3257 
3258     // -1 / x -> RCP( FNEG(x) )
3259     if (CLHS->isExactlyValue(-1.0)) {
3260       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3261       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3262         .addUse(FNeg.getReg(0))
3263         .setMIFlags(Flags);
3264 
3265       MI.eraseFromParent();
3266       return true;
3267     }
3268   }
3269 
3270   // x / y -> x * (1.0 / y)
3271   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3272     .addUse(RHS)
3273     .setMIFlags(Flags);
3274   B.buildFMul(Res, LHS, RCP, Flags);
3275 
3276   MI.eraseFromParent();
3277   return true;
3278 }
3279 
3280 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3281                                                    MachineRegisterInfo &MRI,
3282                                                    MachineIRBuilder &B) const {
3283   Register Res = MI.getOperand(0).getReg();
3284   Register X = MI.getOperand(1).getReg();
3285   Register Y = MI.getOperand(2).getReg();
3286   uint16_t Flags = MI.getFlags();
3287   LLT ResTy = MRI.getType(Res);
3288 
3289   const MachineFunction &MF = B.getMF();
3290   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3291                             MI.getFlag(MachineInstr::FmAfn);
3292 
3293   if (!AllowInaccurateRcp)
3294     return false;
3295 
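  // Refine the hardware reciprocal estimate with two Newton-Raphson steps
  // (r = r + r * (1 - y * r)), multiply by x, and apply one final fma to
  // correct the residual x - y * (x * r).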
3296   auto NegY = B.buildFNeg(ResTy, Y);
3297   auto One = B.buildFConstant(ResTy, 1.0);
3298 
3299   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3300     .addUse(Y)
3301     .setMIFlags(Flags);
3302 
3303   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3304   R = B.buildFMA(ResTy, Tmp0, R, R);
3305 
3306   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3307   R = B.buildFMA(ResTy, Tmp1, R, R);
3308 
3309   auto Ret = B.buildFMul(ResTy, X, R);
3310   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3311 
3312   B.buildFMA(Res, Tmp2, R, Ret);
3313   MI.eraseFromParent();
3314   return true;
3315 }
3316 
3317 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3318                                          MachineRegisterInfo &MRI,
3319                                          MachineIRBuilder &B) const {
3320   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3321     return true;
3322 
3323   Register Res = MI.getOperand(0).getReg();
3324   Register LHS = MI.getOperand(1).getReg();
3325   Register RHS = MI.getOperand(2).getReg();
3326 
3327   uint16_t Flags = MI.getFlags();
3328 
3329   LLT S16 = LLT::scalar(16);
3330   LLT S32 = LLT::scalar(32);
3331 
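  // Expand f16 fdiv by promoting to f32, multiplying by the f32 reciprocal,
  // truncating back to f16, and letting div_fixup handle the special cases.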
3332   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3333   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3334 
3335   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3336     .addUse(RHSExt.getReg(0))
3337     .setMIFlags(Flags);
3338 
3339   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3340   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3341 
3342   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3343     .addUse(RDst.getReg(0))
3344     .addUse(RHS)
3345     .addUse(LHS)
3346     .setMIFlags(Flags);
3347 
3348   MI.eraseFromParent();
3349   return true;
3350 }
3351 
3352 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3353 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3354 static void toggleSPDenormMode(bool Enable,
3355                                MachineIRBuilder &B,
3356                                const GCNSubtarget &ST,
3357                                AMDGPU::SIModeRegisterDefaults Mode) {
3358   // Set SP denorm mode to this value.
3359   unsigned SPDenormMode =
3360     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3361 
3362   if (ST.hasDenormModeInst()) {
3363     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3364     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3365 
3366     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3367     B.buildInstr(AMDGPU::S_DENORM_MODE)
3368       .addImm(NewDenormModeValue);
3369 
3370   } else {
3371     // Select FP32 bit field in mode register.
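    // This encodes hwreg(HW_REG_MODE, 4, 2): the two FP32 denormal-control
    // bits start at offset 4 in the MODE register.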
3372     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3373                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3374                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3375 
3376     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3377       .addImm(SPDenormMode)
3378       .addImm(SPDenormModeBitField);
3379   }
3380 }
3381 
3382 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3383                                          MachineRegisterInfo &MRI,
3384                                          MachineIRBuilder &B) const {
3385   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3386     return true;
3387 
3388   Register Res = MI.getOperand(0).getReg();
3389   Register LHS = MI.getOperand(1).getReg();
3390   Register RHS = MI.getOperand(2).getReg();
3391   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3392   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3393 
3394   uint16_t Flags = MI.getFlags();
3395 
3396   LLT S32 = LLT::scalar(32);
3397   LLT S1 = LLT::scalar(1);
3398 
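  // Full-rate f32 division: div_scale brings the operands into range, a
  // Newton-Raphson iteration refines the reciprocal of the scaled denominator
  // (with FP32 denormal support temporarily enabled when it is off by default,
  // so the intermediate FMAs are not flushed), and div_fmas/div_fixup undo the
  // scaling and handle the special cases.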
3399   auto One = B.buildFConstant(S32, 1.0f);
3400 
3401   auto DenominatorScaled =
3402     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3403       .addUse(LHS)
3404       .addUse(RHS)
3405       .addImm(0)
3406       .setMIFlags(Flags);
3407   auto NumeratorScaled =
3408     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3409       .addUse(LHS)
3410       .addUse(RHS)
3411       .addImm(1)
3412       .setMIFlags(Flags);
3413 
3414   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3415     .addUse(DenominatorScaled.getReg(0))
3416     .setMIFlags(Flags);
3417   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3418 
3419   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3420   // aren't modeled as reading it.
3421   if (!Mode.allFP32Denormals())
3422     toggleSPDenormMode(true, B, ST, Mode);
3423 
3424   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3425   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3426   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3427   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3428   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3429   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3430 
3431   if (!Mode.allFP32Denormals())
3432     toggleSPDenormMode(false, B, ST, Mode);
3433 
3434   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3435     .addUse(Fma4.getReg(0))
3436     .addUse(Fma1.getReg(0))
3437     .addUse(Fma3.getReg(0))
3438     .addUse(NumeratorScaled.getReg(1))
3439     .setMIFlags(Flags);
3440 
3441   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3442     .addUse(Fmas.getReg(0))
3443     .addUse(RHS)
3444     .addUse(LHS)
3445     .setMIFlags(Flags);
3446 
3447   MI.eraseFromParent();
3448   return true;
3449 }
3450 
3451 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3452                                          MachineRegisterInfo &MRI,
3453                                          MachineIRBuilder &B) const {
3454   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3455     return true;
3456 
3457   Register Res = MI.getOperand(0).getReg();
3458   Register LHS = MI.getOperand(1).getReg();
3459   Register RHS = MI.getOperand(2).getReg();
3460 
3461   uint16_t Flags = MI.getFlags();
3462 
3463   LLT S64 = LLT::scalar(64);
3464   LLT S1 = LLT::scalar(1);
3465 
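  // f64 division follows the same div_scale / Newton-Raphson / div_fmas /
  // div_fixup pattern as the f32 path above, without the denormal mode toggle.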
3466   auto One = B.buildFConstant(S64, 1.0);
3467 
3468   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3469     .addUse(LHS)
3470     .addUse(RHS)
3471     .addImm(0)
3472     .setMIFlags(Flags);
3473 
3474   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3475 
3476   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3477     .addUse(DivScale0.getReg(0))
3478     .setMIFlags(Flags);
3479 
3480   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3481   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3482   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3483 
3484   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3485     .addUse(LHS)
3486     .addUse(RHS)
3487     .addImm(1)
3488     .setMIFlags(Flags);
3489 
3490   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3491   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3492   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3493 
3494   Register Scale;
3495   if (!ST.hasUsableDivScaleConditionOutput()) {
3496     // Work around a hardware bug on SI where the condition output from
3497     // div_scale is not usable.
3498 
3499     LLT S32 = LLT::scalar(32);
3500 
3501     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3502     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3503     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3504     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3505 
3506     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3507                               Scale1Unmerge.getReg(1));
3508     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3509                               Scale0Unmerge.getReg(1));
3510     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3511   } else {
3512     Scale = DivScale1.getReg(1);
3513   }
3514 
3515   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3516     .addUse(Fma4.getReg(0))
3517     .addUse(Fma3.getReg(0))
3518     .addUse(Mul.getReg(0))
3519     .addUse(Scale)
3520     .setMIFlags(Flags);
3521 
3522   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3523     .addUse(Fmas.getReg(0))
3524     .addUse(RHS)
3525     .addUse(LHS)
3526     .setMIFlags(Flags);
3527 
3528   MI.eraseFromParent();
3529   return true;
3530 }
3531 
3532 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3533                                                  MachineRegisterInfo &MRI,
3534                                                  MachineIRBuilder &B) const {
3535   Register Res = MI.getOperand(0).getReg();
3536   Register LHS = MI.getOperand(2).getReg();
3537   Register RHS = MI.getOperand(3).getReg();
3538   uint16_t Flags = MI.getFlags();
3539 
3540   LLT S32 = LLT::scalar(32);
3541   LLT S1 = LLT::scalar(1);
3542 
3543   auto Abs = B.buildFAbs(S32, RHS, Flags);
3544   const APFloat C0Val(1.0f);
3545 
3546   auto C0 = B.buildConstant(S32, 0x6f800000);
3547   auto C1 = B.buildConstant(S32, 0x2f800000);
3548   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
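  // If |RHS| exceeds 2^96 (0x6f800000), pre-scale it by 2^-32 (0x2f800000) so
  // that its reciprocal does not land in the denormal range, and re-apply the
  // same scale to the final product; otherwise the scale is 1.0.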
3549 
3550   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3551   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3552 
3553   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3554 
3555   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3556     .addUse(Mul0.getReg(0))
3557     .setMIFlags(Flags);
3558 
3559   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3560 
3561   B.buildFMul(Res, Sel, Mul1, Flags);
3562 
3563   MI.eraseFromParent();
3564   return true;
3565 }
3566 
3567 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3568 // FIXME: Why do we handle this one but not other removed instructions?
3569 //
3570 // Reciprocal square root.  The clamp prevents infinite results, clamping
3571 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3572 // +-max_float.
3573 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3574                                                     MachineRegisterInfo &MRI,
3575                                                     MachineIRBuilder &B) const {
3576   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3577     return true;
3578 
3579   Register Dst = MI.getOperand(0).getReg();
3580   Register Src = MI.getOperand(2).getReg();
3581   auto Flags = MI.getFlags();
3582 
3583   LLT Ty = MRI.getType(Dst);
3584 
3585   const fltSemantics *FltSemantics;
3586   if (Ty == LLT::scalar(32))
3587     FltSemantics = &APFloat::IEEEsingle();
3588   else if (Ty == LLT::scalar(64))
3589     FltSemantics = &APFloat::IEEEdouble();
3590   else
3591     return false;
3592 
3593   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3594     .addUse(Src)
3595     .setMIFlags(Flags);
3596 
3597   // We don't need to concern ourselves with the snan handling difference here;
3598   // the rsq already quieted (or not), so use the variant that selects directly.
3599   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3600   const bool UseIEEE = MFI->getMode().IEEE;
3601 
3602   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3603   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3604                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3605 
3606   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3607 
3608   if (UseIEEE)
3609     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3610   else
3611     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3612   MI.eraseFromParent();
3613   return true;
3614 }
3615 
3616 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3617   switch (IID) {
3618   case Intrinsic::amdgcn_ds_fadd:
3619     return AMDGPU::G_ATOMICRMW_FADD;
3620   case Intrinsic::amdgcn_ds_fmin:
3621     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3622   case Intrinsic::amdgcn_ds_fmax:
3623     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3624   default:
3625     llvm_unreachable("not a DS FP intrinsic");
3626   }
3627 }
3628 
3629 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3630                                                       MachineInstr &MI,
3631                                                       Intrinsic::ID IID) const {
3632   GISelChangeObserver &Observer = Helper.Observer;
3633   Observer.changingInstr(MI);
3634 
3635   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3636 
3637   // The remaining operands were used to set fields in the MemOperand on
3638   // construction.
3639   for (int I = 6; I > 3; --I)
3640     MI.RemoveOperand(I);
3641 
3642   MI.RemoveOperand(1); // Remove the intrinsic ID.
3643   Observer.changedInstr(MI);
3644   return true;
3645 }
3646 
3647 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3648                                             MachineRegisterInfo &MRI,
3649                                             MachineIRBuilder &B) const {
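  // The implicit arguments are laid out in the kernarg segment after the
  // explicit kernel arguments, so offset the kernarg segment pointer by that
  // amount.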
3650   uint64_t Offset =
3651     ST.getTargetLowering()->getImplicitParameterOffset(
3652       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3653   LLT DstTy = MRI.getType(DstReg);
3654   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3655 
3656   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3657   if (!loadInputValue(KernargPtrReg, B,
3658                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3659     return false;
3660 
3661   // FIXME: This should be nuw
3662   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3663   return true;
3664 }
3665 
3666 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3667                                                  MachineRegisterInfo &MRI,
3668                                                  MachineIRBuilder &B) const {
3669   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3670   if (!MFI->isEntryFunction()) {
3671     return legalizePreloadedArgIntrin(MI, MRI, B,
3672                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3673   }
3674 
3675   Register DstReg = MI.getOperand(0).getReg();
3676   if (!getImplicitArgPtr(DstReg, MRI, B))
3677     return false;
3678 
3679   MI.eraseFromParent();
3680   return true;
3681 }
3682 
3683 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3684                                               MachineRegisterInfo &MRI,
3685                                               MachineIRBuilder &B,
3686                                               unsigned AddrSpace) const {
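  // A generic (flat) pointer lies in the given segment exactly when the high
  // 32 bits of its address equal that segment's aperture base.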
3687   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3688   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3689   Register Hi32 = Unmerge.getReg(1);
3690 
3691   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3692   MI.eraseFromParent();
3693   return true;
3694 }
3695 
3696 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3697 // offset (the offset that is included in bounds checking and swizzling, to be
3698 // split between the instruction's voffset and immoffset fields) and soffset
3699 // (the offset that is excluded from bounds checking and swizzling, to go in
3700 // the instruction's soffset field).  This function takes the first kind of
3701 // offset and figures out how to split it between voffset and immoffset.
3702 std::pair<Register, unsigned>
3703 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3704                                         Register OrigOffset) const {
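  // The buffer instructions have a 12-bit unsigned immediate offset field, so
  // the largest immediate that can be folded is 4095.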
3705   const unsigned MaxImm = 4095;
3706   Register BaseReg;
3707   unsigned ImmOffset;
3708   const LLT S32 = LLT::scalar(32);
3709   MachineRegisterInfo &MRI = *B.getMRI();
3710 
3711   std::tie(BaseReg, ImmOffset) =
3712       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3713 
3714   // If BaseReg is a pointer, convert it to int.
3715   if (MRI.getType(BaseReg).isPointer())
3716     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3717 
3718   // If the immediate value is too big for the immoffset field, put the value
3719   // and -4096 into the immoffset field so that the value that is copied/added
3720   // for the voffset field is a multiple of 4096, and it stands more chance
3721   // of being CSEd with the copy/add for another similar load/store.
3722   // However, do not do that rounding down to a multiple of 4096 if that is a
3723   // negative number, as it appears to be illegal to have a negative offset
3724   // in the vgpr, even if adding the immediate offset makes it positive.
3725   unsigned Overflow = ImmOffset & ~MaxImm;
3726   ImmOffset -= Overflow;
3727   if ((int32_t)Overflow < 0) {
3728     Overflow += ImmOffset;
3729     ImmOffset = 0;
3730   }
3731 
3732   if (Overflow != 0) {
3733     if (!BaseReg) {
3734       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3735     } else {
3736       auto OverflowVal = B.buildConstant(S32, Overflow);
3737       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3738     }
3739   }
3740 
3741   if (!BaseReg)
3742     BaseReg = B.buildConstant(S32, 0).getReg(0);
3743 
3744   return std::make_pair(BaseReg, ImmOffset);
3745 }
3746 
3747 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3748 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3749                                           Register VOffset, Register SOffset,
3750                                           unsigned ImmOffset, Register VIndex,
3751                                           MachineRegisterInfo &MRI) const {
3752   Optional<ValueAndVReg> MaybeVOffsetVal =
3753       getIConstantVRegValWithLookThrough(VOffset, MRI);
3754   Optional<ValueAndVReg> MaybeSOffsetVal =
3755       getIConstantVRegValWithLookThrough(SOffset, MRI);
3756   Optional<ValueAndVReg> MaybeVIndexVal =
3757       getIConstantVRegValWithLookThrough(VIndex, MRI);
3758   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3759   // update the MMO with that offset. The stride is unknown so we can only do
3760   // this if VIndex is constant 0.
3761   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3762       MaybeVIndexVal->Value == 0) {
3763     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3764                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3765     MMO->setOffset(TotalOffset);
3766   } else {
3767     // We don't have a constant combined offset to use in the MMO. Give up.
3768     MMO->setValue((Value *)nullptr);
3769   }
3770 }
3771 
3772 /// Handle register layout difference for f16 images for some subtargets.
3773 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3774                                              MachineRegisterInfo &MRI,
3775                                              Register Reg,
3776                                              bool ImageStore) const {
3777   const LLT S16 = LLT::scalar(16);
3778   const LLT S32 = LLT::scalar(32);
3779   LLT StoreVT = MRI.getType(Reg);
3780   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3781 
3782   if (ST.hasUnpackedD16VMem()) {
3783     auto Unmerge = B.buildUnmerge(S16, Reg);
3784 
3785     SmallVector<Register, 4> WideRegs;
3786     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3787       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3788 
3789     int NumElts = StoreVT.getNumElements();
3790 
3791     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3792         .getReg(0);
3793   }
3794 
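  // On subtargets with the d16 image store bug, pad the packed data with
  // undef elements so the store writes a wider register tuple.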
3795   if (ImageStore && ST.hasImageStoreD16Bug()) {
3796     if (StoreVT.getNumElements() == 2) {
3797       SmallVector<Register, 4> PackedRegs;
3798       Reg = B.buildBitcast(S32, Reg).getReg(0);
3799       PackedRegs.push_back(Reg);
3800       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3801       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3802           .getReg(0);
3803     }
3804 
3805     if (StoreVT.getNumElements() == 3) {
3806       SmallVector<Register, 4> PackedRegs;
3807       auto Unmerge = B.buildUnmerge(S16, Reg);
3808       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3809         PackedRegs.push_back(Unmerge.getReg(I));
3810       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3811       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3812       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3813     }
3814 
3815     if (StoreVT.getNumElements() == 4) {
3816       SmallVector<Register, 4> PackedRegs;
3817       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3818       auto Unmerge = B.buildUnmerge(S32, Reg);
3819       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3820         PackedRegs.push_back(Unmerge.getReg(I));
3821       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3822       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3823           .getReg(0);
3824     }
3825 
3826     llvm_unreachable("invalid data type");
3827   }
3828 
3829   if (StoreVT == LLT::fixed_vector(3, S16)) {
3830     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3831               .getReg(0);
3832   }
3833   return Reg;
3834 }
3835 
3836 Register AMDGPULegalizerInfo::fixStoreSourceType(
3837   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3838   MachineRegisterInfo *MRI = B.getMRI();
3839   LLT Ty = MRI->getType(VData);
3840 
3841   const LLT S16 = LLT::scalar(16);
3842 
3843   // Fix up illegal register types for 8-bit and 16-bit stores.
3844   if (Ty == LLT::scalar(8) || Ty == S16) {
3845     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3846     return AnyExt;
3847   }
3848 
3849   if (Ty.isVector()) {
3850     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3851       if (IsFormat)
3852         return handleD16VData(B, *MRI, VData);
3853     }
3854   }
3855 
3856   return VData;
3857 }
3858 
3859 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3860                                               MachineRegisterInfo &MRI,
3861                                               MachineIRBuilder &B,
3862                                               bool IsTyped,
3863                                               bool IsFormat) const {
3864   Register VData = MI.getOperand(1).getReg();
3865   LLT Ty = MRI.getType(VData);
3866   LLT EltTy = Ty.getScalarType();
3867   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3868   const LLT S32 = LLT::scalar(32);
3869 
3870   VData = fixStoreSourceType(B, VData, IsFormat);
3871   Register RSrc = MI.getOperand(2).getReg();
3872 
3873   MachineMemOperand *MMO = *MI.memoperands_begin();
3874   const int MemSize = MMO->getSize();
3875 
3876   unsigned ImmOffset;
3877 
3878   // The typed intrinsics add an immediate after the registers.
3879   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3880 
3881   // The struct intrinsic variants add one additional operand over raw.
3882   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3883   Register VIndex;
3884   int OpOffset = 0;
3885   if (HasVIndex) {
3886     VIndex = MI.getOperand(3).getReg();
3887     OpOffset = 1;
3888   } else {
3889     VIndex = B.buildConstant(S32, 0).getReg(0);
3890   }
3891 
3892   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3893   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3894 
3895   unsigned Format = 0;
3896   if (IsTyped) {
3897     Format = MI.getOperand(5 + OpOffset).getImm();
3898     ++OpOffset;
3899   }
3900 
3901   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3902 
3903   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3904   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3905 
3906   unsigned Opc;
3907   if (IsTyped) {
3908     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3909                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3910   } else if (IsFormat) {
3911     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3912                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3913   } else {
3914     switch (MemSize) {
3915     case 1:
3916       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3917       break;
3918     case 2:
3919       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3920       break;
3921     default:
3922       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3923       break;
3924     }
3925   }
3926 
3927   auto MIB = B.buildInstr(Opc)
3928     .addUse(VData)              // vdata
3929     .addUse(RSrc)               // rsrc
3930     .addUse(VIndex)             // vindex
3931     .addUse(VOffset)            // voffset
3932     .addUse(SOffset)            // soffset
3933     .addImm(ImmOffset);         // offset(imm)
3934 
3935   if (IsTyped)
3936     MIB.addImm(Format);
3937 
3938   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3939      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3940      .addMemOperand(MMO);
3941 
3942   MI.eraseFromParent();
3943   return true;
3944 }
3945 
3946 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3947                                              MachineRegisterInfo &MRI,
3948                                              MachineIRBuilder &B,
3949                                              bool IsFormat,
3950                                              bool IsTyped) const {
3951   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3952   MachineMemOperand *MMO = *MI.memoperands_begin();
3953   const LLT MemTy = MMO->getMemoryType();
3954   const LLT S32 = LLT::scalar(32);
3955 
3956   Register Dst = MI.getOperand(0).getReg();
3957   Register RSrc = MI.getOperand(2).getReg();
3958 
3959   // The typed intrinsics add an immediate after the registers.
3960   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3961 
3962   // The struct intrinsic variants add one additional operand over raw.
3963   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3964   Register VIndex;
3965   int OpOffset = 0;
3966   if (HasVIndex) {
3967     VIndex = MI.getOperand(3).getReg();
3968     OpOffset = 1;
3969   } else {
3970     VIndex = B.buildConstant(S32, 0).getReg(0);
3971   }
3972 
3973   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3974   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3975 
3976   unsigned Format = 0;
3977   if (IsTyped) {
3978     Format = MI.getOperand(5 + OpOffset).getImm();
3979     ++OpOffset;
3980   }
3981 
3982   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3983   unsigned ImmOffset;
3984 
3985   LLT Ty = MRI.getType(Dst);
3986   LLT EltTy = Ty.getScalarType();
3987   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3988   const bool Unpacked = ST.hasUnpackedD16VMem();
3989 
3990   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3991   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3992 
3993   unsigned Opc;
3994 
3995   if (IsTyped) {
3996     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3997                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3998   } else if (IsFormat) {
3999     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4000                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4001   } else {
4002     switch (MemTy.getSizeInBits()) {
4003     case 8:
4004       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4005       break;
4006     case 16:
4007       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4008       break;
4009     default:
4010       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4011       break;
4012     }
4013   }
4014 
4015   Register LoadDstReg;
4016 
4017   bool IsExtLoad =
4018       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4019   LLT UnpackedTy = Ty.changeElementSize(32);
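  // Sub-dword and scalar d16 results are loaded into a 32-bit register and
  // truncated afterwards; packed d16 vectors on unpacked subtargets load into
  // a 32-bit-per-element vector and are repacked below.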
4020 
4021   if (IsExtLoad)
4022     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4023   else if (Unpacked && IsD16 && Ty.isVector())
4024     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4025   else
4026     LoadDstReg = Dst;
4027 
4028   auto MIB = B.buildInstr(Opc)
4029     .addDef(LoadDstReg)         // vdata
4030     .addUse(RSrc)               // rsrc
4031     .addUse(VIndex)             // vindex
4032     .addUse(VOffset)            // voffset
4033     .addUse(SOffset)            // soffset
4034     .addImm(ImmOffset);         // offset(imm)
4035 
4036   if (IsTyped)
4037     MIB.addImm(Format);
4038 
4039   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4040      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4041      .addMemOperand(MMO);
4042 
4043   if (LoadDstReg != Dst) {
4044     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4045 
4046     // The result of an extending load was widened to 32 bits; truncate it back.
4047     if (IsExtLoad)
4048       B.buildTrunc(Dst, LoadDstReg);
4049     else {
4050       // Repack to original 16-bit vector result
4051       // FIXME: G_TRUNC should work, but legalization currently fails
4052       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4053       SmallVector<Register, 4> Repack;
4054       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4055         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4056       B.buildMerge(Dst, Repack);
4057     }
4058   }
4059 
4060   MI.eraseFromParent();
4061   return true;
4062 }
4063 
4064 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4065                                                MachineIRBuilder &B,
4066                                                bool IsInc) const {
4067   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4068                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4069   B.buildInstr(Opc)
4070     .addDef(MI.getOperand(0).getReg())
4071     .addUse(MI.getOperand(2).getReg())
4072     .addUse(MI.getOperand(3).getReg())
4073     .cloneMemRefs(MI);
4074   MI.eraseFromParent();
4075   return true;
4076 }
4077 
4078 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4079   switch (IntrID) {
4080   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4081   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4082     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4083   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4084   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4085     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4086   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4087   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4088     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4089   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4090   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4091     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4092   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4093   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4094     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4095   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4096   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4097     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4098   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4099   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4100     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4101   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4102   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4103     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4104   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4105   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4106     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4107   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4108   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4109     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4110   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4111   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4112     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4113   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4114   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4115     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4116   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4117   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4118     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4119   case Intrinsic::amdgcn_buffer_atomic_fadd:
4120   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4121   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4122     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4123   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4124   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4125     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4126   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4127   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4128     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4129   default:
4130     llvm_unreachable("unhandled atomic opcode");
4131   }
4132 }
4133 
4134 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4135                                                MachineIRBuilder &B,
4136                                                Intrinsic::ID IID) const {
4137   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4138                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4139   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4140 
4141   Register Dst;
4142 
4143   int OpOffset = 0;
4144   if (HasReturn) {
4145     // A few FP atomics do not support return values.
4146     Dst = MI.getOperand(0).getReg();
4147   } else {
4148     OpOffset = -1;
4149   }
4150 
4151   Register VData = MI.getOperand(2 + OpOffset).getReg();
4152   Register CmpVal;
4153 
4154   if (IsCmpSwap) {
4155     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4156     ++OpOffset;
4157   }
4158 
4159   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4160   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4161 
4162   // The struct intrinsic variants add one additional operand over raw.
4163   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4164   Register VIndex;
4165   if (HasVIndex) {
4166     VIndex = MI.getOperand(4 + OpOffset).getReg();
4167     ++OpOffset;
4168   } else {
4169     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4170   }
4171 
4172   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4173   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4174   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4175 
4176   MachineMemOperand *MMO = *MI.memoperands_begin();
4177 
4178   unsigned ImmOffset;
4179   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4180   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4181 
4182   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4183 
4184   if (HasReturn)
4185     MIB.addDef(Dst);
4186 
4187   MIB.addUse(VData); // vdata
4188 
4189   if (IsCmpSwap)
4190     MIB.addReg(CmpVal);
4191 
4192   MIB.addUse(RSrc)               // rsrc
4193      .addUse(VIndex)             // vindex
4194      .addUse(VOffset)            // voffset
4195      .addUse(SOffset)            // soffset
4196      .addImm(ImmOffset)          // offset(imm)
4197      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4198      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4199      .addMemOperand(MMO);
4200 
4201   MI.eraseFromParent();
4202   return true;
4203 }
4204 
4205 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4206 /// vector with s16 typed elements.
4207 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4208                                       SmallVectorImpl<Register> &PackedAddrs,
4209                                       unsigned ArgOffset,
4210                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4211                                       bool IsA16, bool IsG16) {
4212   const LLT S16 = LLT::scalar(16);
4213   const LLT V2S16 = LLT::fixed_vector(2, 16);
4214   auto EndIdx = Intr->VAddrEnd;
4215 
4216   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4217     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4218     if (!SrcOp.isReg())
4219       continue; // _L to _LZ may have eliminated this.
4220 
4221     Register AddrReg = SrcOp.getReg();
4222 
4223     if ((I < Intr->GradientStart) ||
4224         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4225         (I >= Intr->CoordStart && !IsA16)) {
4226       // Handle any gradient or coordinate operands that should not be packed
4227       if ((I < Intr->GradientStart) && IsA16 &&
4228           (B.getMRI()->getType(AddrReg) == S16)) {
4229         // Special handling of bias when A16 is on. The bias is of type half but
4230         // occupies a full 32-bit dword.
4231         PackedAddrs.push_back(
4232             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4233                 .getReg(0));
4234       } else {
4235         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4236         PackedAddrs.push_back(AddrReg);
4237       }
4238     } else {
4239       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4240       // derivatives dx/dh and dx/dv are packed with undef.
4241       if (((I + 1) >= EndIdx) ||
4242           ((Intr->NumGradients / 2) % 2 == 1 &&
4243            (I == static_cast<unsigned>(Intr->GradientStart +
4244                                        (Intr->NumGradients / 2) - 1) ||
4245             I == static_cast<unsigned>(Intr->GradientStart +
4246                                        Intr->NumGradients - 1))) ||
4247           // Check for _L to _LZ optimization
4248           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4249         PackedAddrs.push_back(
4250             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4251                 .getReg(0));
4252       } else {
4253         PackedAddrs.push_back(
4254             B.buildBuildVector(
4255                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4256                 .getReg(0));
4257         ++I;
4258       }
4259     }
4260   }
4261 }
4262 
4263 /// Convert from separate vaddr components to a single vector address register,
4264 /// and replace the remaining operands with $noreg.
4265 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4266                                      int DimIdx, int NumVAddrs) {
4267   const LLT S32 = LLT::scalar(32);
4268 
4269   SmallVector<Register, 8> AddrRegs;
4270   for (int I = 0; I != NumVAddrs; ++I) {
4271     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4272     if (SrcOp.isReg()) {
4273       AddrRegs.push_back(SrcOp.getReg());
4274       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4275     }
4276   }
4277 
4278   int NumAddrRegs = AddrRegs.size();
4279   if (NumAddrRegs != 1) {
4280     // Above 8 elements round up to next power of 2 (i.e. 16).
4281     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4282       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4283       auto Undef = B.buildUndef(S32);
4284       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4285       NumAddrRegs = RoundedNumRegs;
4286     }
4287 
4288     auto VAddr =
4289         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4290     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4291   }
4292 
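  // Only the first vaddr operand is kept (holding the packed vector when one
  // was built above); clear the remaining address operands to $noreg.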
4293   for (int I = 1; I != NumVAddrs; ++I) {
4294     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4295     if (SrcOp.isReg())
4296       SrcOp.setReg(AMDGPU::NoRegister);
4297   }
4298 }
4299 
4300 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4301 ///
4302 /// Depending on the subtarget, load/store with 16-bit element data need to be
4303 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4304 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4305 /// registers.
4306 ///
4307 /// We don't want to directly select image instructions just yet, but also want
4308 /// to expose all register repacking to the legalizer/combiners. We also don't
4309 /// want a selected instruction entering RegBankSelect. In order to avoid
4310 /// defining a multitude of intermediate image instructions, directly hack on
4311 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4312 /// padding now unnecessary arguments with $noreg.
4313 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4314     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4315     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4316 
4317   const unsigned NumDefs = MI.getNumExplicitDefs();
4318   const unsigned ArgOffset = NumDefs + 1;
4319   bool IsTFE = NumDefs == 2;
4320   // We are only processing the operands of d16 image operations on subtargets
4321   // that use the unpacked register layout, or need to repack the TFE result.
4322 
4323   // TODO: Do we need to guard against already legalized intrinsics?
4324   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4325       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4326 
4327   MachineRegisterInfo *MRI = B.getMRI();
4328   const LLT S32 = LLT::scalar(32);
4329   const LLT S16 = LLT::scalar(16);
4330   const LLT V2S16 = LLT::fixed_vector(2, 16);
4331 
4332   unsigned DMask = 0;
4333 
4334   // Check for 16 bit addresses and pack if true.
4335   LLT GradTy =
4336       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4337   LLT AddrTy =
4338       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4339   const bool IsG16 = GradTy == S16;
4340   const bool IsA16 = AddrTy == S16;
4341 
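  // Determine how many result/data components the dmask selects. Gather4
  // operations always return four components regardless of the dmask value.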
4342   int DMaskLanes = 0;
4343   if (!BaseOpcode->Atomic) {
4344     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4345     if (BaseOpcode->Gather4) {
4346       DMaskLanes = 4;
4347     } else if (DMask != 0) {
4348       DMaskLanes = countPopulation(DMask);
4349     } else if (!IsTFE && !BaseOpcode->Store) {
4350       // If dmask is 0, this is a no-op load. This can be eliminated.
4351       B.buildUndef(MI.getOperand(0));
4352       MI.eraseFromParent();
4353       return true;
4354     }
4355   }
4356 
4357   Observer.changingInstr(MI);
4358   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4359 
4360   unsigned NewOpcode = NumDefs == 0 ?
4361     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4362 
4363   // Track that we legalized this
4364   MI.setDesc(B.getTII().get(NewOpcode));
4365 
4366   // We expect to get an error flag since TFE is enabled and dmask is 0. Force
4367   // dmask to be at least 1, otherwise the instruction will fail.
4368   if (IsTFE && DMask == 0) {
4369     DMask = 0x1;
4370     DMaskLanes = 1;
4371     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4372   }
4373 
4374   if (BaseOpcode->Atomic) {
4375     Register VData0 = MI.getOperand(2).getReg();
4376     LLT Ty = MRI->getType(VData0);
4377 
4378     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4379     if (Ty.isVector())
4380       return false;
4381 
4382     if (BaseOpcode->AtomicX2) {
4383       Register VData1 = MI.getOperand(3).getReg();
4384       // The two values are packed in one register.
4385       LLT PackedTy = LLT::fixed_vector(2, Ty);
4386       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4387       MI.getOperand(2).setReg(Concat.getReg(0));
4388       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4389     }
4390   }
4391 
4392   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4393 
4394   // Optimize _L to _LZ when _L is zero
4395   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4396           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4397     const ConstantFP *ConstantLod;
4398 
4399     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4400                  m_GFCst(ConstantLod))) {
4401       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4402         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4403         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4404             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4405                                                      Intr->Dim);
4406 
4407         // The starting indexes should remain in the same place.
4408         --CorrectedNumVAddrs;
4409 
4410         MI.getOperand(MI.getNumExplicitDefs())
4411             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4412         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4413         Intr = NewImageDimIntr;
4414       }
4415     }
4416   }
4417 
4418   // Optimize _mip away, when 'lod' is zero
4419   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4420     int64_t ConstantLod;
4421     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4422                  m_ICst(ConstantLod))) {
4423       if (ConstantLod == 0) {
4424         // TODO: Change the intrinsic opcode and remove the operand instead of
4425         // replacing it with 0, as the _L to _LZ handling is done above.
4426         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4427         --CorrectedNumVAddrs;
4428       }
4429     }
4430   }
4431 
4432   // Rewrite the addressing register layout before doing anything else.
4433   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4434     // 16 bit gradients are supported, but are tied to the A16 control,
4435     // so both gradients and addresses must be 16 bit.
4436     return false;
4437   }
4438 
4439   if (IsA16 && !ST.hasA16()) {
4440     // A16 not supported
4441     return false;
4442   }
4443 
4444   if (IsA16 || IsG16) {
4445     if (Intr->NumVAddrs > 1) {
4446       SmallVector<Register, 4> PackedRegs;
4447 
4448       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4449                                 IsG16);
4450 
4451       // See also below in the non-a16 branch
4452       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4453                           PackedRegs.size() <= ST.getNSAMaxSize();
4454 
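      // Without the NSA encoding the address dwords must be contiguous, so
      // collapse the packed registers into a single wide vector operand.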
4455       if (!UseNSA && PackedRegs.size() > 1) {
4456         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4457         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4458         PackedRegs[0] = Concat.getReg(0);
4459         PackedRegs.resize(1);
4460       }
4461 
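      // Write the packed registers back over the original vaddr operands; any
      // leftover operand slots are cleared to $noreg.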
4462       const unsigned NumPacked = PackedRegs.size();
4463       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4464         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4465         if (!SrcOp.isReg()) {
4466           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4467           continue;
4468         }
4469 
4470         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4471 
4472         if (I - Intr->VAddrStart < NumPacked)
4473           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4474         else
4475           SrcOp.setReg(AMDGPU::NoRegister);
4476       }
4477     }
4478   } else {
4479     // If the register allocator cannot place the address registers contiguously
4480     // without introducing moves, then using the non-sequential address encoding
4481     // is always preferable, since it saves VALU instructions and is usually a
4482     // wash in terms of code size or even better.
4483     //
4484     // However, we currently have no way of hinting to the register allocator
4485     // that MIMG addresses should be placed contiguously when it is possible to
4486     // do so, so force non-NSA for the common 2-address case as a heuristic.
4487     //
4488     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4489     // allocation when possible.
4490     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4491                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4492 
4493     if (!UseNSA && Intr->NumVAddrs > 1)
4494       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4495                                Intr->NumVAddrs);
4496   }
4497 
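  // Record the A16/G16 state as a flags immediate operand (bit 0 = A16,
  // bit 1 = G16) so it survives until selection.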
4498   int Flags = 0;
4499   if (IsA16)
4500     Flags |= 1;
4501   if (IsG16)
4502     Flags |= 2;
4503   MI.addOperand(MachineOperand::CreateImm(Flags));
4504 
4505   if (BaseOpcode->Store) { // No TFE for stores?
4506     // TODO: Handle dmask trim
4507     Register VData = MI.getOperand(1).getReg();
4508     LLT Ty = MRI->getType(VData);
4509     if (!Ty.isVector() || Ty.getElementType() != S16)
4510       return true;
4511 
4512     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4513     if (RepackedReg != VData) {
4514       MI.getOperand(1).setReg(RepackedReg);
4515     }
4516 
4517     return true;
4518   }
4519 
4520   Register DstReg = MI.getOperand(0).getReg();
4521   LLT Ty = MRI->getType(DstReg);
4522   const LLT EltTy = Ty.getScalarType();
4523   const bool IsD16 = Ty.getScalarType() == S16;
4524   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4525 
4526   // Confirm that the return type is large enough for the dmask specified
4527   if (NumElts < DMaskLanes)
4528     return false;
4529 
4530   if (NumElts > 4 || DMaskLanes > 4)
4531     return false;
4532 
4533   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4534   const LLT AdjustedTy =
4535       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4536 
4537   // The raw dword aligned data component of the load. The only legal cases
4538   // where this matters should be when using the packed D16 format, for
4539   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4540   LLT RoundedTy;
4541 
4542   // S32 vector to cover all data, plus TFE result element.
4543   LLT TFETy;
4544 
4545   // Register type to use for each loaded component. Will be S32 or V2S16.
4546   LLT RegTy;
4547 
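  // With the unpacked D16 layout every 16-bit component occupies its own
  // 32-bit register; with the packed layout two 16-bit components share a
  // dword.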
4548   if (IsD16 && ST.hasUnpackedD16VMem()) {
4549     RoundedTy =
4550         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4551     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4552     RegTy = S32;
4553   } else {
4554     unsigned EltSize = EltTy.getSizeInBits();
4555     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4556     unsigned RoundedSize = 32 * RoundedElts;
4557     RoundedTy = LLT::scalarOrVector(
4558         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4559     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4560     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4561   }
4562 
4563   // The return type does not need adjustment.
4564   // TODO: Should we change s16 case to s32 or <2 x s16>?
4565   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4566     return true;
4567 
4568   Register Dst1Reg;
4569 
4570   // Insert after the instruction.
4571   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4572 
4573   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4574   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4575   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4576   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4577 
4578   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4579 
4580   MI.getOperand(0).setReg(NewResultReg);
4581 
4582   // In the IR, TFE is supposed to be used with a 2 element struct return
4583   // type. The instruction really returns these two values in one contiguous
4584   // register, with one additional dword beyond the loaded data. Rewrite the
4585   // return type to use a single register result.
4586 
4587   if (IsTFE) {
4588     Dst1Reg = MI.getOperand(1).getReg();
4589     if (MRI->getType(Dst1Reg) != S32)
4590       return false;
4591 
4592     // TODO: Make sure the TFE operand bit is set.
4593     MI.RemoveOperand(1);
4594 
4595     // Handle the easy case that requires no repack instructions.
4596     if (Ty == S32) {
4597       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4598       return true;
4599     }
4600   }
4601 
4602   // Now figure out how to copy the new result register back into the old
4603   // result.
4604   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4605 
4606   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4607 
4608   if (ResultNumRegs == 1) {
4609     assert(!IsTFE);
4610     ResultRegs[0] = NewResultReg;
4611   } else {
4612     // We have to repack into a new vector of some kind.
4613     for (int I = 0; I != NumDataRegs; ++I)
4614       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4615     B.buildUnmerge(ResultRegs, NewResultReg);
4616 
4617     // Drop the final TFE element to get the data part. The TFE result is
4618     // directly written to the right place already.
4619     if (IsTFE)
4620       ResultRegs.resize(NumDataRegs);
4621   }
4622 
4623   // For an s16 scalar result, we form an s32 result with a truncate regardless
4624   // of packed vs. unpacked.
4625   if (IsD16 && !Ty.isVector()) {
4626     B.buildTrunc(DstReg, ResultRegs[0]);
4627     return true;
4628   }
4629 
4630   // Avoid a build/concat_vector of 1 entry.
4631   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4632     B.buildBitcast(DstReg, ResultRegs[0]);
4633     return true;
4634   }
4635 
4636   assert(Ty.isVector());
4637 
4638   if (IsD16) {
4639     // For packed D16 results with TFE enabled, all the data components are
4640     // S32. Cast back to the expected type.
4641     //
4642     // TODO: We don't really need to load s32 elements. We would only need one
4643     // cast for the TFE result if a multiple of v2s16 was used.
4644     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4645       for (Register &Reg : ResultRegs)
4646         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4647     } else if (ST.hasUnpackedD16VMem()) {
4648       for (Register &Reg : ResultRegs)
4649         Reg = B.buildTrunc(S16, Reg).getReg(0);
4650     }
4651   }
4652 
4653   auto padWithUndef = [&](LLT Ty, int NumElts) {
4654     if (NumElts == 0)
4655       return;
4656     Register Undef = B.buildUndef(Ty).getReg(0);
4657     for (int I = 0; I != NumElts; ++I)
4658       ResultRegs.push_back(Undef);
4659   };
4660 
4661   // Pad out any elements eliminated due to the dmask.
4662   LLT ResTy = MRI->getType(ResultRegs[0]);
4663   if (!ResTy.isVector()) {
4664     padWithUndef(ResTy, NumElts - ResultRegs.size());
4665     B.buildBuildVector(DstReg, ResultRegs);
4666     return true;
4667   }
4668 
4669   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4670   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4671 
4672   // Deal with the one annoying legal case.
4673   const LLT V3S16 = LLT::fixed_vector(3, 16);
4674   if (Ty == V3S16) {
4675     if (IsTFE) {
4676       if (ResultRegs.size() == 1) {
4677         NewResultReg = ResultRegs[0];
4678       } else if (ResultRegs.size() == 2) {
4679         LLT V4S16 = LLT::fixed_vector(4, 16);
4680         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4681       } else {
4682         return false;
4683       }
4684     }
4685 
4686     if (MRI->getType(DstReg).getNumElements() <
4687         MRI->getType(NewResultReg).getNumElements()) {
4688       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4689     } else {
4690       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4691     }
4692     return true;
4693   }
4694 
4695   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4696   B.buildConcatVectors(DstReg, ResultRegs);
4697   return true;
4698 }
4699 
4700 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4701   LegalizerHelper &Helper, MachineInstr &MI) const {
4702   MachineIRBuilder &B = Helper.MIRBuilder;
4703   GISelChangeObserver &Observer = Helper.Observer;
4704 
4705   Register Dst = MI.getOperand(0).getReg();
4706   LLT Ty = B.getMRI()->getType(Dst);
4707   unsigned Size = Ty.getSizeInBits();
4708   MachineFunction &MF = B.getMF();
4709 
4710   Observer.changingInstr(MI);
4711 
4712   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4713     Ty = getBitcastRegisterType(Ty);
4714     Helper.bitcastDst(MI, Ty, 0);
4715     Dst = MI.getOperand(0).getReg();
4716     B.setInsertPt(B.getMBB(), MI);
4717   }
4718 
4719   // FIXME: We don't really need this intermediate instruction. The intrinsic
4720   // should be fixed to have a memory operand. Since it's readnone, we're not
4721   // allowed to add one.
4722   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4723   MI.RemoveOperand(1); // Remove intrinsic ID
4724 
4725   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4726   // TODO: Should this use datalayout alignment?
4727   const unsigned MemSize = (Size + 7) / 8;
4728   const Align MemAlign(4);
4729   MachineMemOperand *MMO = MF.getMachineMemOperand(
4730       MachinePointerInfo(),
4731       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4732           MachineMemOperand::MOInvariant,
4733       MemSize, MemAlign);
4734   MI.addMemOperand(MF, MMO);
4735 
4736   // There are no 96-bit result scalar loads, but widening to 128-bit should
4737   // always be legal. We may need to restore this to a 96-bit result if it turns
4738   // out this needs to be converted to a vector load during RegBankSelect.
4739   if (!isPowerOf2_32(Size)) {
4740     if (Ty.isVector())
4741       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4742     else
4743       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4744   }
4745 
4746   Observer.changedInstr(MI);
4747   return true;
4748 }
4749 
4750 // TODO: Move to selection
4751 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4752                                                 MachineRegisterInfo &MRI,
4753                                                 MachineIRBuilder &B) const {
4754   if (!ST.isTrapHandlerEnabled() ||
4755       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4756     return legalizeTrapEndpgm(MI, MRI, B);
4757 
4758   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4759     switch (*HsaAbiVer) {
4760     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4761     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4762       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4763     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4764       return ST.supportsGetDoorbellID() ?
4765           legalizeTrapHsa(MI, MRI, B) :
4766           legalizeTrapHsaQueuePtr(MI, MRI, B);
4767     }
4768   }
4769 
4770   llvm_unreachable("Unknown trap handler");
4771 }
4772 
4773 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4774     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4776   MI.eraseFromParent();
4777   return true;
4778 }
4779 
4780 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4781     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4782   // Pass queue pointer to trap handler as input, and insert trap instruction
4783   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4784   Register LiveIn =
4785     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4786   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4787     return false;
4788 
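  // The AMDHSA trap handler ABI expects the queue pointer in SGPR0_SGPR1,
  // passed as an implicit operand of S_TRAP.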
4789   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4790   B.buildCopy(SGPR01, LiveIn);
4791   B.buildInstr(AMDGPU::S_TRAP)
4792       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4793       .addReg(SGPR01, RegState::Implicit);
4794 
4795   MI.eraseFromParent();
4796   return true;
4797 }
4798 
4799 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4800     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4801   B.buildInstr(AMDGPU::S_TRAP)
4802       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4803   MI.eraseFromParent();
4804   return true;
4805 }
4806 
4807 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4808     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4809   // If this is a non-HSA path or the trap handler is disabled, report a
4810   // warning instead of emitting a trap.
4811   if (!ST.isTrapHandlerEnabled() ||
4812       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4813     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4814                                      "debugtrap handler not supported",
4815                                      MI.getDebugLoc(), DS_Warning);
4816     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4817     Ctx.diagnose(NoTrap);
4818   } else {
4819     // Insert debug-trap instruction
4820     B.buildInstr(AMDGPU::S_TRAP)
4821         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4822   }
4823 
4824   MI.eraseFromParent();
4825   return true;
4826 }
4827 
4828 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4829                                                MachineIRBuilder &B) const {
4830   MachineRegisterInfo &MRI = *B.getMRI();
4831   const LLT S16 = LLT::scalar(16);
4832   const LLT S32 = LLT::scalar(32);
4833 
4834   Register DstReg = MI.getOperand(0).getReg();
4835   Register NodePtr = MI.getOperand(2).getReg();
4836   Register RayExtent = MI.getOperand(3).getReg();
4837   Register RayOrigin = MI.getOperand(4).getReg();
4838   Register RayDir = MI.getOperand(5).getReg();
4839   Register RayInvDir = MI.getOperand(6).getReg();
4840   Register TDescr = MI.getOperand(7).getReg();
4841 
4842   if (!ST.hasGFX10_AEncoding()) {
4843     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4844                                         "intrinsic not supported on subtarget",
4845                                         MI.getDebugLoc());
4846     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4847     return false;
4848   }
4849 
4850   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4851   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4852   const unsigned NumVDataDwords = 4;
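  // Address dwords: node pointer (1 or 2), ray extent (1), ray origin (3),
  // and ray direction plus inverse direction (6, or 3 when packed as 16-bit).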
4853   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4854   const bool UseNSA =
4855       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4856   const unsigned BaseOpcodes[2][2] = {
4857       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4858       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4859        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4860   int Opcode;
4861   if (UseNSA) {
4862     Opcode =
4863         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4864                               NumVDataDwords, NumVAddrDwords);
4865   } else {
4866     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4867                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4868                                    PowerOf2Ceil(NumVAddrDwords));
4869   }
4870   assert(Opcode != -1);
4871 
4872   SmallVector<Register, 12> Ops;
4873   if (Is64) {
4874     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4875     Ops.push_back(Unmerge.getReg(0));
4876     Ops.push_back(Unmerge.getReg(1));
4877   } else {
4878     Ops.push_back(NodePtr);
4879   }
4880   Ops.push_back(RayExtent);
4881 
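  // Split a <3 x s32> source into scalar dwords and append them to Ops.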
4882   auto packLanes = [&Ops, &S32, &B](Register Src) {
4883     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4884     Ops.push_back(Unmerge.getReg(0));
4885     Ops.push_back(Unmerge.getReg(1));
4886     Ops.push_back(Unmerge.getReg(2));
4887   };
4888 
4889   packLanes(RayOrigin);
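  // With 16-bit addresses the six direction / inverse-direction components are
  // repacked into three dwords: (dir.x, dir.y), (dir.z, invdir.x),
  // (invdir.y, invdir.z).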
4890   if (IsA16) {
4891     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4892     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4893     Register R1 = MRI.createGenericVirtualRegister(S32);
4894     Register R2 = MRI.createGenericVirtualRegister(S32);
4895     Register R3 = MRI.createGenericVirtualRegister(S32);
4896     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4897     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4898     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4899     Ops.push_back(R1);
4900     Ops.push_back(R2);
4901     Ops.push_back(R3);
4902   } else {
4903     packLanes(RayDir);
4904     packLanes(RayInvDir);
4905   }
4906 
4907   if (!UseNSA) {
4908     // Build a single vector containing all the operands prepared so far.
4909     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4910     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4911     Ops.clear();
4912     Ops.push_back(MergedOps);
4913   }
4914 
4915   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4916     .addDef(DstReg)
4917     .addImm(Opcode);
4918 
4919   for (Register R : Ops) {
4920     MIB.addUse(R);
4921   }
4922 
4923   MIB.addUse(TDescr)
4924      .addImm(IsA16 ? 1 : 0)
4925      .cloneMemRefs(MI);
4926 
4927   MI.eraseFromParent();
4928   return true;
4929 }
4930 
4931 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4932                                             MachineInstr &MI) const {
4933   MachineIRBuilder &B = Helper.MIRBuilder;
4934   MachineRegisterInfo &MRI = *B.getMRI();
4935 
4936   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4937   auto IntrID = MI.getIntrinsicID();
4938   switch (IntrID) {
4939   case Intrinsic::amdgcn_if:
4940   case Intrinsic::amdgcn_else: {
4941     MachineInstr *Br = nullptr;
4942     MachineBasicBlock *UncondBrTarget = nullptr;
4943     bool Negated = false;
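    // Find the G_BRCOND (and any trailing unconditional G_BR) that consumes
    // this intrinsic's result so the branch can be rewritten below.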
4944     if (MachineInstr *BrCond =
4945             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4946       const SIRegisterInfo *TRI
4947         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4948 
4949       Register Def = MI.getOperand(1).getReg();
4950       Register Use = MI.getOperand(3).getReg();
4951 
4952       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4953 
4954       if (Negated)
4955         std::swap(CondBrTarget, UncondBrTarget);
4956 
4957       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4958       if (IntrID == Intrinsic::amdgcn_if) {
4959         B.buildInstr(AMDGPU::SI_IF)
4960           .addDef(Def)
4961           .addUse(Use)
4962           .addMBB(UncondBrTarget);
4963       } else {
4964         B.buildInstr(AMDGPU::SI_ELSE)
4965             .addDef(Def)
4966             .addUse(Use)
4967             .addMBB(UncondBrTarget);
4968       }
4969 
4970       if (Br) {
4971         Br->getOperand(0).setMBB(CondBrTarget);
4972       } else {
4973         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4974         // since we're swapping branch targets it needs to be reinserted.
4975         // FIXME: IRTranslator should probably not do this
4976         B.buildBr(*CondBrTarget);
4977       }
4978 
4979       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4980       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4981       MI.eraseFromParent();
4982       BrCond->eraseFromParent();
4983       return true;
4984     }
4985 
4986     return false;
4987   }
4988   case Intrinsic::amdgcn_loop: {
4989     MachineInstr *Br = nullptr;
4990     MachineBasicBlock *UncondBrTarget = nullptr;
4991     bool Negated = false;
4992     if (MachineInstr *BrCond =
4993             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4994       const SIRegisterInfo *TRI
4995         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4996 
4997       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4998       Register Reg = MI.getOperand(2).getReg();
4999 
5000       if (Negated)
5001         std::swap(CondBrTarget, UncondBrTarget);
5002 
5003       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004       B.buildInstr(AMDGPU::SI_LOOP)
5005         .addUse(Reg)
5006         .addMBB(UncondBrTarget);
5007 
5008       if (Br)
5009         Br->getOperand(0).setMBB(CondBrTarget);
5010       else
5011         B.buildBr(*CondBrTarget);
5012 
5013       MI.eraseFromParent();
5014       BrCond->eraseFromParent();
5015       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5016       return true;
5017     }
5018 
5019     return false;
5020   }
5021   case Intrinsic::amdgcn_kernarg_segment_ptr:
5022     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5023       // This only makes sense to call in a kernel, so just lower to null.
5024       B.buildConstant(MI.getOperand(0).getReg(), 0);
5025       MI.eraseFromParent();
5026       return true;
5027     }
5028 
5029     return legalizePreloadedArgIntrin(
5030       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5031   case Intrinsic::amdgcn_implicitarg_ptr:
5032     return legalizeImplicitArgPtr(MI, MRI, B);
5033   case Intrinsic::amdgcn_workitem_id_x:
5034     return legalizePreloadedArgIntrin(MI, MRI, B,
5035                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5036   case Intrinsic::amdgcn_workitem_id_y:
5037     return legalizePreloadedArgIntrin(MI, MRI, B,
5038                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5039   case Intrinsic::amdgcn_workitem_id_z:
5040     return legalizePreloadedArgIntrin(MI, MRI, B,
5041                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5042   case Intrinsic::amdgcn_workgroup_id_x:
5043     return legalizePreloadedArgIntrin(MI, MRI, B,
5044                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5045   case Intrinsic::amdgcn_workgroup_id_y:
5046     return legalizePreloadedArgIntrin(MI, MRI, B,
5047                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5048   case Intrinsic::amdgcn_workgroup_id_z:
5049     return legalizePreloadedArgIntrin(MI, MRI, B,
5050                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5051   case Intrinsic::amdgcn_dispatch_ptr:
5052     return legalizePreloadedArgIntrin(MI, MRI, B,
5053                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5054   case Intrinsic::amdgcn_queue_ptr:
5055     return legalizePreloadedArgIntrin(MI, MRI, B,
5056                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5057   case Intrinsic::amdgcn_implicit_buffer_ptr:
5058     return legalizePreloadedArgIntrin(
5059       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5060   case Intrinsic::amdgcn_dispatch_id:
5061     return legalizePreloadedArgIntrin(MI, MRI, B,
5062                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5063   case Intrinsic::amdgcn_fdiv_fast:
5064     return legalizeFDIVFastIntrin(MI, MRI, B);
5065   case Intrinsic::amdgcn_is_shared:
5066     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5067   case Intrinsic::amdgcn_is_private:
5068     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5069   case Intrinsic::amdgcn_wavefrontsize: {
5070     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5071     MI.eraseFromParent();
5072     return true;
5073   }
5074   case Intrinsic::amdgcn_s_buffer_load:
5075     return legalizeSBufferLoad(Helper, MI);
5076   case Intrinsic::amdgcn_raw_buffer_store:
5077   case Intrinsic::amdgcn_struct_buffer_store:
5078     return legalizeBufferStore(MI, MRI, B, false, false);
5079   case Intrinsic::amdgcn_raw_buffer_store_format:
5080   case Intrinsic::amdgcn_struct_buffer_store_format:
5081     return legalizeBufferStore(MI, MRI, B, false, true);
5082   case Intrinsic::amdgcn_raw_tbuffer_store:
5083   case Intrinsic::amdgcn_struct_tbuffer_store:
5084     return legalizeBufferStore(MI, MRI, B, true, true);
5085   case Intrinsic::amdgcn_raw_buffer_load:
5086   case Intrinsic::amdgcn_struct_buffer_load:
5087     return legalizeBufferLoad(MI, MRI, B, false, false);
5088   case Intrinsic::amdgcn_raw_buffer_load_format:
5089   case Intrinsic::amdgcn_struct_buffer_load_format:
5090     return legalizeBufferLoad(MI, MRI, B, true, false);
5091   case Intrinsic::amdgcn_raw_tbuffer_load:
5092   case Intrinsic::amdgcn_struct_tbuffer_load:
5093     return legalizeBufferLoad(MI, MRI, B, true, true);
5094   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5098   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5099   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5100   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5101   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5102   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5103   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5104   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5105   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5106   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5107   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5108   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5109   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5110   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5111   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5112   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5113   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5114   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5115   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5116   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5117   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5118   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5119   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5120   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5121   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5122   case Intrinsic::amdgcn_buffer_atomic_fadd:
5123   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5124   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5125   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5126   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5127     return legalizeBufferAtomic(MI, B, IntrID);
5128   case Intrinsic::amdgcn_atomic_inc:
5129     return legalizeAtomicIncDec(MI, B, true);
5130   case Intrinsic::amdgcn_atomic_dec:
5131     return legalizeAtomicIncDec(MI, B, false);
5132   case Intrinsic::trap:
5133     return legalizeTrapIntrinsic(MI, MRI, B);
5134   case Intrinsic::debugtrap:
5135     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5136   case Intrinsic::amdgcn_rsq_clamp:
5137     return legalizeRsqClampIntrinsic(MI, MRI, B);
5138   case Intrinsic::amdgcn_ds_fadd:
5139   case Intrinsic::amdgcn_ds_fmin:
5140   case Intrinsic::amdgcn_ds_fmax:
5141     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5142   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5143     return legalizeBVHIntrinsic(MI, B);
5144   default: {
5145     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5146             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5147       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5148     return true;
5149   }
5150   }
5151 
5152   return true;
5153 }
5154