1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
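// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.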
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
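// e.g. s48 -> s64, s65 -> s128.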
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
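// Retype to a vector that fits in 64 bits so a wider vector can be broken
// into 64-bit-sized pieces; e.g. <5 x s32> (160 bits) is retyped to <2 x s32>.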
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
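// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits),
//      <5 x s16> (80 bits) -> <6 x s16> (96 bits).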
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
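  // <6 x s16> -> <3 x s32>
  // <8 x s8>  -> <2 x s32>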
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
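// e.g. s64, v2s32, v4s16, v8s64 are register types; s48 and v3s16 are not.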
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
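/// e.g. a 96-bit access with 128-bit alignment may be widened to 128 bits on
/// subtargets without 96-bit (dwordx3) memory operations.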
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816     .lowerFor({{S32, S64}})
817     .lowerIf(typeIs(1, S1))
818     .customFor({{S64, S64}});
819   if (ST.has16BitInsts())
820     IToFP.legalFor({{S16, S16}});
821   IToFP.clampScalar(1, S32, S64)
822        .minScalar(0, S32)
823        .scalarize(0)
824        .widenScalarToNextPow2(1);
825 
826   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828     .customFor({{S64, S32}, {S64, S64}})
829     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830   if (ST.has16BitInsts())
831     FPToI.legalFor({{S16, S16}});
832   else
833     FPToI.minScalar(1, S32);
834 
835   FPToI.minScalar(0, S32)
836        .widenScalarToNextPow2(0, 32)
837        .scalarize(0)
838        .lower();
839 
840   // Lower roundeven into G_FRINT
841   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842     .scalarize(0)
843     .lower();
844 
845   if (ST.has16BitInsts()) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S16, S32, S64})
848       .clampScalar(0, S16, S64)
849       .scalarize(0);
850   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32, S64})
853       .clampScalar(0, S32, S64)
854       .scalarize(0);
855   } else {
856     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857       .legalFor({S32})
858       .customFor({S64})
859       .clampScalar(0, S32, S64)
860       .scalarize(0);
861   }
862 
863   getActionDefinitionsBuilder(G_PTR_ADD)
864     .legalIf(all(isPointer(0), sameSize(0, 1)))
865     .scalarize(0)
866     .scalarSameSizeAs(1, 0);
867 
868   getActionDefinitionsBuilder(G_PTRMASK)
869     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870     .scalarSameSizeAs(1, 0)
871     .scalarize(0);
872 
873   auto &CmpBuilder =
874     getActionDefinitionsBuilder(G_ICMP)
875     // The compare output type differs based on the register bank of the output,
876     // so make both s1 and s32 legal.
877     //
878     // Scalar compares producing output in scc will be promoted to s32, as that
879     // is the allocatable register type that will be needed for the copy from
880     // scc. This will be promoted during RegBankSelect, and we assume something
881     // before that won't try to use s32 result types.
882     //
883     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884     // bank.
885     .legalForCartesianProduct(
886       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887     .legalForCartesianProduct(
888       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889   if (ST.has16BitInsts()) {
890     CmpBuilder.legalFor({{S1, S16}});
891   }
892 
893   CmpBuilder
894     .widenScalarToNextPow2(1)
895     .clampScalar(1, S32, S64)
896     .scalarize(0)
897     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 
899   getActionDefinitionsBuilder(G_FCMP)
900     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0);
904 
905   // FIXME: fpow has a selection pattern that should move to custom lowering.
906   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907   if (ST.has16BitInsts())
908     Exp2Ops.legalFor({S32, S16});
909   else
910     Exp2Ops.legalFor({S32});
911   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912   Exp2Ops.scalarize(0);
913 
914   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915   if (ST.has16BitInsts())
916     ExpOps.customFor({{S32}, {S16}});
917   else
918     ExpOps.customFor({S32});
919   ExpOps.clampScalar(0, MinScalarFPTy, S32)
920         .scalarize(0);
921 
922   getActionDefinitionsBuilder(G_FPOWI)
923     .clampScalar(0, MinScalarFPTy, S32)
924     .lower();
925 
926   // The 64-bit versions produce 32-bit results, but only on the SALU.
927   getActionDefinitionsBuilder(G_CTPOP)
928     .legalFor({{S32, S32}, {S32, S64}})
929     .clampScalar(0, S32, S32)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .widenScalarToNextPow2(0, 32)
933     .widenScalarToNextPow2(1, 32);
934 
935   // The hardware instructions return a different result on 0 than the generic
936   // instructions expect. The hardware produces -1, but these produce the
937   // bitwidth.
938   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939     .scalarize(0)
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .widenScalarToNextPow2(0, 32)
943     .widenScalarToNextPow2(1, 32)
944     .lower();
945 
946   // The 64-bit versions produce 32-bit results, but only on the SALU.
947   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948     .legalFor({{S32, S32}, {S32, S64}})
949     .clampScalar(0, S32, S32)
950     .clampScalar(1, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0, 32)
953     .widenScalarToNextPow2(1, 32);
954 
955   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956   // RegBankSelect.
957   getActionDefinitionsBuilder(G_BITREVERSE)
958     .legalFor({S32, S64})
959     .clampScalar(0, S32, S64)
960     .scalarize(0)
961     .widenScalarToNextPow2(0);
962 
963   if (ST.has16BitInsts()) {
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S16, S32, V2S16})
966       .clampMaxNumElements(0, S16, 2)
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .clampScalar(0, S16, S32)
971       .scalarize(0);
972 
973     if (ST.hasVOP3PInsts()) {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975         .legalFor({S32, S16, V2S16})
976         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977         .clampMaxNumElements(0, S16, 2)
978         .minScalar(0, S16)
979         .widenScalarToNextPow2(0)
980         .scalarize(0)
981         .lower();
982     } else {
983       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984         .legalFor({S32, S16})
985         .widenScalarToNextPow2(0)
986         .minScalar(0, S16)
987         .scalarize(0)
988         .lower();
989     }
990   } else {
991     // TODO: Should have same legality without v_perm_b32
992     getActionDefinitionsBuilder(G_BSWAP)
993       .legalFor({S32})
994       .lowerIf(scalarNarrowerThan(0, 32))
995       // FIXME: Fixing non-power-of-2 before clamp is workaround for
996       // narrowScalar limitation.
997       .widenScalarToNextPow2(0)
998       .maxScalar(0, S32)
999       .scalarize(0)
1000       .lower();
1001 
1002     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003       .legalFor({S32})
1004       .minScalar(0, S32)
1005       .widenScalarToNextPow2(0)
1006       .scalarize(0)
1007       .lower();
1008   }
1009 
1010   getActionDefinitionsBuilder(G_INTTOPTR)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(1, 0),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020       })
1021     .narrowScalarIf(largerThan(1, 0),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024       });
1025 
1026   getActionDefinitionsBuilder(G_PTRTOINT)
1027     // List the common cases
1028     .legalForCartesianProduct(AddrSpaces64, {S64})
1029     .legalForCartesianProduct(AddrSpaces32, {S32})
1030     .scalarize(0)
1031     // Accept any address space as long as the size matches
1032     .legalIf(sameSize(0, 1))
1033     .widenScalarIf(smallerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       })
1037     .narrowScalarIf(
1038       largerThan(0, 1),
1039       [](const LegalityQuery &Query) {
1040         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041       });
1042 
1043   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044     .scalarize(0)
1045     .custom();
1046 
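  // Returns true if the memory access must be split: it is wider than the
  // address space allows, needs a dword count the hardware cannot issue, or
  // is under-aligned without hardware support for unaligned accesses.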
1047   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048                                     bool IsLoad) -> bool {
1049     const LLT DstTy = Query.Types[0];
1050 
1051     // Split vector extloads.
1052     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 
1055     if (MemSize < DstTy.getSizeInBits())
1056       MemSize = std::max(MemSize, AlignBits);
1057 
1058     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059       return true;
1060 
1061     const LLT PtrTy = Query.Types[1];
1062     unsigned AS = PtrTy.getAddressSpace();
1063     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064       return true;
1065 
1066     // Catch weird sized loads that don't evenly divide into the access sizes
1067     // TODO: May be able to widen depending on alignment etc.
1068     unsigned NumRegs = (MemSize + 31) / 32;
1069     if (NumRegs == 3) {
1070       if (!ST.hasDwordx3LoadStores())
1071         return true;
1072     } else {
1073       // If the alignment allows, these should have been widened.
1074       if (!isPowerOf2_32(NumRegs))
1075         return true;
1076     }
1077 
1078     if (AlignBits < MemSize) {
1079       const SITargetLowering *TLI = ST.getTargetLowering();
1080       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081                                                       Align(AlignBits / 8));
1082     }
1083 
1084     return false;
1085   };
1086 
1087   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090 
1091   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092   // LDS
1093   // TODO: Unsupported flat for SI.
1094 
1095   for (unsigned Op : {G_LOAD, G_STORE}) {
1096     const bool IsStore = Op == G_STORE;
1097 
1098     auto &Actions = getActionDefinitionsBuilder(Op);
1099     // Explicitly list some common cases.
1100     // TODO: Does this help compile time at all?
1101     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104                                       {S64, GlobalPtr, S64, GlobalAlign32},
1105                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107                                       {S32, GlobalPtr, S8, GlobalAlign8},
1108                                       {S32, GlobalPtr, S16, GlobalAlign16},
1109 
1110                                       {S32, LocalPtr, S32, 32},
1111                                       {S64, LocalPtr, S64, 32},
1112                                       {V2S32, LocalPtr, V2S32, 32},
1113                                       {S32, LocalPtr, S8, 8},
1114                                       {S32, LocalPtr, S16, 16},
1115                                       {V2S16, LocalPtr, S32, 32},
1116 
1117                                       {S32, PrivatePtr, S32, 32},
1118                                       {S32, PrivatePtr, S8, 8},
1119                                       {S32, PrivatePtr, S16, 16},
1120                                       {V2S16, PrivatePtr, S32, 32},
1121 
1122                                       {S32, ConstantPtr, S32, GlobalAlign32},
1123                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125                                       {S64, ConstantPtr, S64, GlobalAlign32},
1126                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127     Actions.legalIf(
1128       [=](const LegalityQuery &Query) -> bool {
1129         return isLoadStoreLegal(ST, Query);
1130       });
1131 
1132     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133     // 64-bits.
1134     //
1135     // TODO: Should generalize bitcast action into coerce, which will also cover
1136     // inserting addrspacecasts.
1137     Actions.customIf(typeIs(1, Constant32Ptr));
1138 
1139     // Turn any illegal element vectors into something easier to deal
1140     // with. These will ultimately produce 32-bit scalar shifts to extract the
1141     // parts anyway.
1142     //
1143     // For odd 16-bit element vectors, prefer to split those into pieces with
1144     // 16-bit vector parts.
1145     Actions.bitcastIf(
1146       [=](const LegalityQuery &Query) -> bool {
1147         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148                                           Query.MMODescrs[0].MemoryTy);
1149       }, bitcastToRegisterType(0));
1150 
1151     if (!IsStore) {
1152       // Widen suitably aligned loads by loading extra bytes. The standard
1153       // legalization actions can't properly express widening memory operands.
1154       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155         return shouldWidenLoad(ST, Query, G_LOAD);
1156       });
1157     }
1158 
1159     // FIXME: load/store narrowing should be moved to lower action
1160     Actions
1161         .narrowScalarIf(
1162             [=](const LegalityQuery &Query) -> bool {
1163               return !Query.Types[0].isVector() &&
1164                      needToSplitMemOp(Query, Op == G_LOAD);
1165             },
1166             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167               const LLT DstTy = Query.Types[0];
1168               const LLT PtrTy = Query.Types[1];
1169 
1170               const unsigned DstSize = DstTy.getSizeInBits();
1171               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 
1173               // Split extloads.
1174               if (DstSize > MemSize)
1175                 return std::make_pair(0, LLT::scalar(MemSize));
1176 
1177               if (!isPowerOf2_32(DstSize)) {
1178                 // We're probably decomposing an odd sized store. Try to split
1179                 // to the widest type. TODO: Account for alignment. As-is it
1180                 // should be OK, since the new parts will be further legalized.
1181                 unsigned FloorSize = PowerOf2Floor(DstSize);
1182                 return std::make_pair(0, LLT::scalar(FloorSize));
1183               }
1184 
1185               if (DstSize > 32 && (DstSize % 32 != 0)) {
1186                 // FIXME: Need a way to specify non-extload of larger size if
1187                 // suitably aligned.
1188                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189               }
1190 
1191               unsigned MaxSize = maxSizeForAddrSpace(ST,
1192                                                      PtrTy.getAddressSpace(),
1193                                                      Op == G_LOAD);
1194               if (MemSize > MaxSize)
1195                 return std::make_pair(0, LLT::scalar(MaxSize));
1196 
1197               unsigned Align = Query.MMODescrs[0].AlignInBits;
1198               return std::make_pair(0, LLT::scalar(Align));
1199             })
1200         .fewerElementsIf(
1201             [=](const LegalityQuery &Query) -> bool {
1202               return Query.Types[0].isVector() &&
1203                      needToSplitMemOp(Query, Op == G_LOAD);
1204             },
1205             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206               const LLT DstTy = Query.Types[0];
1207               const LLT PtrTy = Query.Types[1];
1208 
1209               LLT EltTy = DstTy.getElementType();
1210               unsigned MaxSize = maxSizeForAddrSpace(ST,
1211                                                      PtrTy.getAddressSpace(),
1212                                                      Op == G_LOAD);
1213 
1214               // FIXME: Handle widened to power of 2 results better. This ends
1215               // up scalarizing.
1216               // FIXME: 3 element stores scalarized on SI
1217 
1218               // Split if it's too large for the address space.
1219               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220               if (MemSize > MaxSize) {
1221                 unsigned NumElts = DstTy.getNumElements();
1222                 unsigned EltSize = EltTy.getSizeInBits();
1223 
1224                 if (MaxSize % EltSize == 0) {
1225                   return std::make_pair(
1226                       0, LLT::scalarOrVector(
1227                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228                 }
1229 
1230                 unsigned NumPieces = MemSize / MaxSize;
1231 
1232                 // FIXME: Refine when odd breakdowns handled
1233                 // The scalars will need to be re-legalized.
1234                 if (NumPieces == 1 || NumPieces >= NumElts ||
1235                     NumElts % NumPieces != 0)
1236                   return std::make_pair(0, EltTy);
1237 
1238                 return std::make_pair(
1239                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240               }
1241 
1242               // FIXME: We could probably handle weird extending loads better.
1243               if (DstTy.getSizeInBits() > MemSize)
1244                 return std::make_pair(0, EltTy);
1245 
1246               unsigned EltSize = EltTy.getSizeInBits();
1247               unsigned DstSize = DstTy.getSizeInBits();
1248               if (!isPowerOf2_32(DstSize)) {
1249                 // We're probably decomposing an odd sized store. Try to split
1250                 // to the widest type. TODO: Account for alignment. As-is it
1251                 // should be OK, since the new parts will be further legalized.
1252                 unsigned FloorSize = PowerOf2Floor(DstSize);
1253                 return std::make_pair(
1254                     0, LLT::scalarOrVector(
1255                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256               }
1257 
1258               // Need to split because of alignment.
1259               unsigned Align = Query.MMODescrs[0].AlignInBits;
1260               if (EltSize > Align &&
1261                   (EltSize / Align < DstTy.getNumElements())) {
1262                 return std::make_pair(
1263                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1264               }
1265 
1266               // May need relegalization for the scalars.
1267               return std::make_pair(0, EltTy);
1268             })
1269     .minScalar(0, S32)
1270     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1271     .widenScalarToNextPow2(0)
1272     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1273     .lower();
1274   }
1275 
1276   // FIXME: Unaligned accesses not lowered.
1277   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1278                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1279                                                   {S32, GlobalPtr, S16, 2 * 8},
1280                                                   {S32, LocalPtr, S8, 8},
1281                                                   {S32, LocalPtr, S16, 16},
1282                                                   {S32, PrivatePtr, S8, 8},
1283                                                   {S32, PrivatePtr, S16, 16},
1284                                                   {S32, ConstantPtr, S8, 8},
1285                                                   {S32, ConstantPtr, S16, 2 * 8}})
1286                        .legalIf(
1287                          [=](const LegalityQuery &Query) -> bool {
1288                            return isLoadStoreLegal(ST, Query);
1289                          });
1290 
1291   if (ST.hasFlatAddressSpace()) {
1292     ExtLoads.legalForTypesWithMemDesc(
1293         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1294   }
1295 
1296   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1297   // 64-bits.
1298   //
1299   // TODO: Should generalize bitcast action into coerce, which will also cover
1300   // inserting addrspacecasts.
1301   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1302 
1303   ExtLoads.clampScalar(0, S32, S32)
1304           .widenScalarToNextPow2(0)
1305           .lower();
1306 
1307   auto &Atomics = getActionDefinitionsBuilder(
1308     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1309      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1310      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1311      G_ATOMICRMW_UMIN})
1312     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1313                {S64, GlobalPtr}, {S64, LocalPtr},
1314                {S32, RegionPtr}, {S64, RegionPtr}});
1315   if (ST.hasFlatAddressSpace()) {
1316     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1317   }
1318 
1319   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1320   if (ST.hasLDSFPAtomics()) {
1321     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1322     if (ST.hasGFX90AInsts())
1323       Atomic.legalFor({{S64, LocalPtr}});
1324   }
1325   if (ST.hasAtomicFaddInsts())
1326     Atomic.legalFor({{S32, GlobalPtr}});
1327 
1328   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1329   // demarshalling
1330   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1331     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1332                 {S32, FlatPtr}, {S64, FlatPtr}})
1333     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1334                {S32, RegionPtr}, {S64, RegionPtr}});
1335   // TODO: Pointer types, any 32-bit or 64-bit vector
1336 
1337   // Condition should be s32 for scalar, s1 for vector.
1338   getActionDefinitionsBuilder(G_SELECT)
1339       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1340                                  LocalPtr, FlatPtr, PrivatePtr,
1341                                  LLT::fixed_vector(2, LocalPtr),
1342                                  LLT::fixed_vector(2, PrivatePtr)},
1343                                 {S1, S32})
1344       .clampScalar(0, S16, S64)
1345       .scalarize(1)
1346       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1347       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1348       .clampMaxNumElements(0, S32, 2)
1349       .clampMaxNumElements(0, LocalPtr, 2)
1350       .clampMaxNumElements(0, PrivatePtr, 2)
1351       .scalarize(0)
1352       .widenScalarToNextPow2(0)
1353       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1354 
1355   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1356   // be more flexible with the shift amount type.
1357   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1358     .legalFor({{S32, S32}, {S64, S32}});
1359   if (ST.has16BitInsts()) {
1360     if (ST.hasVOP3PInsts()) {
1361       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1362             .clampMaxNumElements(0, S16, 2);
1363     } else
1364       Shifts.legalFor({{S16, S16}});
1365 
1366     // TODO: Support 16-bit shift amounts for all types
1367     Shifts.widenScalarIf(
1368       [=](const LegalityQuery &Query) {
1369         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1370         // 32-bit amount.
1371         const LLT ValTy = Query.Types[0];
1372         const LLT AmountTy = Query.Types[1];
1373         return ValTy.getSizeInBits() <= 16 &&
1374                AmountTy.getSizeInBits() < 16;
1375       }, changeTo(1, S16));
1376     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1377     Shifts.clampScalar(1, S32, S32);
1378     Shifts.clampScalar(0, S16, S64);
1379     Shifts.widenScalarToNextPow2(0, 16);
1380 
1381     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1382       .minScalar(0, S16)
1383       .scalarize(0)
1384       .lower();
1385   } else {
1386     // Make sure we legalize the shift amount type first, as the general
1387     // expansion for the shifted type will produce much worse code if it hasn't
1388     // been truncated already.
1389     Shifts.clampScalar(1, S32, S32);
1390     Shifts.clampScalar(0, S32, S64);
1391     Shifts.widenScalarToNextPow2(0, 32);
1392 
1393     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1394       .minScalar(0, S32)
1395       .scalarize(0)
1396       .lower();
1397   }
1398   Shifts.scalarize(0);
1399 
1400   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1401     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1402     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1403     unsigned IdxTypeIdx = 2;
1404 
1405     getActionDefinitionsBuilder(Op)
1406       .customIf([=](const LegalityQuery &Query) {
1407           const LLT EltTy = Query.Types[EltTypeIdx];
1408           const LLT VecTy = Query.Types[VecTypeIdx];
1409           const LLT IdxTy = Query.Types[IdxTypeIdx];
1410           const unsigned EltSize = EltTy.getSizeInBits();
1411           return (EltSize == 32 || EltSize == 64) &&
1412                   VecTy.getSizeInBits() % 32 == 0 &&
1413                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1414                   IdxTy.getSizeInBits() == 32;
1415         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1418       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1419       .bitcastIf(
1420         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1421         [=](const LegalityQuery &Query) {
1422           // For > 64-bit element types, try to turn this into a 64-bit
1423           // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32-bit elements.
1425           const LLT EltTy = Query.Types[EltTypeIdx];
1426           const LLT VecTy = Query.Types[VecTypeIdx];
1427           const unsigned DstEltSize = EltTy.getSizeInBits();
1428           const unsigned VecSize = VecTy.getSizeInBits();
1429 
1430           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1431           return std::make_pair(
1432               VecTypeIdx,
1433               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1434         })
1435       .clampScalar(EltTypeIdx, S32, S64)
1436       .clampScalar(VecTypeIdx, S32, S64)
1437       .clampScalar(IdxTypeIdx, S32, S32)
1438       .clampMaxNumElements(VecTypeIdx, S32, 32)
1439       // TODO: Clamp elements for 64-bit vectors?
1440       // It should only be necessary with variable indexes.
1441       // As a last resort, lower to the stack
1442       .lower();
1443   }
1444 
1445   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1446     .unsupportedIf([=](const LegalityQuery &Query) {
1447         const LLT &EltTy = Query.Types[1].getElementType();
1448         return Query.Types[0] != EltTy;
1449       });
1450 
1451   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1452     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1453     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1454 
1455     // FIXME: Doesn't handle extract of illegal sizes.
1456     getActionDefinitionsBuilder(Op)
1457       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1458       // FIXME: Multiples of 16 should not be legal.
1459       .legalIf([=](const LegalityQuery &Query) {
1460           const LLT BigTy = Query.Types[BigTyIdx];
1461           const LLT LitTy = Query.Types[LitTyIdx];
1462           return (BigTy.getSizeInBits() % 32 == 0) &&
1463                  (LitTy.getSizeInBits() % 16 == 0);
1464         })
1465       .widenScalarIf(
1466         [=](const LegalityQuery &Query) {
1467           const LLT BigTy = Query.Types[BigTyIdx];
1468           return (BigTy.getScalarSizeInBits() < 16);
1469         },
1470         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1471       .widenScalarIf(
1472         [=](const LegalityQuery &Query) {
1473           const LLT LitTy = Query.Types[LitTyIdx];
1474           return (LitTy.getScalarSizeInBits() < 16);
1475         },
1476         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1477       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1478       .widenScalarToNextPow2(BigTyIdx, 32);
1479 
1480   }
1481 
1482   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1483     .legalForCartesianProduct(AllS32Vectors, {S32})
1484     .legalForCartesianProduct(AllS64Vectors, {S64})
1485     .clampNumElements(0, V16S32, V32S32)
1486     .clampNumElements(0, V2S64, V16S64)
1487     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1488 
1489   if (ST.hasScalarPackInsts()) {
1490     BuildVector
1491       // FIXME: Should probably widen s1 vectors straight to s32
1492       .minScalarOrElt(0, S16)
1493       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1494       .minScalar(1, S32);
1495 
1496     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1497       .legalFor({V2S16, S32})
1498       .lower();
1499     BuildVector.minScalarOrElt(0, S32);
1500   } else {
1501     BuildVector.customFor({V2S16, S16});
1502     BuildVector.minScalarOrElt(0, S32);
1503 
1504     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1505       .customFor({V2S16, S32})
1506       .lower();
1507   }
1508 
1509   BuildVector.legalIf(isRegisterType(0));
1510 
1511   // FIXME: Clamp maximum size
1512   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1513     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1514     .clampMaxNumElements(0, S32, 32)
1515     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1516     .clampMaxNumElements(0, S16, 64);
1517 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1519   // pre-legalize.
1520   if (ST.hasVOP3PInsts()) {
1521     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1522       .customFor({V2S16, V2S16})
1523       .lower();
1524   } else
1525     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1526 
1527   // Merge/Unmerge
1528   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1529     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1530     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1531 
1532     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1533       const LLT Ty = Query.Types[TypeIdx];
1534       if (Ty.isVector()) {
1535         const LLT &EltTy = Ty.getElementType();
1536         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1537           return true;
1538         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1539           return true;
1540       }
1541       return false;
1542     };
1543 
1544     auto &Builder = getActionDefinitionsBuilder(Op)
1545       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1546       .lowerFor({{S16, V2S16}})
1547       .lowerIf([=](const LegalityQuery &Query) {
1548           const LLT BigTy = Query.Types[BigTyIdx];
1549           return BigTy.getSizeInBits() == 32;
1550         })
1551       // Try to widen to s16 first for small types.
1552       // TODO: Only do this on targets with legal s16 shifts
1553       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1554       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1555       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1556       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1557                            elementTypeIs(1, S16)),
1558                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1562       .clampScalar(LitTyIdx, S32, S512)
1563       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1564       // Break up vectors with weird elements into scalars
1565       .fewerElementsIf(
1566         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1567         scalarize(0))
1568       .fewerElementsIf(
1569         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1570         scalarize(1))
1571       .clampScalar(BigTyIdx, S32, MaxScalar);
1572 
1573     if (Op == G_MERGE_VALUES) {
1574       Builder.widenScalarIf(
1575         // TODO: Use 16-bit shifts if legal for 8-bit values?
1576         [=](const LegalityQuery &Query) {
1577           const LLT Ty = Query.Types[LitTyIdx];
1578           return Ty.getSizeInBits() < 32;
1579         },
1580         changeTo(LitTyIdx, S32));
1581     }
1582 
1583     Builder.widenScalarIf(
1584       [=](const LegalityQuery &Query) {
1585         const LLT Ty = Query.Types[BigTyIdx];
1586         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1587           Ty.getSizeInBits() % 16 != 0;
1588       },
1589       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
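        // Illustrative examples (assuming the condition above matched): an
        // s65 type widens to the next power of 2, s128; an s300 type would
        // round up to s512, but since that is >= 256 the multiple-of-64
        // size s320 is smaller and is used instead.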
1592         const LLT &Ty = Query.Types[BigTyIdx];
1593         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1594         if (NewSizeInBits >= 256) {
1595           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1596           if (RoundedTo < NewSizeInBits)
1597             NewSizeInBits = RoundedTo;
1598         }
1599         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1600       })
1601       // Any vectors left are the wrong size. Scalarize them.
1602       .scalarize(0)
1603       .scalarize(1);
1604   }
1605 
1606   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1607   // RegBankSelect.
1608   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1609     .legalFor({{S32}, {S64}});
1610 
1611   if (ST.hasVOP3PInsts()) {
1612     SextInReg.lowerFor({{V2S16}})
1613       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1614       // get more vector shift opportunities, since we'll get those when
1615       // expanded.
1616       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1617   } else if (ST.has16BitInsts()) {
1618     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1619   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1622     SextInReg.lowerFor({{S32}, {S64}});
1623   }
1624 
1625   SextInReg
1626     .scalarize(0)
1627     .clampScalar(0, S32, S64)
1628     .lower();
1629 
  // TODO: Only try to form v2s16 with legal packed instructions.
1631   getActionDefinitionsBuilder(G_FSHR)
1632     .legalFor({{S32, S32}})
1633     .lowerFor({{V2S16, V2S16}})
1634     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1635     .scalarize(0)
1636     .lower();
1637 
1638   if (ST.hasVOP3PInsts()) {
1639     getActionDefinitionsBuilder(G_FSHL)
1640       .lowerFor({{V2S16, V2S16}})
1641       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1642       .scalarize(0)
1643       .lower();
1644   } else {
1645     getActionDefinitionsBuilder(G_FSHL)
1646       .scalarize(0)
1647       .lower();
1648   }
1649 
1650   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1651     .legalFor({S64});
1652 
1653   getActionDefinitionsBuilder(G_FENCE)
1654     .alwaysLegal();
1655 
1656   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1657       .scalarize(0)
1658       .minScalar(0, S32)
1659       .lower();
1660 
1661   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1662       .legalFor({{S32, S32}, {S64, S32}})
1663       .clampScalar(1, S32, S32)
1664       .clampScalar(0, S32, S64)
1665       .widenScalarToNextPow2(0)
1666       .scalarize(0);
1667 
1668   getActionDefinitionsBuilder({
1669       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1670       G_FCOPYSIGN,
1671 
1672       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1673       G_ATOMICRMW_NAND,
1674       G_ATOMICRMW_FSUB,
1675       G_READ_REGISTER,
1676       G_WRITE_REGISTER,
1677 
1678       G_SADDO, G_SSUBO,
1679 
1680        // TODO: Implement
1681       G_FMINIMUM, G_FMAXIMUM}).lower();
1682 
1683   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1684         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1685         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1686     .unsupported();
1687 
1688   getLegacyLegalizerInfo().computeTables();
1689   verify(*ST.getInstrInfo());
1690 }
1691 
1692 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1693                                          MachineInstr &MI) const {
1694   MachineIRBuilder &B = Helper.MIRBuilder;
1695   MachineRegisterInfo &MRI = *B.getMRI();
1696 
1697   switch (MI.getOpcode()) {
1698   case TargetOpcode::G_ADDRSPACE_CAST:
1699     return legalizeAddrSpaceCast(MI, MRI, B);
1700   case TargetOpcode::G_FRINT:
1701     return legalizeFrint(MI, MRI, B);
1702   case TargetOpcode::G_FCEIL:
1703     return legalizeFceil(MI, MRI, B);
1704   case TargetOpcode::G_FREM:
1705     return legalizeFrem(MI, MRI, B);
1706   case TargetOpcode::G_INTRINSIC_TRUNC:
1707     return legalizeIntrinsicTrunc(MI, MRI, B);
1708   case TargetOpcode::G_SITOFP:
1709     return legalizeITOFP(MI, MRI, B, true);
1710   case TargetOpcode::G_UITOFP:
1711     return legalizeITOFP(MI, MRI, B, false);
1712   case TargetOpcode::G_FPTOSI:
1713     return legalizeFPTOI(MI, MRI, B, true);
1714   case TargetOpcode::G_FPTOUI:
1715     return legalizeFPTOI(MI, MRI, B, false);
1716   case TargetOpcode::G_FMINNUM:
1717   case TargetOpcode::G_FMAXNUM:
1718   case TargetOpcode::G_FMINNUM_IEEE:
1719   case TargetOpcode::G_FMAXNUM_IEEE:
1720     return legalizeMinNumMaxNum(Helper, MI);
1721   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1722     return legalizeExtractVectorElt(MI, MRI, B);
1723   case TargetOpcode::G_INSERT_VECTOR_ELT:
1724     return legalizeInsertVectorElt(MI, MRI, B);
1725   case TargetOpcode::G_SHUFFLE_VECTOR:
1726     return legalizeShuffleVector(MI, MRI, B);
1727   case TargetOpcode::G_FSIN:
1728   case TargetOpcode::G_FCOS:
1729     return legalizeSinCos(MI, MRI, B);
1730   case TargetOpcode::G_GLOBAL_VALUE:
1731     return legalizeGlobalValue(MI, MRI, B);
1732   case TargetOpcode::G_LOAD:
1733   case TargetOpcode::G_SEXTLOAD:
1734   case TargetOpcode::G_ZEXTLOAD:
1735     return legalizeLoad(Helper, MI);
1736   case TargetOpcode::G_FMAD:
1737     return legalizeFMad(MI, MRI, B);
1738   case TargetOpcode::G_FDIV:
1739     return legalizeFDIV(MI, MRI, B);
1740   case TargetOpcode::G_UDIV:
1741   case TargetOpcode::G_UREM:
1742   case TargetOpcode::G_UDIVREM:
1743     return legalizeUnsignedDIV_REM(MI, MRI, B);
1744   case TargetOpcode::G_SDIV:
1745   case TargetOpcode::G_SREM:
1746   case TargetOpcode::G_SDIVREM:
1747     return legalizeSignedDIV_REM(MI, MRI, B);
1748   case TargetOpcode::G_ATOMIC_CMPXCHG:
1749     return legalizeAtomicCmpXChg(MI, MRI, B);
1750   case TargetOpcode::G_FLOG:
1751     return legalizeFlog(MI, B, numbers::ln2f);
1752   case TargetOpcode::G_FLOG10:
1753     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1754   case TargetOpcode::G_FEXP:
1755     return legalizeFExp(MI, B);
1756   case TargetOpcode::G_FPOW:
1757     return legalizeFPow(MI, B);
1758   case TargetOpcode::G_FFLOOR:
1759     return legalizeFFloor(MI, MRI, B);
1760   case TargetOpcode::G_BUILD_VECTOR:
1761     return legalizeBuildVector(MI, MRI, B);
1762   default:
1763     return false;
1764   }
1765 
1766   llvm_unreachable("expected switch to return");
1767 }
1768 
1769 Register AMDGPULegalizerInfo::getSegmentAperture(
1770   unsigned AS,
1771   MachineRegisterInfo &MRI,
1772   MachineIRBuilder &B) const {
1773   MachineFunction &MF = B.getMF();
1774   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1775   const LLT S32 = LLT::scalar(32);
1776 
1777   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1778 
1779   if (ST.hasApertureRegs()) {
1780     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1781     // getreg.
1782     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1783         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1784         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1785     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1786         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1788     unsigned Encoding =
1789         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1790         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1791         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1792 
1793     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1794 
1795     B.buildInstr(AMDGPU::S_GETREG_B32)
1796       .addDef(GetReg)
1797       .addImm(Encoding);
1798     MRI.setType(GetReg, S32);
1799 
1800     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1801     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1802   }
1803 
1804   Register QueuePtr = MRI.createGenericVirtualRegister(
1805     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1806 
1807   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1808     return Register();
1809 
1810   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1811   // private_segment_aperture_base_hi.
1812   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1813 
1814   // TODO: can we be smarter about machine pointer info?
1815   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1816   MachineMemOperand *MMO = MF.getMachineMemOperand(
1817       PtrInfo,
1818       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1819           MachineMemOperand::MOInvariant,
1820       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1821 
1822   Register LoadAddr;
1823 
1824   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1825   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1826 }
1827 
1828 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1829   MachineInstr &MI, MachineRegisterInfo &MRI,
1830   MachineIRBuilder &B) const {
1831   MachineFunction &MF = B.getMF();
1832 
1833   const LLT S32 = LLT::scalar(32);
1834   Register Dst = MI.getOperand(0).getReg();
1835   Register Src = MI.getOperand(1).getReg();
1836 
1837   LLT DstTy = MRI.getType(Dst);
1838   LLT SrcTy = MRI.getType(Src);
1839   unsigned DestAS = DstTy.getAddressSpace();
1840   unsigned SrcAS = SrcTy.getAddressSpace();
1841 
1842   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1843   // vector element.
1844   assert(!DstTy.isVector());
1845 
1846   const AMDGPUTargetMachine &TM
1847     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1848 
1849   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1850     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1851     return true;
1852   }
1853 
1854   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1855     // Truncate.
1856     B.buildExtract(Dst, Src, 0);
1857     MI.eraseFromParent();
1858     return true;
1859   }
1860 
1861   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1862     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1863     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1864 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1868     auto HighAddr = B.buildConstant(
1869       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1870     B.buildMerge(Dst, {Src, HighAddr});
1871     MI.eraseFromParent();
1872     return true;
1873   }
1874 
1875   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1876     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1877            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1878     unsigned NullVal = TM.getNullPointerValue(DestAS);
1879 
1880     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1881     auto FlatNull = B.buildConstant(SrcTy, 0);
1882 
1883     // Extract low 32-bits of the pointer.
1884     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1885 
1886     auto CmpRes =
1887         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1888     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1889 
1890     MI.eraseFromParent();
1891     return true;
1892   }
1893 
1894   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1895     return false;
1896 
1897   if (!ST.hasFlatAddressSpace())
1898     return false;
1899 
1900   auto SegmentNull =
1901       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1902   auto FlatNull =
1903       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1904 
1905   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1906   if (!ApertureReg.isValid())
1907     return false;
1908 
1909   auto CmpRes =
1910       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1911 
1912   // Coerce the type of the low half of the result so we can use merge_values.
1913   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1914 
1915   // TODO: Should we allow mismatched types but matching sizes in merges to
1916   // avoid the ptrtoint?
1917   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1918   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1919 
1920   MI.eraseFromParent();
1921   return true;
1922 }
1923 
1924 bool AMDGPULegalizerInfo::legalizeFrint(
1925   MachineInstr &MI, MachineRegisterInfo &MRI,
1926   MachineIRBuilder &B) const {
1927   Register Src = MI.getOperand(1).getReg();
1928   LLT Ty = MRI.getType(Src);
1929   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1930 
1931   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1932   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1933 
1934   auto C1 = B.buildFConstant(Ty, C1Val);
1935   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1936 
1937   // TODO: Should this propagate fast-math-flags?
1938   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1939   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1940 
1941   auto C2 = B.buildFConstant(Ty, C2Val);
1942   auto Fabs = B.buildFAbs(Ty, Src);
1943 
1944   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1945   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1946   MI.eraseFromParent();
1947   return true;
1948 }
1949 
1950 bool AMDGPULegalizerInfo::legalizeFceil(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
1953 
1954   const LLT S1 = LLT::scalar(1);
1955   const LLT S64 = LLT::scalar(64);
1956 
1957   Register Src = MI.getOperand(1).getReg();
1958   assert(MRI.getType(Src) == S64);
1959 
1960   // result = trunc(src)
1961   // if (src > 0.0 && src != result)
1962   //   result += 1.0
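  //
  // e.g. (illustrative): ceil(2.3): trunc = 2.0, 2.3 > 0 and 2.3 != 2.0, so
  // the result is 2.0 + 1.0 = 3.0; ceil(-2.3): trunc = -2.0, the condition
  // fails and the result stays -2.0.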
1963 
1964   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1965 
1966   const auto Zero = B.buildFConstant(S64, 0.0);
1967   const auto One = B.buildFConstant(S64, 1.0);
1968   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1969   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1970   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1971   auto Add = B.buildSelect(S64, And, One, Zero);
1972 
1973   // TODO: Should this propagate fast-math-flags?
1974   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeFrem(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B) const {
1981     Register DstReg = MI.getOperand(0).getReg();
1982     Register Src0Reg = MI.getOperand(1).getReg();
1983     Register Src1Reg = MI.getOperand(2).getReg();
1984     auto Flags = MI.getFlags();
1985     LLT Ty = MRI.getType(DstReg);
1986 
1987     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1988     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1989     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1990     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1991     MI.eraseFromParent();
1992     return true;
1993 }
1994 
1995 static MachineInstrBuilder extractF64Exponent(Register Hi,
1996                                               MachineIRBuilder &B) {
1997   const unsigned FractBits = 52;
1998   const unsigned ExpBits = 11;
1999   LLT S32 = LLT::scalar(32);
2000 
2001   auto Const0 = B.buildConstant(S32, FractBits - 32);
2002   auto Const1 = B.buildConstant(S32, ExpBits);
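  // Descriptive note: ubfe(Hi, 20, 11) extracts the 11-bit biased exponent,
  // which lives in bits [62:52] of the f64 (bits [30:20] of the high word);
  // the subtraction of 1023 below removes the IEEE-754 bias.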
2003 
2004   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2005     .addUse(Hi)
2006     .addUse(Const0.getReg(0))
2007     .addUse(Const1.getReg(0));
2008 
2009   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2010 }
2011 
2012 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2013   MachineInstr &MI, MachineRegisterInfo &MRI,
2014   MachineIRBuilder &B) const {
2015   const LLT S1 = LLT::scalar(1);
2016   const LLT S32 = LLT::scalar(32);
2017   const LLT S64 = LLT::scalar(64);
2018 
2019   Register Src = MI.getOperand(1).getReg();
2020   assert(MRI.getType(Src) == S64);
2021 
2022   // TODO: Should this use extract since the low half is unused?
2023   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2024   Register Hi = Unmerge.getReg(1);
2025 
2026   // Extract the upper half, since this is where we will find the sign and
2027   // exponent.
2028   auto Exp = extractF64Exponent(Hi, B);
2029 
2030   const unsigned FractBits = 52;
2031 
2032   // Extract the sign bit.
2033   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2034   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2035 
2036   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2037 
2038   const auto Zero32 = B.buildConstant(S32, 0);
2039 
2040   // Extend back to 64-bits.
2041   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2042 
2043   auto Shr = B.buildAShr(S64, FractMask, Exp);
2044   auto Not = B.buildNot(S64, Shr);
2045   auto Tmp0 = B.buildAnd(S64, Src, Not);
2046   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2047 
2048   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2049   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2050 
2051   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2052   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2053   MI.eraseFromParent();
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeITOFP(
2058   MachineInstr &MI, MachineRegisterInfo &MRI,
2059   MachineIRBuilder &B, bool Signed) const {
2060 
2061   Register Dst = MI.getOperand(0).getReg();
2062   Register Src = MI.getOperand(1).getReg();
2063 
2064   const LLT S64 = LLT::scalar(64);
2065   const LLT S32 = LLT::scalar(32);
2066 
2067   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
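  // Descriptive note: the 64-bit integer is split into 32-bit halves and
  // recombined in floating point as roughly (fp)hi * 2^32 + (fp)(u32)lo; the
  // scale by 2^32 is done with ldexp. The low half is always converted as
  // unsigned since the high half carries the sign.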
2068 
2069   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2070 
2071   auto CvtHi = Signed ?
2072     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2073     B.buildUITOFP(S64, Unmerge.getReg(1));
2074 
2075   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2076 
2077   auto ThirtyTwo = B.buildConstant(S32, 32);
2078   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2079     .addUse(CvtHi.getReg(0))
2080     .addUse(ThirtyTwo.getReg(0));
2081 
2082   // TODO: Should this propagate fast-math-flags?
2083   B.buildFAdd(Dst, LdExp, CvtLo);
2084   MI.eraseFromParent();
2085   return true;
2086 }
2087 
2088 // TODO: Copied from DAG implementation. Verify logic and document how this
2089 // actually works.
2090 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2091                                         MachineRegisterInfo &MRI,
2092                                         MachineIRBuilder &B,
2093                                         bool Signed) const {
2094 
2095   Register Dst = MI.getOperand(0).getReg();
2096   Register Src = MI.getOperand(1).getReg();
2097 
2098   const LLT S64 = LLT::scalar(64);
2099   const LLT S32 = LLT::scalar(32);
2100 
2101   const LLT SrcLT = MRI.getType(Src);
2102   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2103 
2104   unsigned Flags = MI.getFlags();
2105 
2106   // The basic idea of converting a floating point number into a pair of 32-bit
2107   // integers is illustrated as follows:
2108   //
2109   //     tf := trunc(val);
2110   //    hif := floor(tf * 2^-32);
2111   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2112   //     hi := fptoi(hif);
2113   //     lo := fptoi(lof);
2114   //
2115   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2116   MachineInstrBuilder Sign;
2117   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2123     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2124     Trunc = B.buildFAbs(S32, Trunc, Flags);
2125   }
2126   MachineInstrBuilder K0, K1;
2127   if (SrcLT == S64) {
2128     K0 = B.buildFConstant(S64,
2129                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2130     K1 = B.buildFConstant(S64,
2131                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2132   } else {
2133     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2134     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2135   }
2136 
2137   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2138   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2139   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2140 
2141   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2142                                      : B.buildFPTOUI(S32, FloorMul);
2143   auto Lo = B.buildFPTOUI(S32, Fma);
2144 
2145   if (Signed && SrcLT == S32) {
2146     // Flip the result based on the signedness, which is either all 0s or 1s.
2147     Sign = B.buildMerge(S64, {Sign, Sign});
2148     // r := xor({lo, hi}, sign) - sign;
2149     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2150   } else
2151     B.buildMerge(Dst, {Lo, Hi});
2152   MI.eraseFromParent();
2153 
2154   return true;
2155 }
2156 
2157 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2158                                                MachineInstr &MI) const {
2159   MachineFunction &MF = Helper.MIRBuilder.getMF();
2160   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2161 
2162   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2163                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2164 
2165   // With ieee_mode disabled, the instructions have the correct behavior
2166   // already for G_FMINNUM/G_FMAXNUM
2167   if (!MFI->getMode().IEEE)
2168     return !IsIEEEOp;
2169 
2170   if (IsIEEEOp)
2171     return true;
2172 
2173   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2174 }
2175 
2176 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2177   MachineInstr &MI, MachineRegisterInfo &MRI,
2178   MachineIRBuilder &B) const {
2179   // TODO: Should move some of this into LegalizerHelper.
2180 
2181   // TODO: Promote dynamic indexing of s16 to s32
2182 
2183   // FIXME: Artifact combiner probably should have replaced the truncated
2184   // constant before this, so we shouldn't need
2185   // getConstantVRegValWithLookThrough.
2186   Optional<ValueAndVReg> MaybeIdxVal =
2187       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2188   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2189     return true;
2190   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2191 
2192   Register Dst = MI.getOperand(0).getReg();
2193   Register Vec = MI.getOperand(1).getReg();
2194 
2195   LLT VecTy = MRI.getType(Vec);
2196   LLT EltTy = VecTy.getElementType();
2197   assert(EltTy == MRI.getType(Dst));
2198 
2199   if (IdxVal < VecTy.getNumElements())
2200     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2201   else
2202     B.buildUndef(Dst);
2203 
2204   MI.eraseFromParent();
2205   return true;
2206 }
2207 
2208 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2209   MachineInstr &MI, MachineRegisterInfo &MRI,
2210   MachineIRBuilder &B) const {
2211   // TODO: Should move some of this into LegalizerHelper.
2212 
2213   // TODO: Promote dynamic indexing of s16 to s32
2214 
2215   // FIXME: Artifact combiner probably should have replaced the truncated
2216   // constant before this, so we shouldn't need
2217   // getConstantVRegValWithLookThrough.
2218   Optional<ValueAndVReg> MaybeIdxVal =
2219       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2220   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221     return true;
2222 
2223   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2224   Register Dst = MI.getOperand(0).getReg();
2225   Register Vec = MI.getOperand(1).getReg();
2226   Register Ins = MI.getOperand(2).getReg();
2227 
2228   LLT VecTy = MRI.getType(Vec);
2229   LLT EltTy = VecTy.getElementType();
2230   assert(EltTy == MRI.getType(Ins));
2231 
2232   if (IdxVal < VecTy.getNumElements())
2233     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2234   else
2235     B.buildUndef(Dst);
2236 
2237   MI.eraseFromParent();
2238   return true;
2239 }
2240 
2241 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2242   MachineInstr &MI, MachineRegisterInfo &MRI,
2243   MachineIRBuilder &B) const {
2244   const LLT V2S16 = LLT::fixed_vector(2, 16);
2245 
2246   Register Dst = MI.getOperand(0).getReg();
2247   Register Src0 = MI.getOperand(1).getReg();
2248   LLT DstTy = MRI.getType(Dst);
2249   LLT SrcTy = MRI.getType(Src0);
2250 
2251   if (SrcTy == V2S16 && DstTy == V2S16 &&
2252       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2253     return true;
2254 
2255   MachineIRBuilder HelperBuilder(MI);
2256   GISelObserverWrapper DummyObserver;
2257   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2258   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2259 }
2260 
2261 bool AMDGPULegalizerInfo::legalizeSinCos(
2262   MachineInstr &MI, MachineRegisterInfo &MRI,
2263   MachineIRBuilder &B) const {
2264 
2265   Register DstReg = MI.getOperand(0).getReg();
2266   Register SrcReg = MI.getOperand(1).getReg();
2267   LLT Ty = MRI.getType(DstReg);
2268   unsigned Flags = MI.getFlags();
2269 
2270   Register TrigVal;
2271   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2272   if (ST.hasTrigReducedRange()) {
2273     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2274     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2275       .addUse(MulVal.getReg(0))
2276       .setMIFlags(Flags).getReg(0);
2277   } else
2278     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2279 
2280   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2281     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2282   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2283     .addUse(TrigVal)
2284     .setMIFlags(Flags);
2285   MI.eraseFromParent();
2286   return true;
2287 }
2288 
2289 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2290                                                   MachineIRBuilder &B,
2291                                                   const GlobalValue *GV,
2292                                                   int64_t Offset,
2293                                                   unsigned GAFlags) const {
2294   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2295   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2296   // to the following code sequence:
2297   //
2298   // For constant address space:
2299   //   s_getpc_b64 s[0:1]
2300   //   s_add_u32 s0, s0, $symbol
2301   //   s_addc_u32 s1, s1, 0
2302   //
2303   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2304   //   a fixup or relocation is emitted to replace $symbol with a literal
2305   //   constant, which is a pc-relative offset from the encoding of the $symbol
2306   //   operand to the global variable.
2307   //
2308   // For global address space:
2309   //   s_getpc_b64 s[0:1]
2310   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2311   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2312   //
2313   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2314   //   fixups or relocations are emitted to replace $symbol@*@lo and
2315   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2316   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2317   //   operand to the global variable.
2318   //
2319   // What we want here is an offset from the value returned by s_getpc
2320   // (which is the address of the s_add_u32 instruction) to the global
2321   // variable, but since the encoding of $symbol starts 4 bytes after the start
2322   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2323   // small. This requires us to add 4 to the global variable offset in order to
2324   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2325   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2326   // instruction.
2327 
2328   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2329 
2330   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2331     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2332 
2333   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2334     .addDef(PCReg);
2335 
2336   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2337   if (GAFlags == SIInstrInfo::MO_NONE)
2338     MIB.addImm(0);
2339   else
2340     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2341 
2342   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2343 
2344   if (PtrTy.getSizeInBits() == 32)
2345     B.buildExtract(DstReg, PCReg, 0);
2346   return true;
2347  }
2348 
2349 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2350   MachineInstr &MI, MachineRegisterInfo &MRI,
2351   MachineIRBuilder &B) const {
2352   Register DstReg = MI.getOperand(0).getReg();
2353   LLT Ty = MRI.getType(DstReg);
2354   unsigned AS = Ty.getAddressSpace();
2355 
2356   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2357   MachineFunction &MF = B.getMF();
2358   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2359 
2360   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2361     if (!MFI->isModuleEntryFunction() &&
2362         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2363       const Function &Fn = MF.getFunction();
2364       DiagnosticInfoUnsupported BadLDSDecl(
2365         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2366         DS_Warning);
2367       Fn.getContext().diagnose(BadLDSDecl);
2368 
2369       // We currently don't have a way to correctly allocate LDS objects that
2370       // aren't directly associated with a kernel. We do force inlining of
2371       // functions that use local objects. However, if these dead functions are
2372       // not eliminated, we don't want a compile time error. Just emit a warning
2373       // and a trap, since there should be no callable path here.
2374       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2375       B.buildUndef(DstReg);
2376       MI.eraseFromParent();
2377       return true;
2378     }
2379 
2380     // TODO: We could emit code to handle the initialization somewhere.
2381     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2382       const SITargetLowering *TLI = ST.getTargetLowering();
2383       if (!TLI->shouldUseLDSConstAddress(GV)) {
2384         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2386       }
2387 
2388       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2389         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared
        // memory whose size is not known at compile time. These arrays are
        // allocated by the runtime and placed directly after the statically
        // allocated ones, so they all share the same offset.
2395         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2396           // Adjust alignment for that dynamic shared memory array.
2397           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2398           LLT S32 = LLT::scalar(32);
2399           auto Sz =
2400               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2401           B.buildIntToPtr(DstReg, Sz);
2402           MI.eraseFromParent();
2403           return true;
2404         }
2405       }
2406 
2407       B.buildConstant(
2408           DstReg,
2409           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2410       MI.eraseFromParent();
2411       return true;
2412     }
2413 
2414     const Function &Fn = MF.getFunction();
2415     DiagnosticInfoUnsupported BadInit(
2416       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2417     Fn.getContext().diagnose(BadInit);
2418     return true;
2419   }
2420 
2421   const SITargetLowering *TLI = ST.getTargetLowering();
2422 
2423   if (TLI->shouldEmitFixup(GV)) {
2424     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2425     MI.eraseFromParent();
2426     return true;
2427   }
2428 
2429   if (TLI->shouldEmitPCReloc(GV)) {
2430     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2431     MI.eraseFromParent();
2432     return true;
2433   }
2434 
2435   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2436   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2437 
2438   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2439   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2440       MachinePointerInfo::getGOT(MF),
2441       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2442           MachineMemOperand::MOInvariant,
2443       LoadTy, Align(8));
2444 
2445   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2446 
2447   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2449     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2450     B.buildExtract(DstReg, Load, 0);
2451   } else
2452     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2453 
2454   MI.eraseFromParent();
2455   return true;
2456 }
2457 
2458 static LLT widenToNextPowerOf2(LLT Ty) {
2459   if (Ty.isVector())
2460     return Ty.changeElementCount(
2461         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2462   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2466                                        MachineInstr &MI) const {
2467   MachineIRBuilder &B = Helper.MIRBuilder;
2468   MachineRegisterInfo &MRI = *B.getMRI();
2469   GISelChangeObserver &Observer = Helper.Observer;
2470 
2471   Register PtrReg = MI.getOperand(1).getReg();
2472   LLT PtrTy = MRI.getType(PtrReg);
2473   unsigned AddrSpace = PtrTy.getAddressSpace();
2474 
2475   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2476     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2477     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2478     Observer.changingInstr(MI);
2479     MI.getOperand(1).setReg(Cast.getReg(0));
2480     Observer.changedInstr(MI);
2481     return true;
2482   }
2483 
2484   if (MI.getOpcode() != AMDGPU::G_LOAD)
2485     return false;
2486 
2487   Register ValReg = MI.getOperand(0).getReg();
2488   LLT ValTy = MRI.getType(ValReg);
2489 
2490   MachineMemOperand *MMO = *MI.memoperands_begin();
2491   const unsigned ValSize = ValTy.getSizeInBits();
2492   const LLT MemTy = MMO->getMemoryType();
2493   const Align MemAlign = MMO->getAlign();
2494   const unsigned MemSize = MemTy.getSizeInBits();
2495   const unsigned AlignInBits = 8 * MemAlign.value();
2496 
2497   // Widen non-power-of-2 loads to the alignment if needed
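  // (For example, under these conditions a 96-bit <3 x s32> load with
  // sufficient alignment may be widened to a 128-bit load; the extra bits
  // are then dropped again below via trunc/extract/unmerge.)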
2498   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2499     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2500 
2501     // This was already the correct extending load result type, so just adjust
2502     // the memory type.
2503     if (WideMemSize == ValSize) {
2504       MachineFunction &MF = B.getMF();
2505 
2506       MachineMemOperand *WideMMO =
2507           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2508       Observer.changingInstr(MI);
2509       MI.setMemRefs(MF, {WideMMO});
2510       Observer.changedInstr(MI);
2511       return true;
2512     }
2513 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2515     if (ValSize > WideMemSize)
2516       return false;
2517 
2518     LLT WideTy = widenToNextPowerOf2(ValTy);
2519 
2520     Register WideLoad;
2521     if (!WideTy.isVector()) {
2522       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2523       B.buildTrunc(ValReg, WideLoad).getReg(0);
2524     } else {
2525       // Extract the subvector.
2526 
2527       if (isRegisterType(ValTy)) {
2528         // If this a case where G_EXTRACT is legal, use it.
2529         // (e.g. <3 x s32> -> <4 x s32>)
2530         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2531         B.buildExtract(ValReg, WideLoad, 0);
2532       } else {
2533         // For cases where the widened type isn't a nice register value, unmerge
2534         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2535         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2536         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2537         B.setInsertPt(B.getMBB(), MI.getIterator());
2538         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2539       }
2540     }
2541 
2542     MI.eraseFromParent();
2543     return true;
2544   }
2545 
2546   return false;
2547 }
2548 
2549 bool AMDGPULegalizerInfo::legalizeFMad(
2550   MachineInstr &MI, MachineRegisterInfo &MRI,
2551   MachineIRBuilder &B) const {
2552   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2553   assert(Ty.isScalar());
2554 
2555   MachineFunction &MF = B.getMF();
2556   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2557 
2558   // TODO: Always legal with future ftz flag.
2559   // FIXME: Do we need just output?
2560   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2561     return true;
2562   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2563     return true;
2564 
2565   MachineIRBuilder HelperBuilder(MI);
2566   GISelObserverWrapper DummyObserver;
2567   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2568   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2569 }
2570 
2571 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2572   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2573   Register DstReg = MI.getOperand(0).getReg();
2574   Register PtrReg = MI.getOperand(1).getReg();
2575   Register CmpVal = MI.getOperand(2).getReg();
2576   Register NewVal = MI.getOperand(3).getReg();
2577 
2578   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2579          "this should not have been custom lowered");
2580 
2581   LLT ValTy = MRI.getType(CmpVal);
2582   LLT VecTy = LLT::fixed_vector(2, ValTy);
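  // The target G_AMDGPU_ATOMIC_CMPXCHG pseudo consumes the new value and the
  // compare value packed into a single 2-element vector operand, in that
  // order.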
2583 
2584   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2585 
2586   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2587     .addDef(DstReg)
2588     .addUse(PtrReg)
2589     .addUse(PackedVal)
2590     .setMemRefs(MI.memoperands());
2591 
2592   MI.eraseFromParent();
2593   return true;
2594 }
2595 
2596 bool AMDGPULegalizerInfo::legalizeFlog(
2597   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2598   Register Dst = MI.getOperand(0).getReg();
2599   Register Src = MI.getOperand(1).getReg();
2600   LLT Ty = B.getMRI()->getType(Dst);
2601   unsigned Flags = MI.getFlags();
2602 
2603   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2604   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2605 
2606   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2607   MI.eraseFromParent();
2608   return true;
2609 }
2610 
2611 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2612                                        MachineIRBuilder &B) const {
2613   Register Dst = MI.getOperand(0).getReg();
2614   Register Src = MI.getOperand(1).getReg();
2615   unsigned Flags = MI.getFlags();
2616   LLT Ty = B.getMRI()->getType(Dst);
2617 
2618   auto K = B.buildFConstant(Ty, numbers::log2e);
2619   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2620   B.buildFExp2(Dst, Mul, Flags);
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2626                                        MachineIRBuilder &B) const {
2627   Register Dst = MI.getOperand(0).getReg();
2628   Register Src0 = MI.getOperand(1).getReg();
2629   Register Src1 = MI.getOperand(2).getReg();
2630   unsigned Flags = MI.getFlags();
2631   LLT Ty = B.getMRI()->getType(Dst);
2632   const LLT S16 = LLT::scalar(16);
2633   const LLT S32 = LLT::scalar(32);
2634 
2635   if (Ty == S32) {
2636     auto Log = B.buildFLog2(S32, Src0, Flags);
2637     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2638       .addUse(Log.getReg(0))
2639       .addUse(Src1)
2640       .setMIFlags(Flags);
2641     B.buildFExp2(Dst, Mul, Flags);
2642   } else if (Ty == S16) {
2643     // There's no f16 fmul_legacy, so we need to convert for it.
2644     auto Log = B.buildFLog2(S16, Src0, Flags);
2645     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2646     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2647     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2648       .addUse(Ext0.getReg(0))
2649       .addUse(Ext1.getReg(0))
2650       .setMIFlags(Flags);
2651 
2652     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2653   } else
2654     return false;
2655 
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 // Find a source register, ignoring any possible source modifiers.
2661 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2662   Register ModSrc = OrigSrc;
2663   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2664     ModSrc = SrcFNeg->getOperand(1).getReg();
2665     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2666       ModSrc = SrcFAbs->getOperand(1).getReg();
2667   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2668     ModSrc = SrcFAbs->getOperand(1).getReg();
2669   return ModSrc;
2670 }
2671 
2672 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2673                                          MachineRegisterInfo &MRI,
2674                                          MachineIRBuilder &B) const {
2675 
2676   const LLT S1 = LLT::scalar(1);
2677   const LLT S64 = LLT::scalar(64);
2678   Register Dst = MI.getOperand(0).getReg();
2679   Register OrigSrc = MI.getOperand(1).getReg();
2680   unsigned Flags = MI.getFlags();
2681   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2682          "this should not have been custom lowered");
2683 
2684   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2685   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2686   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2687   // V_FRACT bug is:
2688   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2689   //
2690   // Convert floor(x) to (x - fract(x))
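  //
  // e.g. (illustrative): for x = -1.25, fract(x) = 0.75, so
  // floor(x) = -1.25 - 0.75 = -2.0.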
2691 
2692   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2693     .addUse(OrigSrc)
2694     .setMIFlags(Flags);
2695 
2696   // Give source modifier matching some assistance before obscuring a foldable
2697   // pattern.
2698 
2699   // TODO: We can avoid the neg on the fract? The input sign to fract
2700   // shouldn't matter?
2701   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2702 
2703   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2704 
2705   Register Min = MRI.createGenericVirtualRegister(S64);
2706 
2707   // We don't need to concern ourselves with the snan handling difference, so
2708   // use the one which will directly select.
2709   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2710   if (MFI->getMode().IEEE)
2711     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2712   else
2713     B.buildFMinNum(Min, Fract, Const, Flags);
2714 
2715   Register CorrectedFract = Min;
2716   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2717     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2718     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2719   }
2720 
2721   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2722   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2723 
2724   MI.eraseFromParent();
2725   return true;
2726 }
2727 
2728 // Turn an illegal packed v2s16 build vector into bit operations.
2729 // TODO: This should probably be a bitcast action in LegalizerHelper.
2730 bool AMDGPULegalizerInfo::legalizeBuildVector(
2731   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2732   Register Dst = MI.getOperand(0).getReg();
2733   const LLT S32 = LLT::scalar(32);
2734   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2735 
2736   Register Src0 = MI.getOperand(1).getReg();
2737   Register Src1 = MI.getOperand(2).getReg();
2738   assert(MRI.getType(Src0) == LLT::scalar(16));
2739 
2740   auto Merge = B.buildMerge(S32, {Src0, Src1});
2741   B.buildBitcast(Dst, Merge);
2742 
2743   MI.eraseFromParent();
2744   return true;
2745 }
2746 
2747 // Check that this is a G_XOR x, -1
2748 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2749   if (MI.getOpcode() != TargetOpcode::G_XOR)
2750     return false;
2751   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2752   return ConstVal && *ConstVal == -1;
2753 }
2754 
// Return the use branch instruction, or null if the usage is invalid.
2756 static MachineInstr *
2757 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2758                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2759   Register CondDef = MI.getOperand(0).getReg();
2760   if (!MRI.hasOneNonDBGUse(CondDef))
2761     return nullptr;
2762 
2763   MachineBasicBlock *Parent = MI.getParent();
2764   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2765 
2766   if (isNot(MRI, *UseMI)) {
2767     Register NegatedCond = UseMI->getOperand(0).getReg();
2768     if (!MRI.hasOneNonDBGUse(NegatedCond))
2769       return nullptr;
2770 
2771     // We're deleting the def of this value, so we need to remove it.
2772     UseMI->eraseFromParent();
2773 
2774     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2775     Negated = true;
2776   }
2777 
2778   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2779     return nullptr;
2780 
2781   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2782   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2783   if (Next == Parent->end()) {
2784     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2785     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2786       return nullptr;
2787     UncondBrTarget = &*NextMBB;
2788   } else {
2789     if (Next->getOpcode() != AMDGPU::G_BR)
2790       return nullptr;
2791     Br = &*Next;
2792     UncondBrTarget = Br->getOperand(0).getMBB();
2793   }
2794 
2795   return UseMI;
2796 }
2797 
2798 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2799                                          const ArgDescriptor *Arg,
2800                                          const TargetRegisterClass *ArgRC,
2801                                          LLT ArgTy) const {
2802   MCRegister SrcReg = Arg->getRegister();
2803   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2804   assert(DstReg.isVirtual() && "Virtual register expected");
2805 
2806   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2807                                              ArgTy);
2808   if (Arg->isMasked()) {
2809     // TODO: Should we try to emit this once in the entry block?
2810     const LLT S32 = LLT::scalar(32);
2811     const unsigned Mask = Arg->getMask();
2812     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
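    // e.g. with a hypothetical Mask of 0x3ff00000, Shift is 20 and the value
    // extracted below is (LiveIn >> 20) & 0x3ff.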
2813 
2814     Register AndMaskSrc = LiveIn;
2815 
2816     if (Shift != 0) {
2817       auto ShiftAmt = B.buildConstant(S32, Shift);
2818       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2819     }
2820 
2821     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2822   } else {
2823     B.buildCopy(DstReg, LiveIn);
2824   }
2825 
2826   return true;
2827 }
2828 
2829 bool AMDGPULegalizerInfo::loadInputValue(
2830     Register DstReg, MachineIRBuilder &B,
2831     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2832   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2833   const ArgDescriptor *Arg;
2834   const TargetRegisterClass *ArgRC;
2835   LLT ArgTy;
2836   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2837 
2838   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2839     return false; // TODO: Handle these
2840   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2841 }
2842 
2843 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2844     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2845     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2846   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2847     return false;
2848 
2849   MI.eraseFromParent();
2850   return true;
2851 }
2852 
2853 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2854                                        MachineRegisterInfo &MRI,
2855                                        MachineIRBuilder &B) const {
2856   Register Dst = MI.getOperand(0).getReg();
2857   LLT DstTy = MRI.getType(Dst);
2858   LLT S16 = LLT::scalar(16);
2859   LLT S32 = LLT::scalar(32);
2860   LLT S64 = LLT::scalar(64);
2861 
2862   if (DstTy == S16)
2863     return legalizeFDIV16(MI, MRI, B);
2864   if (DstTy == S32)
2865     return legalizeFDIV32(MI, MRI, B);
2866   if (DstTy == S64)
2867     return legalizeFDIV64(MI, MRI, B);
2868 
2869   return false;
2870 }
2871 
2872 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2873                                                         Register DstDivReg,
2874                                                         Register DstRemReg,
2875                                                         Register X,
2876                                                         Register Y) const {
2877   const LLT S1 = LLT::scalar(1);
2878   const LLT S32 = LLT::scalar(32);
2879 
2880   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2881   // algorithm used here.
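  //
  // In outline (an informal sketch of the steps below):
  //   z ~= (2**32)/y              via RCP on uitofp(y), scaled by ~2**32
  //   z  = z + umulh(z, -y*z)     one Newton-Raphson refinement step
  //   q  = umulh(x, z)            initial quotient estimate
  //   r  = x - q*y                initial remainder estimate
  //   while r >= y: q += 1, r -= y   (at most twice, done with selects)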
2882 
2883   // Initial estimate of inv(y).
2884   auto FloatY = B.buildUITOFP(S32, Y);
2885   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2886   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2887   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2888   auto Z = B.buildFPTOUI(S32, ScaledY);
2889 
  // One round of UNR (Newton-Raphson) refinement.
2891   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2892   auto NegYZ = B.buildMul(S32, NegY, Z);
2893   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2894 
2895   // Quotient/remainder estimate.
2896   auto Q = B.buildUMulH(S32, X, Z);
2897   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2898 
2899   // First quotient/remainder refinement.
2900   auto One = B.buildConstant(S32, 1);
2901   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2902   if (DstDivReg)
2903     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2904   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2905 
2906   // Second quotient/remainder refinement.
2907   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2908   if (DstDivReg)
2909     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2910 
2911   if (DstRemReg)
2912     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2913 }
2914 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2916 //
2917 // Return lo, hi of result
2918 //
2919 // %cvt.lo = G_UITOFP Val.lo
2920 // %cvt.hi = G_UITOFP Val.hi
2921 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2922 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2923 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2924 // %mul2 = G_FMUL %mul1, 2**(-32)
2925 // %trunc = G_INTRINSIC_TRUNC %mul2
2926 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2927 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
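//
// The float constants used are (as bit patterns): 0x4f800000 = 2**32,
// 0x5f7ffffc ~= 2**64 (slightly reduced to stay below it), 0x2f800000 =
// 2**(-32), and 0xcf800000 = -(2**32).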
2928 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2929                                                        Register Val) {
2930   const LLT S32 = LLT::scalar(32);
2931   auto Unmerge = B.buildUnmerge(S32, Val);
2932 
2933   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2934   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2935 
2936   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2937                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2938 
2939   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2940   auto Mul1 =
2941       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2942 
2943   // 2**(-32)
2944   auto Mul2 =
2945       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2946   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2947 
2948   // -(2**32)
2949   auto Mad2 = B.buildFMAD(S32, Trunc,
2950                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2951 
2952   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2953   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2954 
2955   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2956 }
2957 
2958 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2959                                                         Register DstDivReg,
2960                                                         Register DstRemReg,
2961                                                         Register Numer,
2962                                                         Register Denom) const {
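  // Informal outline: compute an approximate 2**64/denom reciprocal with
  // emitReciprocalU64, refine it with two rounds of fixed-point Newton-Raphson
  // (MulHi1/Add1 and MulHi2/Add2), form the quotient estimate
  // MulHi3 = umulh(numer, rcp) and remainder Sub1 = numer - MulHi3*denom, and
  // then apply up to two correction steps (guarded by C3 and C6 below).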
2963   const LLT S32 = LLT::scalar(32);
2964   const LLT S64 = LLT::scalar(64);
2965   const LLT S1 = LLT::scalar(1);
2966   Register RcpLo, RcpHi;
2967 
2968   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2969 
2970   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2971 
2972   auto Zero64 = B.buildConstant(S64, 0);
2973   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2974 
2975   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2976   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2977 
2978   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2979   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2980   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2981 
2982   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2983   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2984   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2985   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2986 
2987   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2988   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2989   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2990   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2991   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2992 
2993   auto Zero32 = B.buildConstant(S32, 0);
2994   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2995   auto Add2_HiC =
2996       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2997   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2998   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2999 
3000   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3001   Register NumerLo = UnmergeNumer.getReg(0);
3002   Register NumerHi = UnmergeNumer.getReg(1);
3003 
3004   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3005   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3006   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3007   Register Mul3_Lo = UnmergeMul3.getReg(0);
3008   Register Mul3_Hi = UnmergeMul3.getReg(1);
3009   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3010   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3011   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3012   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3013 
3014   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3015   Register DenomLo = UnmergeDenom.getReg(0);
3016   Register DenomHi = UnmergeDenom.getReg(1);
3017 
3018   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3019   auto C1 = B.buildSExt(S32, CmpHi);
3020 
3021   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3022   auto C2 = B.buildSExt(S32, CmpLo);
3023 
3024   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3025   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3026 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently the control flow is unconditional and we have 4 selects after the
  // potential endif to substitute for PHIs.
3030 
3031   // if C3 != 0 ...
3032   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3033   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3034   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3035   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3036 
3037   auto One64 = B.buildConstant(S64, 1);
3038   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3039 
3040   auto C4 =
3041       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3042   auto C5 =
3043       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3044   auto C6 = B.buildSelect(
3045       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3046 
3047   // if (C6 != 0)
3048   auto Add4 = B.buildAdd(S64, Add3, One64);
3049   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3050 
3051   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3052   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3053   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3054 
3055   // endif C6
3056   // endif C3
3057 
3058   if (DstDivReg) {
3059     auto Sel1 = B.buildSelect(
3060         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3061     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3062                   Sel1, MulHi3);
3063   }
3064 
3065   if (DstRemReg) {
3066     auto Sel2 = B.buildSelect(
3067         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3068     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3069                   Sel2, Sub1);
3070   }
3071 }
3072 
3073 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3074                                                   MachineRegisterInfo &MRI,
3075                                                   MachineIRBuilder &B) const {
3076   Register DstDivReg, DstRemReg;
3077   switch (MI.getOpcode()) {
3078   default:
3079     llvm_unreachable("Unexpected opcode!");
3080   case AMDGPU::G_UDIV: {
3081     DstDivReg = MI.getOperand(0).getReg();
3082     break;
3083   }
3084   case AMDGPU::G_UREM: {
3085     DstRemReg = MI.getOperand(0).getReg();
3086     break;
3087   }
3088   case AMDGPU::G_UDIVREM: {
3089     DstDivReg = MI.getOperand(0).getReg();
3090     DstRemReg = MI.getOperand(1).getReg();
3091     break;
3092   }
3093   }
3094 
3095   const LLT S64 = LLT::scalar(64);
3096   const LLT S32 = LLT::scalar(32);
3097   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3098   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3099   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3100   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3101 
3102   if (Ty == S32)
3103     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3104   else if (Ty == S64)
3105     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3106   else
3107     return false;
3108 
3109   MI.eraseFromParent();
3110   return true;
3111 }
3112 
3113 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3114                                                 MachineRegisterInfo &MRI,
3115                                                 MachineIRBuilder &B) const {
3116   const LLT S64 = LLT::scalar(64);
3117   const LLT S32 = LLT::scalar(32);
3118 
3119   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3120   if (Ty != S32 && Ty != S64)
3121     return false;
3122 
3123   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3124   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3125   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3126 
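  // Compute the absolute values using the identity abs(x) = (x + s) ^ s, where
  // s = x >> (bits - 1) is all ones for negative x and zero otherwise.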
3127   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3128   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3129   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3130 
3131   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3132   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3133 
3134   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3135   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3136 
3137   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3138   switch (MI.getOpcode()) {
3139   default:
3140     llvm_unreachable("Unexpected opcode!");
3141   case AMDGPU::G_SDIV: {
3142     DstDivReg = MI.getOperand(0).getReg();
3143     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3144     break;
3145   }
3146   case AMDGPU::G_SREM: {
3147     DstRemReg = MI.getOperand(0).getReg();
3148     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3149     break;
3150   }
3151   case AMDGPU::G_SDIVREM: {
3152     DstDivReg = MI.getOperand(0).getReg();
3153     DstRemReg = MI.getOperand(1).getReg();
3154     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3155     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3156     break;
3157   }
3158   }
3159 
3160   if (Ty == S32)
3161     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3162   else
3163     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3164 
3165   if (DstDivReg) {
3166     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3167     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3168     B.buildSub(DstDivReg, SignXor, Sign);
3169   }
3170 
3171   if (DstRemReg) {
3172     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3173     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3174     B.buildSub(DstRemReg, SignXor, Sign);
3175   }
3176 
3177   MI.eraseFromParent();
3178   return true;
3179 }
3180 
3181 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3182                                                  MachineRegisterInfo &MRI,
3183                                                  MachineIRBuilder &B) const {
3184   Register Res = MI.getOperand(0).getReg();
3185   Register LHS = MI.getOperand(1).getReg();
3186   Register RHS = MI.getOperand(2).getReg();
3187   uint16_t Flags = MI.getFlags();
3188   LLT ResTy = MRI.getType(Res);
3189 
3190   const MachineFunction &MF = B.getMF();
3191   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3192                             MI.getFlag(MachineInstr::FmAfn);
3193 
3194   if (!AllowInaccurateRcp)
3195     return false;
3196 
3197   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3198     // 1 / x -> RCP(x)
3199     if (CLHS->isExactlyValue(1.0)) {
3200       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3201         .addUse(RHS)
3202         .setMIFlags(Flags);
3203 
3204       MI.eraseFromParent();
3205       return true;
3206     }
3207 
3208     // -1 / x -> RCP( FNEG(x) )
3209     if (CLHS->isExactlyValue(-1.0)) {
3210       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3211       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3212         .addUse(FNeg.getReg(0))
3213         .setMIFlags(Flags);
3214 
3215       MI.eraseFromParent();
3216       return true;
3217     }
3218   }
3219 
3220   // x / y -> x * (1.0 / y)
3221   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3222     .addUse(RHS)
3223     .setMIFlags(Flags);
3224   B.buildFMul(Res, LHS, RCP, Flags);
3225 
3226   MI.eraseFromParent();
3227   return true;
3228 }
3229 
3230 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3231                                                    MachineRegisterInfo &MRI,
3232                                                    MachineIRBuilder &B) const {
3233   Register Res = MI.getOperand(0).getReg();
3234   Register X = MI.getOperand(1).getReg();
3235   Register Y = MI.getOperand(2).getReg();
3236   uint16_t Flags = MI.getFlags();
3237   LLT ResTy = MRI.getType(Res);
3238 
3239   const MachineFunction &MF = B.getMF();
3240   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3241                             MI.getFlag(MachineInstr::FmAfn);
3242 
3243   if (!AllowInaccurateRcp)
3244     return false;
3245 
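  // Informal sketch: r ~= 1/y from rcp, refined twice with the Newton-Raphson
  // step r = r + r*(1 - y*r); the quotient q = x*r then gets one final
  // refinement q = q + r*(x - y*q) via FMA.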
3246   auto NegY = B.buildFNeg(ResTy, Y);
3247   auto One = B.buildFConstant(ResTy, 1.0);
3248 
3249   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3250     .addUse(Y)
3251     .setMIFlags(Flags);
3252 
3253   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3254   R = B.buildFMA(ResTy, Tmp0, R, R);
3255 
3256   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3257   R = B.buildFMA(ResTy, Tmp1, R, R);
3258 
3259   auto Ret = B.buildFMul(ResTy, X, R);
3260   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3261 
3262   B.buildFMA(Res, Tmp2, R, Ret);
3263   MI.eraseFromParent();
3264   return true;
3265 }
3266 
3267 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3268                                          MachineRegisterInfo &MRI,
3269                                          MachineIRBuilder &B) const {
3270   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3271     return true;
3272 
3273   Register Res = MI.getOperand(0).getReg();
3274   Register LHS = MI.getOperand(1).getReg();
3275   Register RHS = MI.getOperand(2).getReg();
3276 
3277   uint16_t Flags = MI.getFlags();
3278 
3279   LLT S16 = LLT::scalar(16);
3280   LLT S32 = LLT::scalar(32);
3281 
3282   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3283   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3284 
3285   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3286     .addUse(RHSExt.getReg(0))
3287     .setMIFlags(Flags);
3288 
3289   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3290   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3291 
3292   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3293     .addUse(RDst.getReg(0))
3294     .addUse(RHS)
3295     .addUse(LHS)
3296     .setMIFlags(Flags);
3297 
3298   MI.eraseFromParent();
3299   return true;
3300 }
3301 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, emit instructions to disable it.
3304 static void toggleSPDenormMode(bool Enable,
3305                                MachineIRBuilder &B,
3306                                const GCNSubtarget &ST,
3307                                AMDGPU::SIModeRegisterDefaults Mode) {
3308   // Set SP denorm mode to this value.
3309   unsigned SPDenormMode =
3310     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3311 
3312   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3314     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3315 
3316     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3317     B.buildInstr(AMDGPU::S_DENORM_MODE)
3318       .addImm(NewDenormModeValue);
3319 
3320   } else {
3321     // Select FP32 bit field in mode register.
3322     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3323                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3324                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
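    // This encodes hwreg(HW_REG_MODE, 4, 2): a 2-bit field (WIDTH_M1 = 1) at
    // offset 4 of the MODE register, which holds the FP32 denorm controls.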
3325 
3326     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3327       .addImm(SPDenormMode)
3328       .addImm(SPDenormModeBitField);
3329   }
3330 }
3331 
3332 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3333                                          MachineRegisterInfo &MRI,
3334                                          MachineIRBuilder &B) const {
3335   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3336     return true;
3337 
3338   Register Res = MI.getOperand(0).getReg();
3339   Register LHS = MI.getOperand(1).getReg();
3340   Register RHS = MI.getOperand(2).getReg();
3341   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3342   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3343 
3344   uint16_t Flags = MI.getFlags();
3345 
3346   LLT S32 = LLT::scalar(32);
3347   LLT S1 = LLT::scalar(1);
3348 
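  // Roughly: div_scale pre-scales the numerator and denominator into a safe
  // range, an FMA chain refines the scaled reciprocal and quotient
  // (Newton-Raphson style), div_fmas applies the final correction using the
  // scale flag produced by div_scale, and div_fixup undoes the scaling and
  // handles the special cases.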
3349   auto One = B.buildFConstant(S32, 1.0f);
3350 
3351   auto DenominatorScaled =
3352     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3353       .addUse(LHS)
3354       .addUse(RHS)
3355       .addImm(0)
3356       .setMIFlags(Flags);
3357   auto NumeratorScaled =
3358     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3359       .addUse(LHS)
3360       .addUse(RHS)
3361       .addImm(1)
3362       .setMIFlags(Flags);
3363 
3364   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3365     .addUse(DenominatorScaled.getReg(0))
3366     .setMIFlags(Flags);
3367   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3368 
3369   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3370   // aren't modeled as reading it.
3371   if (!Mode.allFP32Denormals())
3372     toggleSPDenormMode(true, B, ST, Mode);
3373 
3374   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3375   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3376   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3377   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3378   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3379   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3380 
3381   if (!Mode.allFP32Denormals())
3382     toggleSPDenormMode(false, B, ST, Mode);
3383 
3384   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3385     .addUse(Fma4.getReg(0))
3386     .addUse(Fma1.getReg(0))
3387     .addUse(Fma3.getReg(0))
3388     .addUse(NumeratorScaled.getReg(1))
3389     .setMIFlags(Flags);
3390 
3391   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3392     .addUse(Fmas.getReg(0))
3393     .addUse(RHS)
3394     .addUse(LHS)
3395     .setMIFlags(Flags);
3396 
3397   MI.eraseFromParent();
3398   return true;
3399 }
3400 
3401 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3402                                          MachineRegisterInfo &MRI,
3403                                          MachineIRBuilder &B) const {
3404   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3405     return true;
3406 
3407   Register Res = MI.getOperand(0).getReg();
3408   Register LHS = MI.getOperand(1).getReg();
3409   Register RHS = MI.getOperand(2).getReg();
3410 
3411   uint16_t Flags = MI.getFlags();
3412 
3413   LLT S64 = LLT::scalar(64);
3414   LLT S1 = LLT::scalar(1);
3415 
3416   auto One = B.buildFConstant(S64, 1.0);
3417 
3418   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3419     .addUse(LHS)
3420     .addUse(RHS)
3421     .addImm(0)
3422     .setMIFlags(Flags);
3423 
3424   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3425 
3426   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3427     .addUse(DivScale0.getReg(0))
3428     .setMIFlags(Flags);
3429 
3430   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3431   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3432   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3433 
3434   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3435     .addUse(LHS)
3436     .addUse(RHS)
3437     .addImm(1)
3438     .setMIFlags(Flags);
3439 
3440   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3441   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3442   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3443 
3444   Register Scale;
3445   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3448 
3449     LLT S32 = LLT::scalar(32);
3450 
3451     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3452     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3453     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3454     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3455 
3456     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3457                               Scale1Unmerge.getReg(1));
3458     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3459                               Scale0Unmerge.getReg(1));
3460     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3461   } else {
3462     Scale = DivScale1.getReg(1);
3463   }
3464 
3465   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3466     .addUse(Fma4.getReg(0))
3467     .addUse(Fma3.getReg(0))
3468     .addUse(Mul.getReg(0))
3469     .addUse(Scale)
3470     .setMIFlags(Flags);
3471 
3472   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3473     .addUse(Fmas.getReg(0))
3474     .addUse(RHS)
3475     .addUse(LHS)
3476     .setMIFlags(Flags);
3477 
3478   MI.eraseFromParent();
3479   return true;
3480 }
3481 
3482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3483                                                  MachineRegisterInfo &MRI,
3484                                                  MachineIRBuilder &B) const {
3485   Register Res = MI.getOperand(0).getReg();
3486   Register LHS = MI.getOperand(2).getReg();
3487   Register RHS = MI.getOperand(3).getReg();
3488   uint16_t Flags = MI.getFlags();
3489 
3490   LLT S32 = LLT::scalar(32);
3491   LLT S1 = LLT::scalar(1);
3492 
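  // Denominators with very large magnitude would make the rcp result flush to
  // zero, so if |rhs| > 0x6f800000 (2**96) the denominator is pre-scaled by
  // 0x2f800000 (2**(-32)); the same scale factor is applied to the final
  // product to compensate. Otherwise the scale is 1.0.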
3493   auto Abs = B.buildFAbs(S32, RHS, Flags);
3494   const APFloat C0Val(1.0f);
3495 
3496   auto C0 = B.buildConstant(S32, 0x6f800000);
3497   auto C1 = B.buildConstant(S32, 0x2f800000);
3498   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3499 
3500   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3501   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3502 
3503   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3504 
3505   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3506     .addUse(Mul0.getReg(0))
3507     .setMIFlags(Flags);
3508 
3509   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3510 
3511   B.buildFMul(Res, Sel, Mul1, Flags);
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3518 // FIXME: Why do we handle this one but not other removed instructions?
3519 //
3520 // Reciprocal square root.  The clamp prevents infinite results, clamping
3521 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3522 // +-max_float.
3523 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3524                                                     MachineRegisterInfo &MRI,
3525                                                     MachineIRBuilder &B) const {
3526   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3527     return true;
3528 
3529   Register Dst = MI.getOperand(0).getReg();
3530   Register Src = MI.getOperand(2).getReg();
3531   auto Flags = MI.getFlags();
3532 
3533   LLT Ty = MRI.getType(Dst);
3534 
3535   const fltSemantics *FltSemantics;
3536   if (Ty == LLT::scalar(32))
3537     FltSemantics = &APFloat::IEEEsingle();
3538   else if (Ty == LLT::scalar(64))
3539     FltSemantics = &APFloat::IEEEdouble();
3540   else
3541     return false;
3542 
3543   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3544     .addUse(Src)
3545     .setMIFlags(Flags);
3546 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq already quieted (or did not quiet) the input; just use the variant
  // that will directly select.
3549   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3550   const bool UseIEEE = MFI->getMode().IEEE;
3551 
3552   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3553   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3554                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3555 
3556   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3557 
3558   if (UseIEEE)
3559     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3560   else
3561     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3562   MI.eraseFromParent();
3563   return true;
3564 }
3565 
3566 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3567   switch (IID) {
3568   case Intrinsic::amdgcn_ds_fadd:
3569     return AMDGPU::G_ATOMICRMW_FADD;
3570   case Intrinsic::amdgcn_ds_fmin:
3571     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3572   case Intrinsic::amdgcn_ds_fmax:
3573     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3574   default:
3575     llvm_unreachable("not a DS FP intrinsic");
3576   }
3577 }
3578 
3579 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3580                                                       MachineInstr &MI,
3581                                                       Intrinsic::ID IID) const {
3582   GISelChangeObserver &Observer = Helper.Observer;
3583   Observer.changingInstr(MI);
3584 
3585   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3586 
3587   // The remaining operands were used to set fields in the MemOperand on
3588   // construction.
3589   for (int I = 6; I > 3; --I)
3590     MI.RemoveOperand(I);
3591 
3592   MI.RemoveOperand(1); // Remove the intrinsic ID.
3593   Observer.changedInstr(MI);
3594   return true;
3595 }
3596 
3597 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3598                                             MachineRegisterInfo &MRI,
3599                                             MachineIRBuilder &B) const {
3600   uint64_t Offset =
3601     ST.getTargetLowering()->getImplicitParameterOffset(
3602       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3603   LLT DstTy = MRI.getType(DstReg);
3604   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3605 
3606   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3607   if (!loadInputValue(KernargPtrReg, B,
3608                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3609     return false;
3610 
3611   // FIXME: This should be nuw
3612   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3613   return true;
3614 }
3615 
3616 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3617                                                  MachineRegisterInfo &MRI,
3618                                                  MachineIRBuilder &B) const {
3619   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3620   if (!MFI->isEntryFunction()) {
3621     return legalizePreloadedArgIntrin(MI, MRI, B,
3622                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3623   }
3624 
3625   Register DstReg = MI.getOperand(0).getReg();
3626   if (!getImplicitArgPtr(DstReg, MRI, B))
3627     return false;
3628 
3629   MI.eraseFromParent();
3630   return true;
3631 }
3632 
3633 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3634                                               MachineRegisterInfo &MRI,
3635                                               MachineIRBuilder &B,
3636                                               unsigned AddrSpace) const {
3637   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3638   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3639   Register Hi32 = Unmerge.getReg(1);
3640 
3641   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3642   MI.eraseFromParent();
3643   return true;
3644 }
3645 
3646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3647 // offset (the offset that is included in bounds checking and swizzling, to be
3648 // split between the instruction's voffset and immoffset fields) and soffset
3649 // (the offset that is excluded from bounds checking and swizzling, to go in
3650 // the instruction's soffset field).  This function takes the first kind of
3651 // offset and figures out how to split it between voffset and immoffset.
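// For example (illustrative): a constant offset contribution of 8200 is split
// into ImmOffset = 8 with 8192 added to the voffset, while a contribution of
// 4095 or less stays entirely in the immoffset field.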
3652 std::pair<Register, unsigned>
3653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3654                                         Register OrigOffset) const {
3655   const unsigned MaxImm = 4095;
3656   Register BaseReg;
3657   unsigned ImmOffset;
3658   const LLT S32 = LLT::scalar(32);
3659   MachineRegisterInfo &MRI = *B.getMRI();
3660 
3661   std::tie(BaseReg, ImmOffset) =
3662       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3663 
3664   // If BaseReg is a pointer, convert it to int.
3665   if (MRI.getType(BaseReg).isPointer())
3666     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3667 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field and move the rest, which
  // is a multiple of 4096, into the copy/add for the voffset field, so it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
3672   // However, do not do that rounding down to a multiple of 4096 if that is a
3673   // negative number, as it appears to be illegal to have a negative offset
3674   // in the vgpr, even if adding the immediate offset makes it positive.
3675   unsigned Overflow = ImmOffset & ~MaxImm;
3676   ImmOffset -= Overflow;
3677   if ((int32_t)Overflow < 0) {
3678     Overflow += ImmOffset;
3679     ImmOffset = 0;
3680   }
3681 
3682   if (Overflow != 0) {
3683     if (!BaseReg) {
3684       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3685     } else {
3686       auto OverflowVal = B.buildConstant(S32, Overflow);
3687       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3688     }
3689   }
3690 
3691   if (!BaseReg)
3692     BaseReg = B.buildConstant(S32, 0).getReg(0);
3693 
3694   return std::make_pair(BaseReg, ImmOffset);
3695 }
3696 
3697 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3698 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3699                                           Register VOffset, Register SOffset,
3700                                           unsigned ImmOffset, Register VIndex,
3701                                           MachineRegisterInfo &MRI) const {
3702   Optional<ValueAndVReg> MaybeVOffsetVal =
3703       getConstantVRegValWithLookThrough(VOffset, MRI);
3704   Optional<ValueAndVReg> MaybeSOffsetVal =
3705       getConstantVRegValWithLookThrough(SOffset, MRI);
3706   Optional<ValueAndVReg> MaybeVIndexVal =
3707       getConstantVRegValWithLookThrough(VIndex, MRI);
3708   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3709   // update the MMO with that offset. The stride is unknown so we can only do
3710   // this if VIndex is constant 0.
3711   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3712       MaybeVIndexVal->Value == 0) {
3713     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3714                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3715     MMO->setOffset(TotalOffset);
3716   } else {
3717     // We don't have a constant combined offset to use in the MMO. Give up.
3718     MMO->setValue((Value *)nullptr);
3719   }
3720 }
3721 
3722 /// Handle register layout difference for f16 images for some subtargets.
3723 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3724                                              MachineRegisterInfo &MRI,
3725                                              Register Reg,
3726                                              bool ImageStore) const {
3727   const LLT S16 = LLT::scalar(16);
3728   const LLT S32 = LLT::scalar(32);
3729   LLT StoreVT = MRI.getType(Reg);
3730   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3731 
3732   if (ST.hasUnpackedD16VMem()) {
3733     auto Unmerge = B.buildUnmerge(S16, Reg);
3734 
3735     SmallVector<Register, 4> WideRegs;
3736     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3737       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3738 
3739     int NumElts = StoreVT.getNumElements();
3740 
3741     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3742         .getReg(0);
3743   }
3744 
3745   if (ImageStore && ST.hasImageStoreD16Bug()) {
3746     if (StoreVT.getNumElements() == 2) {
3747       SmallVector<Register, 4> PackedRegs;
3748       Reg = B.buildBitcast(S32, Reg).getReg(0);
3749       PackedRegs.push_back(Reg);
3750       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3751       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3752           .getReg(0);
3753     }
3754 
3755     if (StoreVT.getNumElements() == 3) {
3756       SmallVector<Register, 4> PackedRegs;
3757       auto Unmerge = B.buildUnmerge(S16, Reg);
3758       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3759         PackedRegs.push_back(Unmerge.getReg(I));
3760       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3761       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3762       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3763     }
3764 
3765     if (StoreVT.getNumElements() == 4) {
3766       SmallVector<Register, 4> PackedRegs;
3767       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3768       auto Unmerge = B.buildUnmerge(S32, Reg);
3769       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3770         PackedRegs.push_back(Unmerge.getReg(I));
3771       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3772       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3773           .getReg(0);
3774     }
3775 
3776     llvm_unreachable("invalid data type");
3777   }
3778 
3779   return Reg;
3780 }
3781 
3782 Register AMDGPULegalizerInfo::fixStoreSourceType(
3783   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3784   MachineRegisterInfo *MRI = B.getMRI();
3785   LLT Ty = MRI->getType(VData);
3786 
3787   const LLT S16 = LLT::scalar(16);
3788 
  // Fix up illegal register types for i8 and i16 stores by extending to i32.
3790   if (Ty == LLT::scalar(8) || Ty == S16) {
3791     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3792     return AnyExt;
3793   }
3794 
3795   if (Ty.isVector()) {
3796     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3797       if (IsFormat)
3798         return handleD16VData(B, *MRI, VData);
3799     }
3800   }
3801 
3802   return VData;
3803 }
3804 
3805 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3806                                               MachineRegisterInfo &MRI,
3807                                               MachineIRBuilder &B,
3808                                               bool IsTyped,
3809                                               bool IsFormat) const {
3810   Register VData = MI.getOperand(1).getReg();
3811   LLT Ty = MRI.getType(VData);
3812   LLT EltTy = Ty.getScalarType();
3813   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3814   const LLT S32 = LLT::scalar(32);
3815 
3816   VData = fixStoreSourceType(B, VData, IsFormat);
3817   Register RSrc = MI.getOperand(2).getReg();
3818 
3819   MachineMemOperand *MMO = *MI.memoperands_begin();
3820   const int MemSize = MMO->getSize();
3821 
3822   unsigned ImmOffset;
3823 
3824   // The typed intrinsics add an immediate after the registers.
3825   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3826 
3827   // The struct intrinsic variants add one additional operand over raw.
3828   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3829   Register VIndex;
3830   int OpOffset = 0;
3831   if (HasVIndex) {
3832     VIndex = MI.getOperand(3).getReg();
3833     OpOffset = 1;
3834   } else {
3835     VIndex = B.buildConstant(S32, 0).getReg(0);
3836   }
3837 
3838   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3839   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3840 
3841   unsigned Format = 0;
3842   if (IsTyped) {
3843     Format = MI.getOperand(5 + OpOffset).getImm();
3844     ++OpOffset;
3845   }
3846 
3847   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3848 
3849   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3850   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3851 
3852   unsigned Opc;
3853   if (IsTyped) {
3854     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3855                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3856   } else if (IsFormat) {
3857     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3858                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3859   } else {
3860     switch (MemSize) {
3861     case 1:
3862       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3863       break;
3864     case 2:
3865       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3866       break;
3867     default:
3868       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3869       break;
3870     }
3871   }
3872 
3873   auto MIB = B.buildInstr(Opc)
3874     .addUse(VData)              // vdata
3875     .addUse(RSrc)               // rsrc
3876     .addUse(VIndex)             // vindex
3877     .addUse(VOffset)            // voffset
3878     .addUse(SOffset)            // soffset
3879     .addImm(ImmOffset);         // offset(imm)
3880 
3881   if (IsTyped)
3882     MIB.addImm(Format);
3883 
3884   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3885      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3886      .addMemOperand(MMO);
3887 
3888   MI.eraseFromParent();
3889   return true;
3890 }
3891 
3892 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3893                                              MachineRegisterInfo &MRI,
3894                                              MachineIRBuilder &B,
3895                                              bool IsFormat,
3896                                              bool IsTyped) const {
3897   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3898   MachineMemOperand *MMO = *MI.memoperands_begin();
3899   const LLT MemTy = MMO->getMemoryType();
3900   const LLT S32 = LLT::scalar(32);
3901 
3902   Register Dst = MI.getOperand(0).getReg();
3903   Register RSrc = MI.getOperand(2).getReg();
3904 
3905   // The typed intrinsics add an immediate after the registers.
3906   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3907 
3908   // The struct intrinsic variants add one additional operand over raw.
3909   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3910   Register VIndex;
3911   int OpOffset = 0;
3912   if (HasVIndex) {
3913     VIndex = MI.getOperand(3).getReg();
3914     OpOffset = 1;
3915   } else {
3916     VIndex = B.buildConstant(S32, 0).getReg(0);
3917   }
3918 
3919   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3920   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3921 
3922   unsigned Format = 0;
3923   if (IsTyped) {
3924     Format = MI.getOperand(5 + OpOffset).getImm();
3925     ++OpOffset;
3926   }
3927 
3928   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3929   unsigned ImmOffset;
3930 
3931   LLT Ty = MRI.getType(Dst);
3932   LLT EltTy = Ty.getScalarType();
3933   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3934   const bool Unpacked = ST.hasUnpackedD16VMem();
3935 
3936   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3937   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3938 
3939   unsigned Opc;
3940 
3941   if (IsTyped) {
3942     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3943                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3944   } else if (IsFormat) {
3945     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3946                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3947   } else {
3948     switch (MemTy.getSizeInBits()) {
3949     case 8:
3950       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3951       break;
3952     case 16:
3953       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3954       break;
3955     default:
3956       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3957       break;
3958     }
3959   }
3960 
3961   Register LoadDstReg;
3962 
3963   bool IsExtLoad =
3964       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
3965   LLT UnpackedTy = Ty.changeElementSize(32);
3966 
3967   if (IsExtLoad)
3968     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3969   else if (Unpacked && IsD16 && Ty.isVector())
3970     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3971   else
3972     LoadDstReg = Dst;
3973 
3974   auto MIB = B.buildInstr(Opc)
3975     .addDef(LoadDstReg)         // vdata
3976     .addUse(RSrc)               // rsrc
3977     .addUse(VIndex)             // vindex
3978     .addUse(VOffset)            // voffset
3979     .addUse(SOffset)            // soffset
3980     .addImm(ImmOffset);         // offset(imm)
3981 
3982   if (IsTyped)
3983     MIB.addImm(Format);
3984 
3985   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3986      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3987      .addMemOperand(MMO);
3988 
3989   if (LoadDstReg != Dst) {
3990     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3991 
    // The result for an extending load was widened; truncate back to the
    // original type.
3993     if (IsExtLoad)
3994       B.buildTrunc(Dst, LoadDstReg);
3995     else {
      // Repack to the original 16-bit vector result.
3997       // FIXME: G_TRUNC should work, but legalization currently fails
3998       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3999       SmallVector<Register, 4> Repack;
4000       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4001         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4002       B.buildMerge(Dst, Repack);
4003     }
4004   }
4005 
4006   MI.eraseFromParent();
4007   return true;
4008 }
4009 
4010 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4011                                                MachineIRBuilder &B,
4012                                                bool IsInc) const {
4013   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4014                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4015   B.buildInstr(Opc)
4016     .addDef(MI.getOperand(0).getReg())
4017     .addUse(MI.getOperand(2).getReg())
4018     .addUse(MI.getOperand(3).getReg())
4019     .cloneMemRefs(MI);
4020   MI.eraseFromParent();
4021   return true;
4022 }
4023 
4024 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4025   switch (IntrID) {
4026   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4027   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4028     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4029   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4031     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4032   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4033   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4034     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4035   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4037     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4038   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4039   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4040     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4041   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4042   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4043     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4044   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4045   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4046     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4047   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4048   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4049     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4050   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4051   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4052     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4053   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4054   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4055     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4056   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4057   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4058     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4059   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4060   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4061     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4062   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4063   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4064     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4065   case Intrinsic::amdgcn_buffer_atomic_fadd:
4066   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4067   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4068     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4069   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4070   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4071     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4072   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4073   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4074     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4075   default:
4076     llvm_unreachable("unhandled atomic opcode");
4077   }
4078 }
4079 
4080 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4081                                                MachineIRBuilder &B,
4082                                                Intrinsic::ID IID) const {
4083   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4084                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4085   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4086 
4087   Register Dst;
4088 
4089   int OpOffset = 0;
4090   if (HasReturn) {
4091     // A few FP atomics do not support return values.
4092     Dst = MI.getOperand(0).getReg();
4093   } else {
4094     OpOffset = -1;
4095   }
4096 
4097   Register VData = MI.getOperand(2 + OpOffset).getReg();
4098   Register CmpVal;
4099 
4100   if (IsCmpSwap) {
4101     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4102     ++OpOffset;
4103   }
4104 
4105   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4106   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4107 
4108   // The struct intrinsic variants add one additional operand over raw.
4109   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4110   Register VIndex;
4111   if (HasVIndex) {
4112     VIndex = MI.getOperand(4 + OpOffset).getReg();
4113     ++OpOffset;
4114   } else {
4115     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4116   }
4117 
4118   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4119   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4120   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4121 
4122   MachineMemOperand *MMO = *MI.memoperands_begin();
4123 
4124   unsigned ImmOffset;
4125   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4126   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4127 
4128   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4129 
4130   if (HasReturn)
4131     MIB.addDef(Dst);
4132 
4133   MIB.addUse(VData); // vdata
4134 
4135   if (IsCmpSwap)
4136     MIB.addReg(CmpVal);
4137 
4138   MIB.addUse(RSrc)               // rsrc
4139      .addUse(VIndex)             // vindex
4140      .addUse(VOffset)            // voffset
4141      .addUse(SOffset)            // soffset
4142      .addImm(ImmOffset)          // offset(imm)
4143      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4144      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4145      .addMemOperand(MMO);
4146 
4147   MI.eraseFromParent();
4148   return true;
4149 }
4150 
/// Pack the 16-bit address components of an image operation in \p MI into
/// dword sized <2 x s16> registers, appending the results to \p PackedAddrs.
4153 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4154                                       SmallVectorImpl<Register> &PackedAddrs,
4155                                       unsigned ArgOffset,
4156                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4157                                       bool IsA16, bool IsG16) {
4158   const LLT S16 = LLT::scalar(16);
4159   const LLT V2S16 = LLT::fixed_vector(2, 16);
4160   auto EndIdx = Intr->VAddrEnd;
4161 
4162   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4163     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4164     if (!SrcOp.isReg())
4165       continue; // _L to _LZ may have eliminated this.
4166 
4167     Register AddrReg = SrcOp.getReg();
4168 
4169     if ((I < Intr->GradientStart) ||
4170         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4171         (I >= Intr->CoordStart && !IsA16)) {
4172       // Handle any gradient or coordinate operands that should not be packed
4173       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4174       PackedAddrs.push_back(AddrReg);
4175     } else {
4176       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4177       // derivatives dx/dh and dx/dv are packed with undef.
4178       if (((I + 1) >= EndIdx) ||
4179           ((Intr->NumGradients / 2) % 2 == 1 &&
4180            (I == static_cast<unsigned>(Intr->GradientStart +
4181                                        (Intr->NumGradients / 2) - 1) ||
4182             I == static_cast<unsigned>(Intr->GradientStart +
4183                                        Intr->NumGradients - 1))) ||
4184           // Check for _L to _LZ optimization
4185           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4186         PackedAddrs.push_back(
4187             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4188                 .getReg(0));
4189       } else {
4190         PackedAddrs.push_back(
4191             B.buildBuildVector(
4192                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4193                 .getReg(0));
4194         ++I;
4195       }
4196     }
4197   }
4198 }
4199 
4200 /// Convert from separate vaddr components to a single vector address register,
4201 /// and replace the remaining operands with $noreg.
4202 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4203                                      int DimIdx, int NumVAddrs) {
4204   const LLT S32 = LLT::scalar(32);
4205 
4206   SmallVector<Register, 8> AddrRegs;
4207   for (int I = 0; I != NumVAddrs; ++I) {
4208     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4209     if (SrcOp.isReg()) {
4210       AddrRegs.push_back(SrcOp.getReg());
4211       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4212     }
4213   }
4214 
4215   int NumAddrRegs = AddrRegs.size();
4216   if (NumAddrRegs != 1) {
4217     // Above 8 elements round up to next power of 2 (i.e. 16).
4218     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4219       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4220       auto Undef = B.buildUndef(S32);
4221       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4222       NumAddrRegs = RoundedNumRegs;
4223     }
4224 
4225     auto VAddr =
4226         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4227     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4228   }
4229 
4230   for (int I = 1; I != NumVAddrs; ++I) {
4231     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4232     if (SrcOp.isReg())
4233       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4234   }
4235 }
4236 
4237 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4238 ///
4239 /// Depending on the subtarget, load/store with 16-bit element data need to be
4240 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4241 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4242 /// registers.
4243 ///
4244 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
4250 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4251     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4252     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4253 
4254   const unsigned NumDefs = MI.getNumExplicitDefs();
4255   const unsigned ArgOffset = NumDefs + 1;
4256   bool IsTFE = NumDefs == 2;
4257   // We are only processing the operands of d16 image operations on subtargets
4258   // that use the unpacked register layout, or need to repack the TFE result.
4259 
4260   // TODO: Do we need to guard against already legalized intrinsics?
4261   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4262       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4263 
4264   MachineRegisterInfo *MRI = B.getMRI();
4265   const LLT S32 = LLT::scalar(32);
4266   const LLT S16 = LLT::scalar(16);
4267   const LLT V2S16 = LLT::fixed_vector(2, 16);
4268 
4269   unsigned DMask = 0;
4270 
4271   // Check for 16-bit addresses and pack them if so.
4272   LLT GradTy =
4273       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4274   LLT AddrTy =
4275       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4276   const bool IsG16 = GradTy == S16;
4277   const bool IsA16 = AddrTy == S16;
4278 
4279   int DMaskLanes = 0;
4280   if (!BaseOpcode->Atomic) {
4281     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4282     if (BaseOpcode->Gather4) {
4283       DMaskLanes = 4;
4284     } else if (DMask != 0) {
4285       DMaskLanes = countPopulation(DMask);
4286     } else if (!IsTFE && !BaseOpcode->Store) {
4287       // If dmask is 0, this is a no-op load. This can be eliminated.
4288       B.buildUndef(MI.getOperand(0));
4289       MI.eraseFromParent();
4290       return true;
4291     }
4292   }
4293 
4294   Observer.changingInstr(MI);
4295   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4296 
4297   unsigned NewOpcode = NumDefs == 0 ?
4298     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4299 
4300   // Track that we legalized this
4301   MI.setDesc(B.getTII().get(NewOpcode));
4302 
4303   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4304   // to be at least 1, otherwise the instruction will fail.
4305   if (IsTFE && DMask == 0) {
4306     DMask = 0x1;
4307     DMaskLanes = 1;
4308     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4309   }
4310 
4311   if (BaseOpcode->Atomic) {
4312     Register VData0 = MI.getOperand(2).getReg();
4313     LLT Ty = MRI->getType(VData0);
4314 
4315     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4316     if (Ty.isVector())
4317       return false;
4318 
4319     if (BaseOpcode->AtomicX2) {
4320       Register VData1 = MI.getOperand(3).getReg();
4321       // The two values are packed in one register.
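           // For example (illustrative), a 32-bit X2 atomic such as cmpswap
           // becomes
           //   %packed:_(<2 x s32>) = G_BUILD_VECTOR %vdata0:_(s32), %vdata1:_(s32)
           // and the second data operand is set to $noreg.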
4322       LLT PackedTy = LLT::fixed_vector(2, Ty);
4323       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4324       MI.getOperand(2).setReg(Concat.getReg(0));
4325       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4326     }
4327   }
4328 
4329   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4330 
4331   // Optimize _L to _LZ when _L is zero
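       // For example (illustrative), image_sample_l with a constant lod that is
       // zero or negative is rewritten to image_sample_lz and the lod operand is
       // removed.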
4332   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4333           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4334     const ConstantFP *ConstantLod;
4335 
4336     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4337                  m_GFCst(ConstantLod))) {
4338       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4339         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4340         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4341             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4342                                                       Intr->Dim);
4343 
4344         // The starting indexes should remain in the same place.
4345         --CorrectedNumVAddrs;
4346 
4347         MI.getOperand(MI.getNumExplicitDefs())
4348             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4349         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4350         Intr = NewImageDimIntr;
4351       }
4352     }
4353   }
4354 
4355   // Optimize _mip away when 'lod' is zero.
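       // For example (illustrative), image_load_mip with a constant mip level of
       // 0 keeps its opcode here, but the mip operand is folded to an immediate 0
       // so the address packing below can skip it.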
4356   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4357     int64_t ConstantLod;
4358     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4359                  m_ICst(ConstantLod))) {
4360       if (ConstantLod == 0) {
4361         // TODO: Change the intrinsic opcode and remove the operand instead of
4362         // replacing it with 0, as is done above for the _L to _LZ handling.
4363         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4364         --CorrectedNumVAddrs;
4365       }
4366     }
4367   }
4368 
4369   // Rewrite the addressing register layout before doing anything else.
4370   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4371     // 16-bit gradients are supported, but they are tied to the A16 control,
4372     // so both gradients and addresses must be 16-bit.
4373     return false;
4374   }
4375 
4376   if (IsA16 && !ST.hasA16()) {
4377     // A16 not supported
4378     return false;
4379   }
4380 
4381   if (IsA16 || IsG16) {
4382     if (Intr->NumVAddrs > 1) {
4383       SmallVector<Register, 4> PackedRegs;
4384 
4385       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4386                                 IsG16);
4387 
4388       // See also below in the non-a16 branch
4389       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4390                           PackedRegs.size() <= ST.getNSAMaxSize();
4391 
4392       if (!UseNSA && PackedRegs.size() > 1) {
4393         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4394         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4395         PackedRegs[0] = Concat.getReg(0);
4396         PackedRegs.resize(1);
4397       }
4398 
4399       const unsigned NumPacked = PackedRegs.size();
4400       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4401         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4402         if (!SrcOp.isReg()) {
4403           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4404           continue;
4405         }
4406 
4407         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4408 
4409         if (I - Intr->VAddrStart < NumPacked)
4410           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4411         else
4412           SrcOp.setReg(AMDGPU::NoRegister);
4413       }
4414     }
4415   } else {
4416     // If the register allocator cannot place the address registers contiguously
4417     // without introducing moves, then using the non-sequential address encoding
4418     // is always preferable, since it saves VALU instructions and is usually a
4419     // wash in terms of code size or even better.
4420     //
4421     // However, we currently have no way of hinting to the register allocator
4422     // that MIMG addresses should be placed contiguously when it is possible to
4423     // do so, so force non-NSA for the common 2-address case as a heuristic.
4424     //
4425     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4426     // allocation when possible.
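         // Illustrative consequence: with NSA available, 3 or more address dwords
         // (up to the subtarget's NSA limit) stay in separate registers; otherwise
         // multiple address registers are merged into one contiguous vector below.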
4427     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4428                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4429 
4430     if (!UseNSA && Intr->NumVAddrs > 1)
4431       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4432                                Intr->NumVAddrs);
4433   }
4434 
4435   int Flags = 0;
4436   if (IsA16)
4437     Flags |= 1;
4438   if (IsG16)
4439     Flags |= 2;
4440   MI.addOperand(MachineOperand::CreateImm(Flags));
4441 
4442   if (BaseOpcode->Store) { // No TFE for stores?
4443     // TODO: Handle dmask trim
4444     Register VData = MI.getOperand(1).getReg();
4445     LLT Ty = MRI->getType(VData);
4446     if (!Ty.isVector() || Ty.getElementType() != S16)
4447       return true;
4448 
4449     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4450     if (RepackedReg != VData) {
4451       MI.getOperand(1).setReg(RepackedReg);
4452     }
4453 
4454     return true;
4455   }
4456 
4457   Register DstReg = MI.getOperand(0).getReg();
4458   LLT Ty = MRI->getType(DstReg);
4459   const LLT EltTy = Ty.getScalarType();
4460   const bool IsD16 = EltTy == S16;
4461   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4462 
4463   // Confirm that the return type is large enough for the dmask specified
4464   if (NumElts < DMaskLanes)
4465     return false;
4466 
4467   if (NumElts > 4 || DMaskLanes > 4)
4468     return false;
4469 
4470   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4471   const LLT AdjustedTy =
4472       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4473 
4474   // The raw, dword-aligned data component of the load. The only legal cases
4475   // where this matters should be when using the packed D16 format, for
4476   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4477   LLT RoundedTy;
4478 
4479   // S32 vector to cover all data, plus the TFE result element.
4480   LLT TFETy;
4481 
4482   // Register type to use for each loaded component. Will be S32 or V2S16.
4483   LLT RegTy;
4484 
4485   if (IsD16 && ST.hasUnpackedD16VMem()) {
4486     RoundedTy =
4487         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4488     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4489     RegTy = S32;
4490   } else {
4491     unsigned EltSize = EltTy.getSizeInBits();
4492     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4493     unsigned RoundedSize = 32 * RoundedElts;
4494     RoundedTy = LLT::scalarOrVector(
4495         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4496     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4497     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4498   }
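       // For example (illustrative), a packed-d16 load of <4 x s16> gives
       // RoundedTy = <4 x s16>; with TFE the loaded value is TFETy = <3 x s32>
       // (two data dwords plus the status dword) and RegTy = s32, while without
       // TFE, RegTy = <2 x s16>.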
4499 
4500   // The return type does not need adjustment.
4501   // TODO: Should we change s16 case to s32 or <2 x s16>?
4502   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4503     return true;
4504 
4505   Register Dst1Reg;
4506 
4507   // Insert after the instruction.
4508   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4509 
4510   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4511   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4512   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4513   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4514 
4515   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4516 
4517   MI.getOperand(0).setReg(NewResultReg);
4518 
4519   // In the IR, TFE is supposed to be used with a 2-element struct return
4520   // type. The instruction really returns these two values in one contiguous
4521   // register, with one additional dword beyond the loaded data. Rewrite the
4522   // return type to use a single register result.
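       // For example (illustrative), an IR result of {<4 x s32>, i32} with TFE is
       // loaded as a single <5 x s32> register, and the trailing status dword is
       // split back out by the unmerge below.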
4523 
4524   if (IsTFE) {
4525     Dst1Reg = MI.getOperand(1).getReg();
4526     if (MRI->getType(Dst1Reg) != S32)
4527       return false;
4528 
4529     // TODO: Make sure the TFE operand bit is set.
4530     MI.RemoveOperand(1);
4531 
4532     // Handle the easy case that requires no repack instructions.
4533     if (Ty == S32) {
4534       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4535       return true;
4536     }
4537   }
4538 
4539   // Now figure out how to copy the new result register back into the old
4540   // result.
4541   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4542 
4543   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4544 
4545   if (ResultNumRegs == 1) {
4546     assert(!IsTFE);
4547     ResultRegs[0] = NewResultReg;
4548   } else {
4549     // We have to repack into a new vector of some kind.
4550     for (int I = 0; I != NumDataRegs; ++I)
4551       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4552     B.buildUnmerge(ResultRegs, NewResultReg);
4553 
4554     // Drop the final TFE element to get the data part. The TFE result is
4555     // directly written to the right place already.
4556     if (IsTFE)
4557       ResultRegs.resize(NumDataRegs);
4558   }
4559 
4560   // For an s16 scalar result, we form an s32 result with a truncate regardless
4561   // of packed vs. unpacked.
4562   if (IsD16 && !Ty.isVector()) {
4563     B.buildTrunc(DstReg, ResultRegs[0]);
4564     return true;
4565   }
4566 
4567   // Avoid a build/concat_vector of 1 entry.
4568   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4569     B.buildBitcast(DstReg, ResultRegs[0]);
4570     return true;
4571   }
4572 
4573   assert(Ty.isVector());
4574 
4575   if (IsD16) {
4576     // For packed D16 results with TFE enabled, all the data components are
4577     // S32. Cast back to the expected type.
4578     //
4579     // TODO: We don't really need to load s32 elements. We would only need one
4580     // cast for the TFE result if a multiple of v2s16 was used.
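         // Illustrative: on packed-d16 subtargets with TFE, each s32 data dword is
         // bitcast back to <2 x s16>; on unpacked subtargets each s32 holds a
         // single s16 value in its low bits and is truncated.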
4581     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4582       for (Register &Reg : ResultRegs)
4583         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4584     } else if (ST.hasUnpackedD16VMem()) {
4585       for (Register &Reg : ResultRegs)
4586         Reg = B.buildTrunc(S16, Reg).getReg(0);
4587     }
4588   }
4589 
4590   auto padWithUndef = [&](LLT Ty, int NumElts) {
4591     if (NumElts == 0)
4592       return;
4593     Register Undef = B.buildUndef(Ty).getReg(0);
4594     for (int I = 0; I != NumElts; ++I)
4595       ResultRegs.push_back(Undef);
4596   };
4597 
4598   // Pad out any elements eliminated due to the dmask.
4599   LLT ResTy = MRI->getType(ResultRegs[0]);
4600   if (!ResTy.isVector()) {
4601     padWithUndef(ResTy, NumElts - ResultRegs.size());
4602     B.buildBuildVector(DstReg, ResultRegs);
4603     return true;
4604   }
4605 
4606   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4607   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4608 
4609   // Deal with the one annoying legal case: a <3 x s16> result.
4610   const LLT V3S16 = LLT::fixed_vector(3, 16);
4611   if (Ty == V3S16) {
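         // Pad the <2 x s16> pieces out to three dwords, concatenate to <6 x s16>,
         // and then split off the desired <3 x s16> low half; the high <3 x s16>
         // half is dead.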
4612     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4613     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4614     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4615     return true;
4616   }
4617 
4618   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4619   B.buildConcatVectors(DstReg, ResultRegs);
4620   return true;
4621 }
4622 
4623 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4624   LegalizerHelper &Helper, MachineInstr &MI) const {
4625   MachineIRBuilder &B = Helper.MIRBuilder;
4626   GISelChangeObserver &Observer = Helper.Observer;
4627 
4628   Register Dst = MI.getOperand(0).getReg();
4629   LLT Ty = B.getMRI()->getType(Dst);
4630   unsigned Size = Ty.getSizeInBits();
4631   MachineFunction &MF = B.getMF();
4632 
4633   Observer.changingInstr(MI);
4634 
4635   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4636     Ty = getBitcastRegisterType(Ty);
4637     Helper.bitcastDst(MI, Ty, 0);
4638     Dst = MI.getOperand(0).getReg();
4639     B.setInsertPt(B.getMBB(), MI);
4640   }
4641 
4642   // FIXME: We don't really need this intermediate instruction. The intrinsic
4643   // should be fixed to have a memory operand. Since it's readnone, we're not
4644   // allowed to add one.
4645   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4646   MI.RemoveOperand(1); // Remove intrinsic ID
4647 
4648   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4649   // TODO: Should this use datalayout alignment?
4650   const unsigned MemSize = (Size + 7) / 8;
4651   const Align MemAlign(4);
4652   MachineMemOperand *MMO = MF.getMachineMemOperand(
4653       MachinePointerInfo(),
4654       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4655           MachineMemOperand::MOInvariant,
4656       MemSize, MemAlign);
4657   MI.addMemOperand(MF, MMO);
4658 
4659   // There are no 96-bit result scalar loads, but widening to 128-bit should
4660   // always be legal. We may need to restore this to a 96-bit result if it turns
4661   // out this needs to be converted to a vector load during RegBankSelect.
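       // For example (illustrative), an s96 result is widened to s128 and a
       // <3 x s32> result becomes <4 x s32>.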
4662   if (!isPowerOf2_32(Size)) {
4663     if (Ty.isVector())
4664       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4665     else
4666       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4667   }
4668 
4669   Observer.changedInstr(MI);
4670   return true;
4671 }
4672 
4673 // TODO: Move to selection
4674 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4675                                                 MachineRegisterInfo &MRI,
4676                                                 MachineIRBuilder &B) const {
4677   if (!ST.isTrapHandlerEnabled() ||
4678       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4679     return legalizeTrapEndpgm(MI, MRI, B);
4680 
4681   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4682     switch (*HsaAbiVer) {
4683     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4684     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4685       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4686     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4687       return ST.supportsGetDoorbellID() ?
4688           legalizeTrapHsa(MI, MRI, B) :
4689           legalizeTrapHsaQueuePtr(MI, MRI, B);
4690     }
4691   }
4692 
4693   llvm_unreachable("Unknown trap handler");
4694 }
4695 
4696 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4697     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4698   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4699   MI.eraseFromParent();
4700   return true;
4701 }
4702 
4703 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4704     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4705   // Pass queue pointer to trap handler as input, and insert trap instruction
4706   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
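       // The 64-bit queue pointer is materialized as a live-in and copied into
       // SGPR0_SGPR1, which is attached to S_TRAP as an implicit use.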
4707   Register LiveIn =
4708     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4709   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4710     return false;
4711 
4712   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4713   B.buildCopy(SGPR01, LiveIn);
4714   B.buildInstr(AMDGPU::S_TRAP)
4715       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4716       .addReg(SGPR01, RegState::Implicit);
4717 
4718   MI.eraseFromParent();
4719   return true;
4720 }
4721 
4722 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4723     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4724   B.buildInstr(AMDGPU::S_TRAP)
4725       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4726   MI.eraseFromParent();
4727   return true;
4728 }
4729 
4730 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4731     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4732   // If this is a non-HSA path or the trap handler is disabled, report a
4733   // warning accordingly.
4734   if (!ST.isTrapHandlerEnabled() ||
4735       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4736     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4737                                      "debugtrap handler not supported",
4738                                      MI.getDebugLoc(), DS_Warning);
4739     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4740     Ctx.diagnose(NoTrap);
4741   } else {
4742     // Insert debug-trap instruction
4743     B.buildInstr(AMDGPU::S_TRAP)
4744         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4745   }
4746 
4747   MI.eraseFromParent();
4748   return true;
4749 }
4750 
4751 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4752                                                MachineIRBuilder &B) const {
4753   MachineRegisterInfo &MRI = *B.getMRI();
4754   const LLT S16 = LLT::scalar(16);
4755   const LLT S32 = LLT::scalar(32);
4756 
4757   Register DstReg = MI.getOperand(0).getReg();
4758   Register NodePtr = MI.getOperand(2).getReg();
4759   Register RayExtent = MI.getOperand(3).getReg();
4760   Register RayOrigin = MI.getOperand(4).getReg();
4761   Register RayDir = MI.getOperand(5).getReg();
4762   Register RayInvDir = MI.getOperand(6).getReg();
4763   Register TDescr = MI.getOperand(7).getReg();
4764 
4765   if (!ST.hasGFX10_AEncoding()) {
4766     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4767                                         "intrinsic not supported on subtarget",
4768                                         MI.getDebugLoc());
4769     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4770     return false;
4771   }
4772 
4773   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4774   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4775   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4776                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4777                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4778                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4779 
4780   SmallVector<Register, 12> Ops;
4781   if (Is64) {
4782     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4783     Ops.push_back(Unmerge.getReg(0));
4784     Ops.push_back(Unmerge.getReg(1));
4785   } else {
4786     Ops.push_back(NodePtr);
4787   }
4788   Ops.push_back(RayExtent);
4789 
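       // Unmerge a 128-bit vector operand and push only its first three 32-bit
       // lanes; the fourth lane is not used.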
4790   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4791     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4792     Ops.push_back(Unmerge.getReg(0));
4793     Ops.push_back(Unmerge.getReg(1));
4794     Ops.push_back(Unmerge.getReg(2));
4795   };
4796 
4797   packLanes(RayOrigin);
4798   if (IsA16) {
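         // Repack the six used s16 components into three s32 dwords:
         //   (dir.x, dir.y), (dir.z, inv_dir.x), (inv_dir.y, inv_dir.z).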
4799     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4800     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4801     Register R1 = MRI.createGenericVirtualRegister(S32);
4802     Register R2 = MRI.createGenericVirtualRegister(S32);
4803     Register R3 = MRI.createGenericVirtualRegister(S32);
4804     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4805     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4806     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4807     Ops.push_back(R1);
4808     Ops.push_back(R2);
4809     Ops.push_back(R3);
4810   } else {
4811     packLanes(RayDir);
4812     packLanes(RayInvDir);
4813   }
4814 
4815   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4816     .addDef(DstReg)
4817     .addImm(Opcode);
4818 
4819   for (Register R : Ops) {
4820     MIB.addUse(R);
4821   }
4822 
4823   MIB.addUse(TDescr)
4824      .addImm(IsA16 ? 1 : 0)
4825      .cloneMemRefs(MI);
4826 
4827   MI.eraseFromParent();
4828   return true;
4829 }
4830 
4831 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4832                                             MachineInstr &MI) const {
4833   MachineIRBuilder &B = Helper.MIRBuilder;
4834   MachineRegisterInfo &MRI = *B.getMRI();
4835 
4836   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4837   auto IntrID = MI.getIntrinsicID();
4838   switch (IntrID) {
4839   case Intrinsic::amdgcn_if:
4840   case Intrinsic::amdgcn_else: {
4841     MachineInstr *Br = nullptr;
4842     MachineBasicBlock *UncondBrTarget = nullptr;
4843     bool Negated = false;
4844     if (MachineInstr *BrCond =
4845             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4846       const SIRegisterInfo *TRI
4847         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4848 
4849       Register Def = MI.getOperand(1).getReg();
4850       Register Use = MI.getOperand(3).getReg();
4851 
4852       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4853 
4854       if (Negated)
4855         std::swap(CondBrTarget, UncondBrTarget);
4856 
4857       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4858       if (IntrID == Intrinsic::amdgcn_if) {
4859         B.buildInstr(AMDGPU::SI_IF)
4860           .addDef(Def)
4861           .addUse(Use)
4862           .addMBB(UncondBrTarget);
4863       } else {
4864         B.buildInstr(AMDGPU::SI_ELSE)
4865             .addDef(Def)
4866             .addUse(Use)
4867             .addMBB(UncondBrTarget);
4868       }
4869 
4870       if (Br) {
4871         Br->getOperand(0).setMBB(CondBrTarget);
4872       } else {
4873         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4874         // since we're swapping branch targets it needs to be reinserted.
4875         // FIXME: IRTranslator should probably not do this
4876         B.buildBr(*CondBrTarget);
4877       }
4878 
4879       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4880       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4881       MI.eraseFromParent();
4882       BrCond->eraseFromParent();
4883       return true;
4884     }
4885 
4886     return false;
4887   }
4888   case Intrinsic::amdgcn_loop: {
4889     MachineInstr *Br = nullptr;
4890     MachineBasicBlock *UncondBrTarget = nullptr;
4891     bool Negated = false;
4892     if (MachineInstr *BrCond =
4893             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4894       const SIRegisterInfo *TRI
4895         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4896 
4897       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4898       Register Reg = MI.getOperand(2).getReg();
4899 
4900       if (Negated)
4901         std::swap(CondBrTarget, UncondBrTarget);
4902 
4903       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4904       B.buildInstr(AMDGPU::SI_LOOP)
4905         .addUse(Reg)
4906         .addMBB(UncondBrTarget);
4907 
4908       if (Br)
4909         Br->getOperand(0).setMBB(CondBrTarget);
4910       else
4911         B.buildBr(*CondBrTarget);
4912 
4913       MI.eraseFromParent();
4914       BrCond->eraseFromParent();
4915       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4916       return true;
4917     }
4918 
4919     return false;
4920   }
4921   case Intrinsic::amdgcn_kernarg_segment_ptr:
4922     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4923       // This only makes sense to call in a kernel, so just lower to null.
4924       B.buildConstant(MI.getOperand(0).getReg(), 0);
4925       MI.eraseFromParent();
4926       return true;
4927     }
4928 
4929     return legalizePreloadedArgIntrin(
4930       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4931   case Intrinsic::amdgcn_implicitarg_ptr:
4932     return legalizeImplicitArgPtr(MI, MRI, B);
4933   case Intrinsic::amdgcn_workitem_id_x:
4934     return legalizePreloadedArgIntrin(MI, MRI, B,
4935                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4936   case Intrinsic::amdgcn_workitem_id_y:
4937     return legalizePreloadedArgIntrin(MI, MRI, B,
4938                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4939   case Intrinsic::amdgcn_workitem_id_z:
4940     return legalizePreloadedArgIntrin(MI, MRI, B,
4941                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4942   case Intrinsic::amdgcn_workgroup_id_x:
4943     return legalizePreloadedArgIntrin(MI, MRI, B,
4944                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4945   case Intrinsic::amdgcn_workgroup_id_y:
4946     return legalizePreloadedArgIntrin(MI, MRI, B,
4947                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4948   case Intrinsic::amdgcn_workgroup_id_z:
4949     return legalizePreloadedArgIntrin(MI, MRI, B,
4950                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4951   case Intrinsic::amdgcn_dispatch_ptr:
4952     return legalizePreloadedArgIntrin(MI, MRI, B,
4953                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4954   case Intrinsic::amdgcn_queue_ptr:
4955     return legalizePreloadedArgIntrin(MI, MRI, B,
4956                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4957   case Intrinsic::amdgcn_implicit_buffer_ptr:
4958     return legalizePreloadedArgIntrin(
4959       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4960   case Intrinsic::amdgcn_dispatch_id:
4961     return legalizePreloadedArgIntrin(MI, MRI, B,
4962                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4963   case Intrinsic::amdgcn_fdiv_fast:
4964     return legalizeFDIVFastIntrin(MI, MRI, B);
4965   case Intrinsic::amdgcn_is_shared:
4966     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4967   case Intrinsic::amdgcn_is_private:
4968     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4969   case Intrinsic::amdgcn_wavefrontsize: {
4970     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4971     MI.eraseFromParent();
4972     return true;
4973   }
4974   case Intrinsic::amdgcn_s_buffer_load:
4975     return legalizeSBufferLoad(Helper, MI);
4976   case Intrinsic::amdgcn_raw_buffer_store:
4977   case Intrinsic::amdgcn_struct_buffer_store:
4978     return legalizeBufferStore(MI, MRI, B, false, false);
4979   case Intrinsic::amdgcn_raw_buffer_store_format:
4980   case Intrinsic::amdgcn_struct_buffer_store_format:
4981     return legalizeBufferStore(MI, MRI, B, false, true);
4982   case Intrinsic::amdgcn_raw_tbuffer_store:
4983   case Intrinsic::amdgcn_struct_tbuffer_store:
4984     return legalizeBufferStore(MI, MRI, B, true, true);
4985   case Intrinsic::amdgcn_raw_buffer_load:
4986   case Intrinsic::amdgcn_struct_buffer_load:
4987     return legalizeBufferLoad(MI, MRI, B, false, false);
4988   case Intrinsic::amdgcn_raw_buffer_load_format:
4989   case Intrinsic::amdgcn_struct_buffer_load_format:
4990     return legalizeBufferLoad(MI, MRI, B, true, false);
4991   case Intrinsic::amdgcn_raw_tbuffer_load:
4992   case Intrinsic::amdgcn_struct_tbuffer_load:
4993     return legalizeBufferLoad(MI, MRI, B, true, true);
4994   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4995   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4996   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4997   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4998   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4999   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5000   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5001   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5002   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5003   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5004   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5005   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5006   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5007   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5008   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5009   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5010   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5011   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5012   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5013   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5014   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5015   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5016   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5017   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5018   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5019   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5020   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5021   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5022   case Intrinsic::amdgcn_buffer_atomic_fadd:
5023   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5024   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5025   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5026   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5027     return legalizeBufferAtomic(MI, B, IntrID);
5028   case Intrinsic::amdgcn_atomic_inc:
5029     return legalizeAtomicIncDec(MI, B, true);
5030   case Intrinsic::amdgcn_atomic_dec:
5031     return legalizeAtomicIncDec(MI, B, false);
5032   case Intrinsic::trap:
5033     return legalizeTrapIntrinsic(MI, MRI, B);
5034   case Intrinsic::debugtrap:
5035     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5036   case Intrinsic::amdgcn_rsq_clamp:
5037     return legalizeRsqClampIntrinsic(MI, MRI, B);
5038   case Intrinsic::amdgcn_ds_fadd:
5039   case Intrinsic::amdgcn_ds_fmin:
5040   case Intrinsic::amdgcn_ds_fmax:
5041     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5042   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5043     return legalizeBVHIntrinsic(MI, B);
5044   default: {
5045     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5046             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5047       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5048     return true;
5049   }
5050   }
5051 
5052   return true;
5053 }
5054