1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
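// e.g. <3 x s16> becomes <4 x s16>, and <5 x s32> becomes <8 x s32>.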
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
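// e.g. s48 becomes s64, and s65 becomes s128.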
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
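// Reduce the number of vector elements so each resulting piece is roughly
// 64 bits; used with fewerElementsIf to break up wide vectors, e.g. <4 x s32>
// is reduced to <2 x s32>.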
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
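// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits), and <5 x s8> (40 bits)
// becomes <8 x s8> (64 bits).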
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
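  // e.g. <6 x s16> -> <3 x s32>, <8 x s16> -> <4 x s32>.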
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
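// Whether this size, in bits, fills a whole number of 32-bit registers, up to
// the 1024-bit maximum register size.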
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
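// For example, an extending load of 32-bit memory into an s64 result is
// narrowed so that the load itself produces s32.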
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
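// Check whether the register type, memory type, alignment, and address space
// of a load or store describe a legal access size.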
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
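/// For example, on a subtarget without 96-bit memory operations, a 96-bit
/// load with 128-bit alignment may be widened to a 128-bit load.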
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816                     .lowerIf(typeIs(1, S1))
817                     .customFor({{S32, S64}, {S64, S64}});
818   if (ST.has16BitInsts())
819     IToFP.legalFor({{S16, S16}});
820   IToFP.clampScalar(1, S32, S64)
821        .minScalar(0, S32)
822        .scalarize(0)
823        .widenScalarToNextPow2(1);
824 
825   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827     .customFor({{S64, S32}, {S64, S64}})
828     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829   if (ST.has16BitInsts())
830     FPToI.legalFor({{S16, S16}});
831   else
832     FPToI.minScalar(1, S32);
833 
834   FPToI.minScalar(0, S32)
835        .widenScalarToNextPow2(0, 32)
836        .scalarize(0)
837        .lower();
838 
839   // Lower roundeven into G_FRINT
840   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841     .scalarize(0)
842     .lower();
843 
844   if (ST.has16BitInsts()) {
845     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846       .legalFor({S16, S32, S64})
847       .clampScalar(0, S16, S64)
848       .scalarize(0);
849   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851       .legalFor({S32, S64})
852       .clampScalar(0, S32, S64)
853       .scalarize(0);
854   } else {
855     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856       .legalFor({S32})
857       .customFor({S64})
858       .clampScalar(0, S32, S64)
859       .scalarize(0);
860   }
861 
862   getActionDefinitionsBuilder(G_PTR_ADD)
863     .legalIf(all(isPointer(0), sameSize(0, 1)))
864     .scalarize(0)
865     .scalarSameSizeAs(1, 0);
866 
867   getActionDefinitionsBuilder(G_PTRMASK)
868     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869     .scalarSameSizeAs(1, 0)
870     .scalarize(0);
871 
872   auto &CmpBuilder =
873     getActionDefinitionsBuilder(G_ICMP)
874     // The compare output type differs based on the register bank of the output,
875     // so make both s1 and s32 legal.
876     //
877     // Scalar compares producing output in scc will be promoted to s32, as that
878     // is the allocatable register type that will be needed for the copy from
879     // scc. This will be promoted during RegBankSelect, and we assume something
880     // before that won't try to use s32 result types.
881     //
882     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883     // bank.
884     .legalForCartesianProduct(
885       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886     .legalForCartesianProduct(
887       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888   if (ST.has16BitInsts()) {
889     CmpBuilder.legalFor({{S1, S16}});
890   }
891 
892   CmpBuilder
893     .widenScalarToNextPow2(1)
894     .clampScalar(1, S32, S64)
895     .scalarize(0)
896     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
897 
898   getActionDefinitionsBuilder(G_FCMP)
899     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900     .widenScalarToNextPow2(1)
901     .clampScalar(1, S32, S64)
902     .scalarize(0);
903 
904   // FIXME: fpow has a selection pattern that should move to custom lowering.
905   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906   if (ST.has16BitInsts())
907     Exp2Ops.legalFor({S32, S16});
908   else
909     Exp2Ops.legalFor({S32});
910   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911   Exp2Ops.scalarize(0);
912 
913   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914   if (ST.has16BitInsts())
915     ExpOps.customFor({{S32}, {S16}});
916   else
917     ExpOps.customFor({S32});
918   ExpOps.clampScalar(0, MinScalarFPTy, S32)
919         .scalarize(0);
920 
921   getActionDefinitionsBuilder(G_FPOWI)
922     .clampScalar(0, MinScalarFPTy, S32)
923     .lower();
924 
925   // The 64-bit versions produce 32-bit results, but only on the SALU.
926   getActionDefinitionsBuilder(G_CTPOP)
927     .legalFor({{S32, S32}, {S32, S64}})
928     .clampScalar(0, S32, S32)
929     .clampScalar(1, S32, S64)
930     .scalarize(0)
931     .widenScalarToNextPow2(0, 32)
932     .widenScalarToNextPow2(1, 32);
933 
934   // The hardware instructions return a different result on 0 than the generic
935   // instructions expect. The hardware produces -1, but these produce the
936   // bitwidth.
937   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938     .scalarize(0)
939     .clampScalar(0, S32, S32)
940     .clampScalar(1, S32, S64)
941     .widenScalarToNextPow2(0, 32)
942     .widenScalarToNextPow2(1, 32)
943     .lower();
944 
945   // The 64-bit versions produce 32-bit results, but only on the SALU.
946   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947     .legalFor({{S32, S32}, {S32, S64}})
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .scalarize(0)
951     .widenScalarToNextPow2(0, 32)
952     .widenScalarToNextPow2(1, 32);
953 
954   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955   // RegBankSelect.
956   getActionDefinitionsBuilder(G_BITREVERSE)
957     .legalFor({S32, S64})
958     .clampScalar(0, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0);
961 
962   if (ST.has16BitInsts()) {
963     getActionDefinitionsBuilder(G_BSWAP)
964       .legalFor({S16, S32, V2S16})
965       .clampMaxNumElements(0, S16, 2)
966       // FIXME: Fixing non-power-of-2 before clamp is workaround for
967       // narrowScalar limitation.
968       .widenScalarToNextPow2(0)
969       .clampScalar(0, S16, S32)
970       .scalarize(0);
971 
972     if (ST.hasVOP3PInsts()) {
973       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974         .legalFor({S32, S16, V2S16})
975         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976         .clampMaxNumElements(0, S16, 2)
977         .minScalar(0, S16)
978         .widenScalarToNextPow2(0)
979         .scalarize(0)
980         .lower();
981     } else {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16})
984         .widenScalarToNextPow2(0)
985         .minScalar(0, S16)
986         .scalarize(0)
987         .lower();
988     }
989   } else {
990     // TODO: Should have same legality without v_perm_b32
991     getActionDefinitionsBuilder(G_BSWAP)
992       .legalFor({S32})
993       .lowerIf(scalarNarrowerThan(0, 32))
994       // FIXME: Fixing non-power-of-2 before clamp is workaround for
995       // narrowScalar limitation.
996       .widenScalarToNextPow2(0)
997       .maxScalar(0, S32)
998       .scalarize(0)
999       .lower();
1000 
1001     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002       .legalFor({S32})
1003       .minScalar(0, S32)
1004       .widenScalarToNextPow2(0)
1005       .scalarize(0)
1006       .lower();
1007   }
1008 
1009   getActionDefinitionsBuilder(G_INTTOPTR)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       })
1020     .narrowScalarIf(largerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       });
1024 
1025   getActionDefinitionsBuilder(G_PTRTOINT)
1026     // List the common cases
1027     .legalForCartesianProduct(AddrSpaces64, {S64})
1028     .legalForCartesianProduct(AddrSpaces32, {S32})
1029     .scalarize(0)
1030     // Accept any address space as long as the size matches
1031     .legalIf(sameSize(0, 1))
1032     .widenScalarIf(smallerThan(0, 1),
1033       [](const LegalityQuery &Query) {
1034         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035       })
1036     .narrowScalarIf(
1037       largerThan(0, 1),
1038       [](const LegalityQuery &Query) {
1039         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040       });
1041 
1042   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043     .scalarize(0)
1044     .custom();
1045 
1046   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047                                     bool IsLoad) -> bool {
1048     const LLT DstTy = Query.Types[0];
1049 
1050     // Split vector extloads.
1051     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053 
1054     if (MemSize < DstTy.getSizeInBits())
1055       MemSize = std::max(MemSize, AlignBits);
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     if (AlignBits < MemSize) {
1078       const SITargetLowering *TLI = ST.getTargetLowering();
1079       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080                                                       Align(AlignBits / 8));
1081     }
1082 
1083     return false;
1084   };
1085 
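  // These act as the minimum alignment, in bits, required by the explicit
  // global/constant rules below; 0 removes the alignment requirement on
  // subtargets with unaligned buffer access enabled.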
1086   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089 
1090   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091   // LDS
1092   // TODO: Unsupported flat for SI.
1093 
1094   for (unsigned Op : {G_LOAD, G_STORE}) {
1095     const bool IsStore = Op == G_STORE;
1096 
1097     auto &Actions = getActionDefinitionsBuilder(Op);
1098     // Explicitly list some common cases.
1099     // TODO: Does this help compile time at all?
1100     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103                                       {S64, GlobalPtr, S64, GlobalAlign32},
1104                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106                                       {S32, GlobalPtr, S8, GlobalAlign8},
1107                                       {S32, GlobalPtr, S16, GlobalAlign16},
1108 
1109                                       {S32, LocalPtr, S32, 32},
1110                                       {S64, LocalPtr, S64, 32},
1111                                       {V2S32, LocalPtr, V2S32, 32},
1112                                       {S32, LocalPtr, S8, 8},
1113                                       {S32, LocalPtr, S16, 16},
1114                                       {V2S16, LocalPtr, S32, 32},
1115 
1116                                       {S32, PrivatePtr, S32, 32},
1117                                       {S32, PrivatePtr, S8, 8},
1118                                       {S32, PrivatePtr, S16, 16},
1119                                       {V2S16, PrivatePtr, S32, 32},
1120 
1121                                       {S32, ConstantPtr, S32, GlobalAlign32},
1122                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124                                       {S64, ConstantPtr, S64, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126     Actions.legalIf(
1127       [=](const LegalityQuery &Query) -> bool {
1128         return isLoadStoreLegal(ST, Query);
1129       });
1130 
1131     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132     // 64-bits.
1133     //
1134     // TODO: Should generalize bitcast action into coerce, which will also cover
1135     // inserting addrspacecasts.
1136     Actions.customIf(typeIs(1, Constant32Ptr));
1137 
1138     // Turn any illegal element vectors into something easier to deal
1139     // with. These will ultimately produce 32-bit scalar shifts to extract the
1140     // parts anyway.
1141     //
1142     // For odd 16-bit element vectors, prefer to split those into pieces with
1143     // 16-bit vector parts.
1144     Actions.bitcastIf(
1145       [=](const LegalityQuery &Query) -> bool {
1146         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147                                           Query.MMODescrs[0].MemoryTy);
1148       }, bitcastToRegisterType(0));
1149 
1150     if (!IsStore) {
1151       // Widen suitably aligned loads by loading extra bytes. The standard
1152       // legalization actions can't properly express widening memory operands.
1153       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154         return shouldWidenLoad(ST, Query, G_LOAD);
1155       });
1156     }
1157 
1158     // FIXME: load/store narrowing should be moved to lower action
1159     Actions
1160         .narrowScalarIf(
1161             [=](const LegalityQuery &Query) -> bool {
1162               return !Query.Types[0].isVector() &&
1163                      needToSplitMemOp(Query, Op == G_LOAD);
1164             },
1165             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166               const LLT DstTy = Query.Types[0];
1167               const LLT PtrTy = Query.Types[1];
1168 
1169               const unsigned DstSize = DstTy.getSizeInBits();
1170               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171 
1172               // Split extloads.
1173               if (DstSize > MemSize)
1174                 return std::make_pair(0, LLT::scalar(MemSize));
1175 
1176               if (!isPowerOf2_32(DstSize)) {
1177                 // We're probably decomposing an odd sized store. Try to split
1178                 // to the widest type. TODO: Account for alignment. As-is it
1179                 // should be OK, since the new parts will be further legalized.
1180                 unsigned FloorSize = PowerOf2Floor(DstSize);
1181                 return std::make_pair(0, LLT::scalar(FloorSize));
1182               }
1183 
1184               if (DstSize > 32 && (DstSize % 32 != 0)) {
1185                 // FIXME: Need a way to specify non-extload of larger size if
1186                 // suitably aligned.
1187                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188               }
1189 
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193               if (MemSize > MaxSize)
1194                 return std::make_pair(0, LLT::scalar(MaxSize));
1195 
1196               unsigned Align = Query.MMODescrs[0].AlignInBits;
1197               return std::make_pair(0, LLT::scalar(Align));
1198             })
1199         .fewerElementsIf(
1200             [=](const LegalityQuery &Query) -> bool {
1201               return Query.Types[0].isVector() &&
1202                      needToSplitMemOp(Query, Op == G_LOAD);
1203             },
1204             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205               const LLT DstTy = Query.Types[0];
1206               const LLT PtrTy = Query.Types[1];
1207 
1208               LLT EltTy = DstTy.getElementType();
1209               unsigned MaxSize = maxSizeForAddrSpace(ST,
1210                                                      PtrTy.getAddressSpace(),
1211                                                      Op == G_LOAD);
1212 
1213               // FIXME: Handle widened to power of 2 results better. This ends
1214               // up scalarizing.
1215               // FIXME: 3 element stores scalarized on SI
1216 
1217               // Split if it's too large for the address space.
1218               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219               if (MemSize > MaxSize) {
1220                 unsigned NumElts = DstTy.getNumElements();
1221                 unsigned EltSize = EltTy.getSizeInBits();
1222 
1223                 if (MaxSize % EltSize == 0) {
1224                   return std::make_pair(
1225                       0, LLT::scalarOrVector(
1226                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227                 }
1228 
1229                 unsigned NumPieces = MemSize / MaxSize;
1230 
1231                 // FIXME: Refine when odd breakdowns handled
1232                 // The scalars will need to be re-legalized.
1233                 if (NumPieces == 1 || NumPieces >= NumElts ||
1234                     NumElts % NumPieces != 0)
1235                   return std::make_pair(0, EltTy);
1236 
1237                 return std::make_pair(
1238                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239               }
1240 
1241               // FIXME: We could probably handle weird extending loads better.
1242               if (DstTy.getSizeInBits() > MemSize)
1243                 return std::make_pair(0, EltTy);
1244 
1245               unsigned EltSize = EltTy.getSizeInBits();
1246               unsigned DstSize = DstTy.getSizeInBits();
1247               if (!isPowerOf2_32(DstSize)) {
1248                 // We're probably decomposing an odd sized store. Try to split
1249                 // to the widest type. TODO: Account for alignment. As-is it
1250                 // should be OK, since the new parts will be further legalized.
1251                 unsigned FloorSize = PowerOf2Floor(DstSize);
1252                 return std::make_pair(
1253                     0, LLT::scalarOrVector(
1254                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255               }
1256 
1257               // Need to split because of alignment.
1258               unsigned Align = Query.MMODescrs[0].AlignInBits;
1259               if (EltSize > Align &&
1260                   (EltSize / Align < DstTy.getNumElements())) {
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1263               }
1264 
1265               // May need relegalization for the scalars.
1266               return std::make_pair(0, EltTy);
1267             })
1268     .minScalar(0, S32)
1269     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270     .widenScalarToNextPow2(0)
1271     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272     .lower();
1273   }
1274 
1275   // FIXME: Unaligned accesses not lowered.
1276   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278                                                   {S32, GlobalPtr, S16, 2 * 8},
1279                                                   {S32, LocalPtr, S8, 8},
1280                                                   {S32, LocalPtr, S16, 16},
1281                                                   {S32, PrivatePtr, S8, 8},
1282                                                   {S32, PrivatePtr, S16, 16},
1283                                                   {S32, ConstantPtr, S8, 8},
1284                                                   {S32, ConstantPtr, S16, 2 * 8}})
1285                        .legalIf(
1286                          [=](const LegalityQuery &Query) -> bool {
1287                            return isLoadStoreLegal(ST, Query);
1288                          });
1289 
1290   if (ST.hasFlatAddressSpace()) {
1291     ExtLoads.legalForTypesWithMemDesc(
1292         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293   }
1294 
1295   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296   // 64-bits.
1297   //
1298   // TODO: Should generalize bitcast action into coerce, which will also cover
1299   // inserting addrspacecasts.
1300   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301 
1302   ExtLoads.clampScalar(0, S32, S32)
1303           .widenScalarToNextPow2(0)
1304           .lower();
1305 
1306   auto &Atomics = getActionDefinitionsBuilder(
1307     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310      G_ATOMICRMW_UMIN})
1311     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312                {S64, GlobalPtr}, {S64, LocalPtr},
1313                {S32, RegionPtr}, {S64, RegionPtr}});
1314   if (ST.hasFlatAddressSpace()) {
1315     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316   }
1317 
1318   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319   if (ST.hasLDSFPAtomics()) {
1320     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321     if (ST.hasGFX90AInsts())
1322       Atomic.legalFor({{S64, LocalPtr}});
1323   }
1324   if (ST.hasAtomicFaddInsts())
1325     Atomic.legalFor({{S32, GlobalPtr}});
1326 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1329   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331                 {S32, FlatPtr}, {S64, FlatPtr}})
1332     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333                {S32, RegionPtr}, {S64, RegionPtr}});
1334   // TODO: Pointer types, any 32-bit or 64-bit vector
1335 
1336   // Condition should be s32 for scalar, s1 for vector.
1337   getActionDefinitionsBuilder(G_SELECT)
1338       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339                                  LocalPtr, FlatPtr, PrivatePtr,
1340                                  LLT::fixed_vector(2, LocalPtr),
1341                                  LLT::fixed_vector(2, PrivatePtr)},
1342                                 {S1, S32})
1343       .clampScalar(0, S16, S64)
1344       .scalarize(1)
1345       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347       .clampMaxNumElements(0, S32, 2)
1348       .clampMaxNumElements(0, LocalPtr, 2)
1349       .clampMaxNumElements(0, PrivatePtr, 2)
1350       .scalarize(0)
1351       .widenScalarToNextPow2(0)
1352       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353 
1354   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355   // be more flexible with the shift amount type.
1356   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357     .legalFor({{S32, S32}, {S64, S32}});
1358   if (ST.has16BitInsts()) {
1359     if (ST.hasVOP3PInsts()) {
1360       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361             .clampMaxNumElements(0, S16, 2);
1362     } else
1363       Shifts.legalFor({{S16, S16}});
1364 
1365     // TODO: Support 16-bit shift amounts for all types
1366     Shifts.widenScalarIf(
1367       [=](const LegalityQuery &Query) {
1368         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369         // 32-bit amount.
1370         const LLT ValTy = Query.Types[0];
1371         const LLT AmountTy = Query.Types[1];
1372         return ValTy.getSizeInBits() <= 16 &&
1373                AmountTy.getSizeInBits() < 16;
1374       }, changeTo(1, S16));
1375     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S16, S64);
1378     Shifts.widenScalarToNextPow2(0, 16);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S16)
1382       .scalarize(0)
1383       .lower();
1384   } else {
1385     // Make sure we legalize the shift amount type first, as the general
1386     // expansion for the shifted type will produce much worse code if it hasn't
1387     // been truncated already.
1388     Shifts.clampScalar(1, S32, S32);
1389     Shifts.clampScalar(0, S32, S64);
1390     Shifts.widenScalarToNextPow2(0, 32);
1391 
1392     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393       .minScalar(0, S32)
1394       .scalarize(0)
1395       .lower();
1396   }
1397   Shifts.scalarize(0);
1398 
1399   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402     unsigned IdxTypeIdx = 2;
1403 
1404     getActionDefinitionsBuilder(Op)
1405       .customIf([=](const LegalityQuery &Query) {
1406           const LLT EltTy = Query.Types[EltTypeIdx];
1407           const LLT VecTy = Query.Types[VecTypeIdx];
1408           const LLT IdxTy = Query.Types[IdxTypeIdx];
1409           const unsigned EltSize = EltTy.getSizeInBits();
1410           return (EltSize == 32 || EltSize == 64) &&
1411                   VecTy.getSizeInBits() % 32 == 0 &&
1412                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1413                   IdxTy.getSizeInBits() == 32;
1414         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1417       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418       .bitcastIf(
1419         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420         [=](const LegalityQuery &Query) {
1421           // For > 64-bit element types, try to turn this into a 64-bit
1422           // element vector since we may be able to do better indexing
1423           // if this is scalar. If not, fall back to 32.
1424           const LLT EltTy = Query.Types[EltTypeIdx];
1425           const LLT VecTy = Query.Types[VecTypeIdx];
1426           const unsigned DstEltSize = EltTy.getSizeInBits();
1427           const unsigned VecSize = VecTy.getSizeInBits();
1428 
1429           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430           return std::make_pair(
1431               VecTypeIdx,
1432               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433         })
1434       .clampScalar(EltTypeIdx, S32, S64)
1435       .clampScalar(VecTypeIdx, S32, S64)
1436       .clampScalar(IdxTypeIdx, S32, S32)
1437       .clampMaxNumElements(VecTypeIdx, S32, 32)
1438       // TODO: Clamp elements for 64-bit vectors?
1439       // It should only be necessary with variable indexes.
1440       // As a last resort, lower to the stack
1441       .lower();
1442   }
1443 
1444   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445     .unsupportedIf([=](const LegalityQuery &Query) {
1446         const LLT &EltTy = Query.Types[1].getElementType();
1447         return Query.Types[0] != EltTy;
1448       });
1449 
1450   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453 
1454     // FIXME: Doesn't handle extract of illegal sizes.
1455     getActionDefinitionsBuilder(Op)
1456       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457       // FIXME: Multiples of 16 should not be legal.
1458       .legalIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           const LLT LitTy = Query.Types[LitTyIdx];
1461           return (BigTy.getSizeInBits() % 32 == 0) &&
1462                  (LitTy.getSizeInBits() % 16 == 0);
1463         })
1464       .widenScalarIf(
1465         [=](const LegalityQuery &Query) {
1466           const LLT BigTy = Query.Types[BigTyIdx];
1467           return (BigTy.getScalarSizeInBits() < 16);
1468         },
1469         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470       .widenScalarIf(
1471         [=](const LegalityQuery &Query) {
1472           const LLT LitTy = Query.Types[LitTyIdx];
1473           return (LitTy.getScalarSizeInBits() < 16);
1474         },
1475         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477       .widenScalarToNextPow2(BigTyIdx, 32);
1478 
1479   }
1480 
1481   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482     .legalForCartesianProduct(AllS32Vectors, {S32})
1483     .legalForCartesianProduct(AllS64Vectors, {S64})
1484     .clampNumElements(0, V16S32, V32S32)
1485     .clampNumElements(0, V2S64, V16S64)
1486     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487 
1488   if (ST.hasScalarPackInsts()) {
1489     BuildVector
1490       // FIXME: Should probably widen s1 vectors straight to s32
1491       .minScalarOrElt(0, S16)
1492       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493       .minScalar(1, S32);
1494 
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .legalFor({V2S16, S32})
1497       .lower();
1498     BuildVector.minScalarOrElt(0, S32);
1499   } else {
1500     BuildVector.customFor({V2S16, S16});
1501     BuildVector.minScalarOrElt(0, S32);
1502 
1503     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504       .customFor({V2S16, S32})
1505       .lower();
1506   }
1507 
1508   BuildVector.legalIf(isRegisterType(0));
1509 
1510   // FIXME: Clamp maximum size
1511   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513     .clampMaxNumElements(0, S32, 32)
1514     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515     .clampMaxNumElements(0, S16, 64);
1516 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1519   if (ST.hasVOP3PInsts()) {
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521       .customFor({V2S16, V2S16})
1522       .lower();
1523   } else
1524     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525 
1526   // Merge/Unmerge
1527   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530 
1531     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532       const LLT Ty = Query.Types[TypeIdx];
1533       if (Ty.isVector()) {
1534         const LLT &EltTy = Ty.getElementType();
1535         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536           return true;
1537         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538           return true;
1539       }
1540       return false;
1541     };
1542 
1543     auto &Builder = getActionDefinitionsBuilder(Op)
1544       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545       .lowerFor({{S16, V2S16}})
1546       .lowerIf([=](const LegalityQuery &Query) {
1547           const LLT BigTy = Query.Types[BigTyIdx];
1548           return BigTy.getSizeInBits() == 32;
1549         })
1550       // Try to widen to s16 first for small types.
1551       // TODO: Only do this on targets with legal s16 shifts
1552       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556                            elementTypeIs(1, S16)),
1557                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1561       .clampScalar(LitTyIdx, S32, S512)
1562       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563       // Break up vectors with weird elements into scalars
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566         scalarize(0))
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569         scalarize(1))
1570       .clampScalar(BigTyIdx, S32, MaxScalar);
1571 
1572     if (Op == G_MERGE_VALUES) {
1573       Builder.widenScalarIf(
1574         // TODO: Use 16-bit shifts if legal for 8-bit values?
1575         [=](const LegalityQuery &Query) {
1576           const LLT Ty = Query.Types[LitTyIdx];
1577           return Ty.getSizeInBits() < 32;
1578         },
1579         changeTo(LitTyIdx, S32));
1580     }
1581 
1582     Builder.widenScalarIf(
1583       [=](const LegalityQuery &Query) {
1584         const LLT Ty = Query.Types[BigTyIdx];
1585         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586           Ty.getSizeInBits() % 16 != 0;
1587       },
1588       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
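        // For example, a 65-bit type widens to 128 (the next power of 2),
        // while a 300-bit type would widen to 512 by the power-of-2 rule, but
        // since 512 >= 256 it is instead rounded up to the next multiple of
        // 64, i.e. 320.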
1591         const LLT &Ty = Query.Types[BigTyIdx];
1592         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593         if (NewSizeInBits >= 256) {
1594           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595           if (RoundedTo < NewSizeInBits)
1596             NewSizeInBits = RoundedTo;
1597         }
1598         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599       })
1600       // Any vectors left are the wrong size. Scalarize them.
1601       .scalarize(0)
1602       .scalarize(1);
1603   }
1604 
1605   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606   // RegBankSelect.
1607   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608     .legalFor({{S32}, {S64}});
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     SextInReg.lowerFor({{V2S16}})
1612       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613       // get more vector shift opportunities, since we'll get those when
1614       // expanded.
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616   } else if (ST.has16BitInsts()) {
1617     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618   } else {
1619     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1621     SextInReg.lowerFor({{S32}, {S64}});
1622   }
1623 
1624   SextInReg
1625     .scalarize(0)
1626     .clampScalar(0, S32, S64)
1627     .lower();
1628 
  // TODO: Only try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634     .scalarize(0)
1635     .lower();
1636 
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648 
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651 
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654 
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659 
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666 
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670 
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676 
1677       G_SADDO, G_SSUBO,
1678 
      // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681 
1682   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685     .unsupported();
1686 
1687   getLegacyLegalizerInfo().computeTables();
1688   verify(*ST.getInstrInfo());
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692                                          MachineInstr &MI) const {
1693   MachineIRBuilder &B = Helper.MIRBuilder;
1694   MachineRegisterInfo &MRI = *B.getMRI();
1695 
1696   switch (MI.getOpcode()) {
1697   case TargetOpcode::G_ADDRSPACE_CAST:
1698     return legalizeAddrSpaceCast(MI, MRI, B);
1699   case TargetOpcode::G_FRINT:
1700     return legalizeFrint(MI, MRI, B);
1701   case TargetOpcode::G_FCEIL:
1702     return legalizeFceil(MI, MRI, B);
1703   case TargetOpcode::G_FREM:
1704     return legalizeFrem(MI, MRI, B);
1705   case TargetOpcode::G_INTRINSIC_TRUNC:
1706     return legalizeIntrinsicTrunc(MI, MRI, B);
1707   case TargetOpcode::G_SITOFP:
1708     return legalizeITOFP(MI, MRI, B, true);
1709   case TargetOpcode::G_UITOFP:
1710     return legalizeITOFP(MI, MRI, B, false);
1711   case TargetOpcode::G_FPTOSI:
1712     return legalizeFPTOI(MI, MRI, B, true);
1713   case TargetOpcode::G_FPTOUI:
1714     return legalizeFPTOI(MI, MRI, B, false);
1715   case TargetOpcode::G_FMINNUM:
1716   case TargetOpcode::G_FMAXNUM:
1717   case TargetOpcode::G_FMINNUM_IEEE:
1718   case TargetOpcode::G_FMAXNUM_IEEE:
1719     return legalizeMinNumMaxNum(Helper, MI);
1720   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721     return legalizeExtractVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_INSERT_VECTOR_ELT:
1723     return legalizeInsertVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_SHUFFLE_VECTOR:
1725     return legalizeShuffleVector(MI, MRI, B);
1726   case TargetOpcode::G_FSIN:
1727   case TargetOpcode::G_FCOS:
1728     return legalizeSinCos(MI, MRI, B);
1729   case TargetOpcode::G_GLOBAL_VALUE:
1730     return legalizeGlobalValue(MI, MRI, B);
1731   case TargetOpcode::G_LOAD:
1732   case TargetOpcode::G_SEXTLOAD:
1733   case TargetOpcode::G_ZEXTLOAD:
1734     return legalizeLoad(Helper, MI);
1735   case TargetOpcode::G_FMAD:
1736     return legalizeFMad(MI, MRI, B);
1737   case TargetOpcode::G_FDIV:
1738     return legalizeFDIV(MI, MRI, B);
1739   case TargetOpcode::G_UDIV:
1740   case TargetOpcode::G_UREM:
1741   case TargetOpcode::G_UDIVREM:
1742     return legalizeUnsignedDIV_REM(MI, MRI, B);
1743   case TargetOpcode::G_SDIV:
1744   case TargetOpcode::G_SREM:
1745   case TargetOpcode::G_SDIVREM:
1746     return legalizeSignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_ATOMIC_CMPXCHG:
1748     return legalizeAtomicCmpXChg(MI, MRI, B);
1749   case TargetOpcode::G_FLOG:
1750     return legalizeFlog(MI, B, numbers::ln2f);
1751   case TargetOpcode::G_FLOG10:
1752     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753   case TargetOpcode::G_FEXP:
1754     return legalizeFExp(MI, B);
1755   case TargetOpcode::G_FPOW:
1756     return legalizeFPow(MI, B);
1757   case TargetOpcode::G_FFLOOR:
1758     return legalizeFFloor(MI, MRI, B);
1759   case TargetOpcode::G_BUILD_VECTOR:
1760     return legalizeBuildVector(MI, MRI, B);
1761   default:
1762     return false;
1763   }
1764 
1765   llvm_unreachable("expected switch to return");
1766 }
1767 
1768 Register AMDGPULegalizerInfo::getSegmentAperture(
1769   unsigned AS,
1770   MachineRegisterInfo &MRI,
1771   MachineIRBuilder &B) const {
1772   MachineFunction &MF = B.getMF();
1773   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1774   const LLT S32 = LLT::scalar(32);
1775 
1776   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1777 
1778   if (ST.hasApertureRegs()) {
1779     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1780     // getreg.
1781     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1782         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1783         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1784     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1786         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1787     unsigned Encoding =
1788         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1789         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1790         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1791 
1792     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1793 
1794     B.buildInstr(AMDGPU::S_GETREG_B32)
1795       .addDef(GetReg)
1796       .addImm(Encoding);
1797     MRI.setType(GetReg, S32);
1798 
1799     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1800     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1801   }
1802 
1803   Register QueuePtr = MRI.createGenericVirtualRegister(
1804     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1805 
1806   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1807     return Register();
1808 
1809   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1810   // private_segment_aperture_base_hi.
1811   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1812 
1813   // TODO: can we be smarter about machine pointer info?
1814   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1815   MachineMemOperand *MMO = MF.getMachineMemOperand(
1816       PtrInfo,
1817       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1818           MachineMemOperand::MOInvariant,
1819       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1820 
1821   Register LoadAddr;
1822 
1823   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1824   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1825 }
1826 
1827 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1828   MachineInstr &MI, MachineRegisterInfo &MRI,
1829   MachineIRBuilder &B) const {
1830   MachineFunction &MF = B.getMF();
1831 
1832   const LLT S32 = LLT::scalar(32);
1833   Register Dst = MI.getOperand(0).getReg();
1834   Register Src = MI.getOperand(1).getReg();
1835 
1836   LLT DstTy = MRI.getType(Dst);
1837   LLT SrcTy = MRI.getType(Src);
1838   unsigned DestAS = DstTy.getAddressSpace();
1839   unsigned SrcAS = SrcTy.getAddressSpace();
1840 
1841   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1842   // vector element.
1843   assert(!DstTy.isVector());
1844 
1845   const AMDGPUTargetMachine &TM
1846     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1847 
1848   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1849     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1850     return true;
1851   }
1852 
1853   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1854     // Truncate.
1855     B.buildExtract(Dst, Src, 0);
1856     MI.eraseFromParent();
1857     return true;
1858   }
1859 
1860   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1861     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1862     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1863 
    // FIXME: This is a bit ugly: we create a merge of two 32-bit pointers into
    // a pointer of a different address space. Merge operands are required to
    // be the same type, but creating an extra ptrtoint would be kind of
    // pointless.
1867     auto HighAddr = B.buildConstant(
1868       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1869     B.buildMerge(Dst, {Src, HighAddr});
1870     MI.eraseFromParent();
1871     return true;
1872   }
1873 
1874   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1875     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1876            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1877     unsigned NullVal = TM.getNullPointerValue(DestAS);
1878 
1879     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1880     auto FlatNull = B.buildConstant(SrcTy, 0);
1881 
    // Extract the low 32 bits of the pointer.
1883     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1884 
1885     auto CmpRes =
1886         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1887     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1888 
1889     MI.eraseFromParent();
1890     return true;
1891   }
1892 
1893   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1894     return false;
1895 
1896   if (!ST.hasFlatAddressSpace())
1897     return false;
1898 
1899   auto SegmentNull =
1900       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1901   auto FlatNull =
1902       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1903 
1904   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1905   if (!ApertureReg.isValid())
1906     return false;
1907 
1908   auto CmpRes =
1909       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1910 
1911   // Coerce the type of the low half of the result so we can use merge_values.
1912   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1913 
1914   // TODO: Should we allow mismatched types but matching sizes in merges to
1915   // avoid the ptrtoint?
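
  // The low half of the flat pointer is the 32-bit segment offset and the
  // high half is the aperture base, so the merge below forms the flat
  // address; the select then maps the segment null value to the flat null
  // value.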
1916   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1917   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1918 
1919   MI.eraseFromParent();
1920   return true;
1921 }
1922 
1923 bool AMDGPULegalizerInfo::legalizeFrint(
1924   MachineInstr &MI, MachineRegisterInfo &MRI,
1925   MachineIRBuilder &B) const {
1926   Register Src = MI.getOperand(1).getReg();
1927   LLT Ty = MRI.getType(Src);
1928   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1929 
1930   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1931   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
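  // C1 is 2^52; adding copysign(2^52, x) and then subtracting it again rounds
  // x to an integer in the current rounding mode, since doubles of that
  // magnitude have no fractional bits. C2 is the largest double below 2^52;
  // any input with |x| > C2 is already an integer and is passed through by
  // the final select.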
1932 
1933   auto C1 = B.buildFConstant(Ty, C1Val);
1934   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1935 
1936   // TODO: Should this propagate fast-math-flags?
1937   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1938   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1939 
1940   auto C2 = B.buildFConstant(Ty, C2Val);
1941   auto Fabs = B.buildFAbs(Ty, Src);
1942 
1943   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1944   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1945   MI.eraseFromParent();
1946   return true;
1947 }
1948 
1949 bool AMDGPULegalizerInfo::legalizeFceil(
1950   MachineInstr &MI, MachineRegisterInfo &MRI,
1951   MachineIRBuilder &B) const {
1952 
1953   const LLT S1 = LLT::scalar(1);
1954   const LLT S64 = LLT::scalar(64);
1955 
1956   Register Src = MI.getOperand(1).getReg();
1957   assert(MRI.getType(Src) == S64);
1958 
1959   // result = trunc(src)
1960   // if (src > 0.0 && src != result)
1961   //   result += 1.0
1962 
1963   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1964 
1965   const auto Zero = B.buildFConstant(S64, 0.0);
1966   const auto One = B.buildFConstant(S64, 1.0);
1967   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1968   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1969   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1970   auto Add = B.buildSelect(S64, And, One, Zero);
1971 
1972   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeFrem(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980     Register DstReg = MI.getOperand(0).getReg();
1981     Register Src0Reg = MI.getOperand(1).getReg();
1982     Register Src1Reg = MI.getOperand(2).getReg();
1983     auto Flags = MI.getFlags();
1984     LLT Ty = MRI.getType(DstReg);
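    // frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
    // subtract fused into an FMA: fma(-trunc(x / y), y, x). For example,
    // frem(5.0, 3.0): div = 1.66..., trunc = 1.0, fma(-1.0, 3.0, 5.0) = 2.0.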
1985 
1986     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1987     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1988     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1989     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1990     MI.eraseFromParent();
1991     return true;
1992 }
1993 
1994 static MachineInstrBuilder extractF64Exponent(Register Hi,
1995                                               MachineIRBuilder &B) {
1996   const unsigned FractBits = 52;
1997   const unsigned ExpBits = 11;
1998   LLT S32 = LLT::scalar(32);
1999 
2000   auto Const0 = B.buildConstant(S32, FractBits - 32);
2001   auto Const1 = B.buildConstant(S32, ExpBits);
2002 
2003   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2004     .addUse(Hi)
2005     .addUse(Const0.getReg(0))
2006     .addUse(Const1.getReg(0));
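  // The bitfield extract above pulls the 11-bit biased exponent out of bits
  // [20, 30] of the high word; subtracting the IEEE-754 bias of 1023 gives
  // the unbiased exponent (e.g. 0 for 1.0, whose high word is 0x3FF00000).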
2007 
2008   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2009 }
2010 
2011 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2012   MachineInstr &MI, MachineRegisterInfo &MRI,
2013   MachineIRBuilder &B) const {
2014   const LLT S1 = LLT::scalar(1);
2015   const LLT S32 = LLT::scalar(32);
2016   const LLT S64 = LLT::scalar(64);
2017 
2018   Register Src = MI.getOperand(1).getReg();
2019   assert(MRI.getType(Src) == S64);
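
  // Truncation toward zero is implemented by clearing the fractional mantissa
  // bits: for an unbiased exponent Exp in [0, 51] the fraction occupies the
  // low (52 - Exp) bits, which is exactly FractMask >> Exp, so ANDing with the
  // complement of that mask drops the fraction. Exponents below 0 produce
  // +/-0.0 (sign bit only) and exponents above 51 mean the value is already
  // an integer; both cases are handled by the selects at the end.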
2020 
2021   // TODO: Should this use extract since the low half is unused?
2022   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2023   Register Hi = Unmerge.getReg(1);
2024 
2025   // Extract the upper half, since this is where we will find the sign and
2026   // exponent.
2027   auto Exp = extractF64Exponent(Hi, B);
2028 
2029   const unsigned FractBits = 52;
2030 
2031   // Extract the sign bit.
2032   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2033   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2034 
2035   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2036 
2037   const auto Zero32 = B.buildConstant(S32, 0);
2038 
2039   // Extend back to 64-bits.
2040   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2041 
2042   auto Shr = B.buildAShr(S64, FractMask, Exp);
2043   auto Not = B.buildNot(S64, Shr);
2044   auto Tmp0 = B.buildAnd(S64, Src, Not);
2045   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2046 
2047   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2048   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2049 
2050   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2051   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2052   MI.eraseFromParent();
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::legalizeITOFP(
2057   MachineInstr &MI, MachineRegisterInfo &MRI,
2058   MachineIRBuilder &B, bool Signed) const {
2059 
2060   Register Dst = MI.getOperand(0).getReg();
2061   Register Src = MI.getOperand(1).getReg();
2062 
2063   const LLT S64 = LLT::scalar(64);
2064   const LLT S32 = LLT::scalar(32);
2065   const LLT S1 = LLT::scalar(1);
2066 
2067   assert(MRI.getType(Src) == S64);
2068 
2069   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2070   auto ThirtyTwo = B.buildConstant(S32, 32);
2071 
2072   if (MRI.getType(Dst) == S64) {
2073     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2074                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2075 
2076     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2077     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2078                      .addUse(CvtHi.getReg(0))
2079                      .addUse(ThirtyTwo.getReg(0));
2080 
2081     // TODO: Should this propagate fast-math-flags?
2082     B.buildFAdd(Dst, LdExp, CvtLo);
2083     MI.eraseFromParent();
2084     return true;
2085   }
2086 
2087   assert(MRI.getType(Dst) == S32);
2088 
2089   auto Zero = B.buildConstant(S32, 0);
2090   auto One = B.buildConstant(S32, 1);
2091   auto AllOnes = B.buildConstant(S32, -1);
2092 
2093   MachineInstrBuilder ShAmt;
2094   if (Signed) {
2095     auto ThirtyThree = B.buildConstant(S32, 33);
2096     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2097     auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero);
2098     auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo);
2099     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2100                                /*HasSideEffects=*/false)
2101                   .addUse(Unmerge.getReg(1));
2102     auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes);
2103     auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt);
2104     ShAmt = B.buildSub(S32, LS2, One);
2105   } else
2106     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2107   auto Norm = B.buildShl(S64, Src, ShAmt);
2108   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2109   auto NotAllZeros =
2110       B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero);
2111   auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero);
2112   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2113   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2114   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2115   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2116                    /*HasSideEffects=*/false)
2117       .addUse(FVal.getReg(0))
2118       .addUse(Scale.getReg(0));
2119   MI.eraseFromParent();
2120   return true;
2121 }
2122 
2123 // TODO: Copied from DAG implementation. Verify logic and document how this
2124 // actually works.
2125 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2126                                         MachineRegisterInfo &MRI,
2127                                         MachineIRBuilder &B,
2128                                         bool Signed) const {
2129 
2130   Register Dst = MI.getOperand(0).getReg();
2131   Register Src = MI.getOperand(1).getReg();
2132 
2133   const LLT S64 = LLT::scalar(64);
2134   const LLT S32 = LLT::scalar(32);
2135 
2136   const LLT SrcLT = MRI.getType(Src);
2137   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2138 
2139   unsigned Flags = MI.getFlags();
2140 
2141   // The basic idea of converting a floating point number into a pair of 32-bit
2142   // integers is illustrated as follows:
2143   //
2144   //     tf := trunc(val);
2145   //    hif := floor(tf * 2^-32);
2146   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2147   //     hi := fptoi(hif);
2148   //     lo := fptoi(lof);
2149   //
2150   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2151   MachineInstrBuilder Sign;
2152   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
2158     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2159     Trunc = B.buildFAbs(S32, Trunc, Flags);
2160   }
2161   MachineInstrBuilder K0, K1;
2162   if (SrcLT == S64) {
2163     K0 = B.buildFConstant(S64,
2164                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2165     K1 = B.buildFConstant(S64,
2166                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2167   } else {
2168     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2169     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2170   }
2171 
2172   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2173   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2174   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2175 
2176   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2177                                      : B.buildFPTOUI(S32, FloorMul);
2178   auto Lo = B.buildFPTOUI(S32, Fma);
2179 
2180   if (Signed && SrcLT == S32) {
2181     // Flip the result based on the signedness, which is either all 0s or 1s.
2182     Sign = B.buildMerge(S64, {Sign, Sign});
2183     // r := xor({lo, hi}, sign) - sign;
2184     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2185   } else
2186     B.buildMerge(Dst, {Lo, Hi});
2187   MI.eraseFromParent();
2188 
2189   return true;
2190 }
2191 
2192 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2193                                                MachineInstr &MI) const {
2194   MachineFunction &MF = Helper.MIRBuilder.getMF();
2195   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2196 
2197   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2198                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2199 
2200   // With ieee_mode disabled, the instructions have the correct behavior
2201   // already for G_FMINNUM/G_FMAXNUM
2202   if (!MFI->getMode().IEEE)
2203     return !IsIEEEOp;
2204 
2205   if (IsIEEEOp)
2206     return true;
2207 
2208   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2209 }
2210 
2211 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2212   MachineInstr &MI, MachineRegisterInfo &MRI,
2213   MachineIRBuilder &B) const {
2214   // TODO: Should move some of this into LegalizerHelper.
2215 
2216   // TODO: Promote dynamic indexing of s16 to s32
2217 
2218   // FIXME: Artifact combiner probably should have replaced the truncated
2219   // constant before this, so we shouldn't need
2220   // getConstantVRegValWithLookThrough.
2221   Optional<ValueAndVReg> MaybeIdxVal =
2222       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2223   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2224     return true;
2225   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2226 
2227   Register Dst = MI.getOperand(0).getReg();
2228   Register Vec = MI.getOperand(1).getReg();
2229 
2230   LLT VecTy = MRI.getType(Vec);
2231   LLT EltTy = VecTy.getElementType();
2232   assert(EltTy == MRI.getType(Dst));
2233 
2234   if (IdxVal < VecTy.getNumElements())
2235     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2236   else
2237     B.buildUndef(Dst);
2238 
2239   MI.eraseFromParent();
2240   return true;
2241 }
2242 
2243 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2244   MachineInstr &MI, MachineRegisterInfo &MRI,
2245   MachineIRBuilder &B) const {
2246   // TODO: Should move some of this into LegalizerHelper.
2247 
2248   // TODO: Promote dynamic indexing of s16 to s32
2249 
2250   // FIXME: Artifact combiner probably should have replaced the truncated
2251   // constant before this, so we shouldn't need
2252   // getConstantVRegValWithLookThrough.
2253   Optional<ValueAndVReg> MaybeIdxVal =
2254       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2255   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2256     return true;
2257 
2258   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2259   Register Dst = MI.getOperand(0).getReg();
2260   Register Vec = MI.getOperand(1).getReg();
2261   Register Ins = MI.getOperand(2).getReg();
2262 
2263   LLT VecTy = MRI.getType(Vec);
2264   LLT EltTy = VecTy.getElementType();
2265   assert(EltTy == MRI.getType(Ins));
2266 
2267   if (IdxVal < VecTy.getNumElements())
2268     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2269   else
2270     B.buildUndef(Dst);
2271 
2272   MI.eraseFromParent();
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2277   MachineInstr &MI, MachineRegisterInfo &MRI,
2278   MachineIRBuilder &B) const {
2279   const LLT V2S16 = LLT::fixed_vector(2, 16);
2280 
2281   Register Dst = MI.getOperand(0).getReg();
2282   Register Src0 = MI.getOperand(1).getReg();
2283   LLT DstTy = MRI.getType(Dst);
2284   LLT SrcTy = MRI.getType(Src0);
2285 
2286   if (SrcTy == V2S16 && DstTy == V2S16 &&
2287       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2288     return true;
2289 
2290   MachineIRBuilder HelperBuilder(MI);
2291   GISelObserverWrapper DummyObserver;
2292   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2293   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2294 }
2295 
2296 bool AMDGPULegalizerInfo::legalizeSinCos(
2297   MachineInstr &MI, MachineRegisterInfo &MRI,
2298   MachineIRBuilder &B) const {
2299 
2300   Register DstReg = MI.getOperand(0).getReg();
2301   Register SrcReg = MI.getOperand(1).getReg();
2302   LLT Ty = MRI.getType(DstReg);
2303   unsigned Flags = MI.getFlags();
2304 
2305   Register TrigVal;
2306   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
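  // 0.5 * inv_pi == 1/(2*pi): the hardware sin/cos intrinsics presumably take
  // their operand pre-scaled by 1/(2*pi) (i.e. in full rotations rather than
  // radians), so the source is multiplied by that factor first; on subtargets
  // with a reduced trig input range, only the fractional part of the scaled
  // value is passed on.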
2307   if (ST.hasTrigReducedRange()) {
2308     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2309     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2310       .addUse(MulVal.getReg(0))
2311       .setMIFlags(Flags).getReg(0);
2312   } else
2313     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2314 
2315   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2316     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2317   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2318     .addUse(TrigVal)
2319     .setMIFlags(Flags);
2320   MI.eraseFromParent();
2321   return true;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2325                                                   MachineIRBuilder &B,
2326                                                   const GlobalValue *GV,
2327                                                   int64_t Offset,
2328                                                   unsigned GAFlags) const {
2329   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2330   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2331   // to the following code sequence:
2332   //
2333   // For constant address space:
2334   //   s_getpc_b64 s[0:1]
2335   //   s_add_u32 s0, s0, $symbol
2336   //   s_addc_u32 s1, s1, 0
2337   //
2338   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2339   //   a fixup or relocation is emitted to replace $symbol with a literal
2340   //   constant, which is a pc-relative offset from the encoding of the $symbol
2341   //   operand to the global variable.
2342   //
2343   // For global address space:
2344   //   s_getpc_b64 s[0:1]
2345   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2346   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2347   //
2348   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2349   //   fixups or relocations are emitted to replace $symbol@*@lo and
2350   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2351   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2352   //   operand to the global variable.
2353   //
2354   // What we want here is an offset from the value returned by s_getpc
2355   // (which is the address of the s_add_u32 instruction) to the global
2356   // variable, but since the encoding of $symbol starts 4 bytes after the start
2357   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2358   // small. This requires us to add 4 to the global variable offset in order to
2359   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2360   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2361   // instruction.
2362 
2363   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2364 
2365   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2366     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2367 
2368   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2369     .addDef(PCReg);
2370 
2371   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2372   if (GAFlags == SIInstrInfo::MO_NONE)
2373     MIB.addImm(0);
2374   else
2375     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2376 
2377   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2378 
2379   if (PtrTy.getSizeInBits() == 32)
2380     B.buildExtract(DstReg, PCReg, 0);
2381   return true;
2382  }
2383 
2384 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2385   MachineInstr &MI, MachineRegisterInfo &MRI,
2386   MachineIRBuilder &B) const {
2387   Register DstReg = MI.getOperand(0).getReg();
2388   LLT Ty = MRI.getType(DstReg);
2389   unsigned AS = Ty.getAddressSpace();
2390 
2391   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2392   MachineFunction &MF = B.getMF();
2393   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2394 
2395   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2396     if (!MFI->isModuleEntryFunction() &&
2397         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2398       const Function &Fn = MF.getFunction();
2399       DiagnosticInfoUnsupported BadLDSDecl(
2400         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2401         DS_Warning);
2402       Fn.getContext().diagnose(BadLDSDecl);
2403 
2404       // We currently don't have a way to correctly allocate LDS objects that
2405       // aren't directly associated with a kernel. We do force inlining of
2406       // functions that use local objects. However, if these dead functions are
2407       // not eliminated, we don't want a compile time error. Just emit a warning
2408       // and a trap, since there should be no callable path here.
2409       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2410       B.buildUndef(DstReg);
2411       MI.eraseFromParent();
2412       return true;
2413     }
2414 
2415     // TODO: We could emit code to handle the initialization somewhere.
2416     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2417       const SITargetLowering *TLI = ST.getTargetLowering();
2418       if (!TLI->shouldUseLDSConstAddress(GV)) {
2419         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2420         return true; // Leave in place;
2421       }
2422 
2423       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2424         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared
        // memory whose size is not known at compile time. Such objects are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
2430         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2431           // Adjust alignment for that dynamic shared memory array.
2432           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2433           LLT S32 = LLT::scalar(32);
2434           auto Sz =
2435               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2436           B.buildIntToPtr(DstReg, Sz);
2437           MI.eraseFromParent();
2438           return true;
2439         }
2440       }
2441 
2442       B.buildConstant(
2443           DstReg,
2444           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2445       MI.eraseFromParent();
2446       return true;
2447     }
2448 
2449     const Function &Fn = MF.getFunction();
2450     DiagnosticInfoUnsupported BadInit(
2451       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2452     Fn.getContext().diagnose(BadInit);
2453     return true;
2454   }
2455 
2456   const SITargetLowering *TLI = ST.getTargetLowering();
2457 
2458   if (TLI->shouldEmitFixup(GV)) {
2459     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2460     MI.eraseFromParent();
2461     return true;
2462   }
2463 
2464   if (TLI->shouldEmitPCReloc(GV)) {
2465     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2466     MI.eraseFromParent();
2467     return true;
2468   }
2469 
2470   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2471   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2472 
2473   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2474   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2475       MachinePointerInfo::getGOT(MF),
2476       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2477           MachineMemOperand::MOInvariant,
2478       LoadTy, Align(8));
2479 
2480   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2481 
2482   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2484     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2485     B.buildExtract(DstReg, Load, 0);
2486   } else
2487     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2488 
2489   MI.eraseFromParent();
2490   return true;
2491 }
2492 
2493 static LLT widenToNextPowerOf2(LLT Ty) {
2494   if (Ty.isVector())
2495     return Ty.changeElementCount(
2496         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2497   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2498 }
2499 
2500 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2501                                        MachineInstr &MI) const {
2502   MachineIRBuilder &B = Helper.MIRBuilder;
2503   MachineRegisterInfo &MRI = *B.getMRI();
2504   GISelChangeObserver &Observer = Helper.Observer;
2505 
2506   Register PtrReg = MI.getOperand(1).getReg();
2507   LLT PtrTy = MRI.getType(PtrReg);
2508   unsigned AddrSpace = PtrTy.getAddressSpace();
2509 
2510   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2511     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2512     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2513     Observer.changingInstr(MI);
2514     MI.getOperand(1).setReg(Cast.getReg(0));
2515     Observer.changedInstr(MI);
2516     return true;
2517   }
2518 
2519   if (MI.getOpcode() != AMDGPU::G_LOAD)
2520     return false;
2521 
2522   Register ValReg = MI.getOperand(0).getReg();
2523   LLT ValTy = MRI.getType(ValReg);
2524 
2525   MachineMemOperand *MMO = *MI.memoperands_begin();
2526   const unsigned ValSize = ValTy.getSizeInBits();
2527   const LLT MemTy = MMO->getMemoryType();
2528   const Align MemAlign = MMO->getAlign();
2529   const unsigned MemSize = MemTy.getSizeInBits();
2530   const unsigned AlignInBits = 8 * MemAlign.value();
2531 
2532   // Widen non-power-of-2 loads to the alignment if needed
2533   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2534     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2535 
2536     // This was already the correct extending load result type, so just adjust
2537     // the memory type.
2538     if (WideMemSize == ValSize) {
2539       MachineFunction &MF = B.getMF();
2540 
2541       MachineMemOperand *WideMMO =
2542           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2543       Observer.changingInstr(MI);
2544       MI.setMemRefs(MF, {WideMMO});
2545       Observer.changedInstr(MI);
2546       return true;
2547     }
2548 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2550     if (ValSize > WideMemSize)
2551       return false;
2552 
2553     LLT WideTy = widenToNextPowerOf2(ValTy);
2554 
2555     Register WideLoad;
2556     if (!WideTy.isVector()) {
2557       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2558       B.buildTrunc(ValReg, WideLoad).getReg(0);
2559     } else {
2560       // Extract the subvector.
2561 
2562       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2564         // (e.g. <3 x s32> -> <4 x s32>)
2565         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2566         B.buildExtract(ValReg, WideLoad, 0);
2567       } else {
2568         // For cases where the widened type isn't a nice register value, unmerge
2569         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2570         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2571         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2572         B.setInsertPt(B.getMBB(), MI.getIterator());
2573         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2574       }
2575     }
2576 
2577     MI.eraseFromParent();
2578     return true;
2579   }
2580 
2581   return false;
2582 }
2583 
2584 bool AMDGPULegalizerInfo::legalizeFMad(
2585   MachineInstr &MI, MachineRegisterInfo &MRI,
2586   MachineIRBuilder &B) const {
2587   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2588   assert(Ty.isScalar());
2589 
2590   MachineFunction &MF = B.getMF();
2591   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2592 
2593   // TODO: Always legal with future ftz flag.
  // FIXME: Do we need to check just the output denormal mode?
2595   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2596     return true;
2597   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2598     return true;
2599 
2600   MachineIRBuilder HelperBuilder(MI);
2601   GISelObserverWrapper DummyObserver;
2602   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2603   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2604 }
2605 
2606 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2607   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2608   Register DstReg = MI.getOperand(0).getReg();
2609   Register PtrReg = MI.getOperand(1).getReg();
2610   Register CmpVal = MI.getOperand(2).getReg();
2611   Register NewVal = MI.getOperand(3).getReg();
2612 
2613   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2614          "this should not have been custom lowered");
2615 
2616   LLT ValTy = MRI.getType(CmpVal);
2617   LLT VecTy = LLT::fixed_vector(2, ValTy);
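  // The new value and the compare value are packed into a two-element vector
  // operand for the target G_AMDGPU_ATOMIC_CMPXCHG pseudo, with the new value
  // in element 0 and the compare value in element 1.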
2618 
2619   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2620 
2621   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2622     .addDef(DstReg)
2623     .addUse(PtrReg)
2624     .addUse(PackedVal)
2625     .setMemRefs(MI.memoperands());
2626 
2627   MI.eraseFromParent();
2628   return true;
2629 }
2630 
2631 bool AMDGPULegalizerInfo::legalizeFlog(
2632   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2633   Register Dst = MI.getOperand(0).getReg();
2634   Register Src = MI.getOperand(1).getReg();
2635   LLT Ty = B.getMRI()->getType(Dst);
2636   unsigned Flags = MI.getFlags();
2637 
2638   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2639   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
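  // log_b(x) == log2(x) * (1 / log2(b)); the callers pass Log2BaseInverted as
  // ln(2) / ln(b), which is the same constant.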
2640 
2641   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2642   MI.eraseFromParent();
2643   return true;
2644 }
2645 
2646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2647                                        MachineIRBuilder &B) const {
2648   Register Dst = MI.getOperand(0).getReg();
2649   Register Src = MI.getOperand(1).getReg();
2650   unsigned Flags = MI.getFlags();
2651   LLT Ty = B.getMRI()->getType(Dst);
2652 
2653   auto K = B.buildFConstant(Ty, numbers::log2e);
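  // exp(x) == exp2(x * log2(e)), so scale the operand by log2(e) and emit
  // G_FEXP2.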
2654   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2655   B.buildFExp2(Dst, Mul, Flags);
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2661                                        MachineIRBuilder &B) const {
2662   Register Dst = MI.getOperand(0).getReg();
2663   Register Src0 = MI.getOperand(1).getReg();
2664   Register Src1 = MI.getOperand(2).getReg();
2665   unsigned Flags = MI.getFlags();
2666   LLT Ty = B.getMRI()->getType(Dst);
2667   const LLT S16 = LLT::scalar(16);
2668   const LLT S32 = LLT::scalar(32);
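  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply is used
  // here, presumably for its 0 * x == 0 semantics, which keeps
  // pow(1.0, y) == 1.0 even when y is infinite or NaN.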
2669 
2670   if (Ty == S32) {
2671     auto Log = B.buildFLog2(S32, Src0, Flags);
2672     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2673       .addUse(Log.getReg(0))
2674       .addUse(Src1)
2675       .setMIFlags(Flags);
2676     B.buildFExp2(Dst, Mul, Flags);
2677   } else if (Ty == S16) {
2678     // There's no f16 fmul_legacy, so we need to convert for it.
2679     auto Log = B.buildFLog2(S16, Src0, Flags);
2680     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2681     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2682     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2683       .addUse(Ext0.getReg(0))
2684       .addUse(Ext1.getReg(0))
2685       .setMIFlags(Flags);
2686 
2687     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2688   } else
2689     return false;
2690 
2691   MI.eraseFromParent();
2692   return true;
2693 }
2694 
2695 // Find a source register, ignoring any possible source modifiers.
2696 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2697   Register ModSrc = OrigSrc;
2698   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2699     ModSrc = SrcFNeg->getOperand(1).getReg();
2700     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2701       ModSrc = SrcFAbs->getOperand(1).getReg();
2702   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2703     ModSrc = SrcFAbs->getOperand(1).getReg();
2704   return ModSrc;
2705 }
2706 
2707 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2708                                          MachineRegisterInfo &MRI,
2709                                          MachineIRBuilder &B) const {
2710 
2711   const LLT S1 = LLT::scalar(1);
2712   const LLT S64 = LLT::scalar(64);
2713   Register Dst = MI.getOperand(0).getReg();
2714   Register OrigSrc = MI.getOperand(1).getReg();
2715   unsigned Flags = MI.getFlags();
2716   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2717          "this should not have been custom lowered");
2718 
2719   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2720   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2721   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2722   // V_FRACT bug is:
2723   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2724   //
2725   // Convert floor(x) to (x - fract(x))
2726 
2727   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2728     .addUse(OrigSrc)
2729     .setMIFlags(Flags);
2730 
2731   // Give source modifier matching some assistance before obscuring a foldable
2732   // pattern.
2733 
  // TODO: Can we avoid the neg on the fract? The input sign to fract
  // shouldn't matter.
2736   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2737 
2738   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2739 
2740   Register Min = MRI.createGenericVirtualRegister(S64);
2741 
2742   // We don't need to concern ourselves with the snan handling difference, so
2743   // use the one which will directly select.
2744   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2745   if (MFI->getMode().IEEE)
2746     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2747   else
2748     B.buildFMinNum(Min, Fract, Const, Flags);
2749 
2750   Register CorrectedFract = Min;
2751   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2753     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2754   }
2755 
2756   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2757   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2758 
2759   MI.eraseFromParent();
2760   return true;
2761 }
2762 
2763 // Turn an illegal packed v2s16 build vector into bit operations.
2764 // TODO: This should probably be a bitcast action in LegalizerHelper.
2765 bool AMDGPULegalizerInfo::legalizeBuildVector(
2766   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2767   Register Dst = MI.getOperand(0).getReg();
2768   const LLT S32 = LLT::scalar(32);
2769   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2770 
2771   Register Src0 = MI.getOperand(1).getReg();
2772   Register Src1 = MI.getOperand(2).getReg();
2773   assert(MRI.getType(Src0) == LLT::scalar(16));
2774 
2775   auto Merge = B.buildMerge(S32, {Src0, Src1});
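  // The merge places Src0 in the low 16 bits and Src1 in the high 16 bits of
  // the s32; the bitcast then reinterprets that value as <2 x s16>.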
2776   B.buildBitcast(Dst, Merge);
2777 
2778   MI.eraseFromParent();
2779   return true;
2780 }
2781 
2782 // Check that this is a G_XOR x, -1
2783 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2784   if (MI.getOpcode() != TargetOpcode::G_XOR)
2785     return false;
2786   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2787   return ConstVal && *ConstVal == -1;
2788 }
2789 
2790 // Return the use branch instruction, otherwise null if the usage is invalid.
2791 static MachineInstr *
2792 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2793                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2794   Register CondDef = MI.getOperand(0).getReg();
2795   if (!MRI.hasOneNonDBGUse(CondDef))
2796     return nullptr;
2797 
2798   MachineBasicBlock *Parent = MI.getParent();
2799   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2800 
2801   if (isNot(MRI, *UseMI)) {
2802     Register NegatedCond = UseMI->getOperand(0).getReg();
2803     if (!MRI.hasOneNonDBGUse(NegatedCond))
2804       return nullptr;
2805 
2806     // We're deleting the def of this value, so we need to remove it.
2807     UseMI->eraseFromParent();
2808 
2809     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2810     Negated = true;
2811   }
2812 
2813   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2814     return nullptr;
2815 
2816   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2817   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2818   if (Next == Parent->end()) {
2819     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2820     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2821       return nullptr;
2822     UncondBrTarget = &*NextMBB;
2823   } else {
2824     if (Next->getOpcode() != AMDGPU::G_BR)
2825       return nullptr;
2826     Br = &*Next;
2827     UncondBrTarget = Br->getOperand(0).getMBB();
2828   }
2829 
2830   return UseMI;
2831 }
2832 
2833 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2834                                          const ArgDescriptor *Arg,
2835                                          const TargetRegisterClass *ArgRC,
2836                                          LLT ArgTy) const {
2837   MCRegister SrcReg = Arg->getRegister();
2838   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2839   assert(DstReg.isVirtual() && "Virtual register expected");
2840 
2841   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2842                                              ArgTy);
2843   if (Arg->isMasked()) {
2844     // TODO: Should we try to emit this once in the entry block?
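    // Masked arguments (such as the packed workitem IDs) occupy a bitfield of
    // the incoming register: shift the field down and mask off the rest.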
2845     const LLT S32 = LLT::scalar(32);
2846     const unsigned Mask = Arg->getMask();
2847     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2848 
2849     Register AndMaskSrc = LiveIn;
2850 
2851     if (Shift != 0) {
2852       auto ShiftAmt = B.buildConstant(S32, Shift);
2853       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2854     }
2855 
2856     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2857   } else {
2858     B.buildCopy(DstReg, LiveIn);
2859   }
2860 
2861   return true;
2862 }
2863 
2864 bool AMDGPULegalizerInfo::loadInputValue(
2865     Register DstReg, MachineIRBuilder &B,
2866     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2867   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868   const ArgDescriptor *Arg;
2869   const TargetRegisterClass *ArgRC;
2870   LLT ArgTy;
2871   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2872 
2873   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2874     return false; // TODO: Handle these
2875   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2876 }
2877 
2878 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2879     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2880     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2881   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2882     return false;
2883 
2884   MI.eraseFromParent();
2885   return true;
2886 }
2887 
2888 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2889                                        MachineRegisterInfo &MRI,
2890                                        MachineIRBuilder &B) const {
2891   Register Dst = MI.getOperand(0).getReg();
2892   LLT DstTy = MRI.getType(Dst);
2893   LLT S16 = LLT::scalar(16);
2894   LLT S32 = LLT::scalar(32);
2895   LLT S64 = LLT::scalar(64);
2896 
2897   if (DstTy == S16)
2898     return legalizeFDIV16(MI, MRI, B);
2899   if (DstTy == S32)
2900     return legalizeFDIV32(MI, MRI, B);
2901   if (DstTy == S64)
2902     return legalizeFDIV64(MI, MRI, B);
2903 
2904   return false;
2905 }
2906 
2907 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2908                                                         Register DstDivReg,
2909                                                         Register DstRemReg,
2910                                                         Register X,
2911                                                         Register Y) const {
2912   const LLT S1 = LLT::scalar(1);
2913   const LLT S32 = LLT::scalar(32);
2914 
2915   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2916   // algorithm used here.
2917 
2918   // Initial estimate of inv(y).
2919   auto FloatY = B.buildUITOFP(S32, Y);
2920   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
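  // 0x4f7ffffe is a float just below 2^32; scaling the reciprocal estimate by
  // it before the integer conversion yields an approximation of 2^32 / Y.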
2921   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2922   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2923   auto Z = B.buildFPTOUI(S32, ScaledY);
2924 
  // One round of UNR (unsigned Newton-Raphson) to refine the estimate.
2926   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2927   auto NegYZ = B.buildMul(S32, NegY, Z);
2928   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2929 
2930   // Quotient/remainder estimate.
2931   auto Q = B.buildUMulH(S32, X, Z);
2932   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2933 
2934   // First quotient/remainder refinement.
2935   auto One = B.buildConstant(S32, 1);
2936   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2937   if (DstDivReg)
2938     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2939   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2940 
2941   // Second quotient/remainder refinement.
2942   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2943   if (DstDivReg)
2944     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2945 
2946   if (DstRemReg)
2947     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2948 }
2949 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2951 //
2952 // Return lo, hi of result
2953 //
2954 // %cvt.lo = G_UITOFP Val.lo
2955 // %cvt.hi = G_UITOFP Val.hi
2956 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2957 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2958 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2959 // %mul2 = G_FMUL %mul1, 2**(-32)
2960 // %trunc = G_INTRINSIC_TRUNC %mul2
2961 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2962 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2963 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2964                                                        Register Val) {
2965   const LLT S32 = LLT::scalar(32);
2966   auto Unmerge = B.buildUnmerge(S32, Val);
2967 
2968   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2969   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2970 
2971   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2972                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2973 
2974   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2975   auto Mul1 =
2976       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2977 
2978   // 2**(-32)
2979   auto Mul2 =
2980       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2981   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2982 
2983   // -(2**32)
2984   auto Mad2 = B.buildFMAD(S32, Trunc,
2985                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2986 
2987   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2988   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2989 
2990   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2991 }
2992 
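// Expand 64-bit unsigned division/remainder: start from the ~2^64/Denom
// estimate produced by emitReciprocalU64, refine it with two rounds of
// corrections performed on 32-bit halves, then conditionally correct the
// resulting quotient/remainder up to two times.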
2993 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2994                                                         Register DstDivReg,
2995                                                         Register DstRemReg,
2996                                                         Register Numer,
2997                                                         Register Denom) const {
2998   const LLT S32 = LLT::scalar(32);
2999   const LLT S64 = LLT::scalar(64);
3000   const LLT S1 = LLT::scalar(1);
3001   Register RcpLo, RcpHi;
3002 
3003   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3004 
3005   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3006 
3007   auto Zero64 = B.buildConstant(S64, 0);
3008   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3009 
3010   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3011   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3012 
3013   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3014   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3015   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3016 
3017   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3018   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3019   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3020   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3021 
3022   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3023   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3024   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3025   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3026   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3027 
3028   auto Zero32 = B.buildConstant(S32, 0);
3029   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3030   auto Add2_HiC =
3031       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3032   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3033   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3034 
3035   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3036   Register NumerLo = UnmergeNumer.getReg(0);
3037   Register NumerHi = UnmergeNumer.getReg(1);
3038 
3039   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3040   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3041   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3042   Register Mul3_Lo = UnmergeMul3.getReg(0);
3043   Register Mul3_Hi = UnmergeMul3.getReg(1);
3044   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3045   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3046   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3047   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3048 
3049   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3050   Register DenomLo = UnmergeDenom.getReg(0);
3051   Register DenomHi = UnmergeDenom.getReg(1);
3052 
3053   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3054   auto C1 = B.buildSExt(S32, CmpHi);
3055 
3056   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3057   auto C2 = B.buildSExt(S32, CmpLo);
3058 
3059   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3060   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3061 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently control flow is unconditional and we use 4 selects after the
  // potential endif as a substitute for PHIs.
3065 
3066   // if C3 != 0 ...
3067   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3068   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3069   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3070   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3071 
3072   auto One64 = B.buildConstant(S64, 1);
3073   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3074 
3075   auto C4 =
3076       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3077   auto C5 =
3078       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3079   auto C6 = B.buildSelect(
3080       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3081 
3082   // if (C6 != 0)
3083   auto Add4 = B.buildAdd(S64, Add3, One64);
3084   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3085 
3086   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3087   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3088   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3089 
3090   // endif C6
3091   // endif C3
3092 
3093   if (DstDivReg) {
3094     auto Sel1 = B.buildSelect(
3095         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3096     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3097                   Sel1, MulHi3);
3098   }
3099 
3100   if (DstRemReg) {
3101     auto Sel2 = B.buildSelect(
3102         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3103     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3104                   Sel2, Sub1);
3105   }
3106 }
3107 
3108 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3109                                                   MachineRegisterInfo &MRI,
3110                                                   MachineIRBuilder &B) const {
3111   Register DstDivReg, DstRemReg;
3112   switch (MI.getOpcode()) {
3113   default:
3114     llvm_unreachable("Unexpected opcode!");
3115   case AMDGPU::G_UDIV: {
3116     DstDivReg = MI.getOperand(0).getReg();
3117     break;
3118   }
3119   case AMDGPU::G_UREM: {
3120     DstRemReg = MI.getOperand(0).getReg();
3121     break;
3122   }
3123   case AMDGPU::G_UDIVREM: {
3124     DstDivReg = MI.getOperand(0).getReg();
3125     DstRemReg = MI.getOperand(1).getReg();
3126     break;
3127   }
3128   }
3129 
3130   const LLT S64 = LLT::scalar(64);
3131   const LLT S32 = LLT::scalar(32);
3132   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3133   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3134   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3135   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3136 
3137   if (Ty == S32)
3138     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3139   else if (Ty == S64)
3140     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3141   else
3142     return false;
3143 
3144   MI.eraseFromParent();
3145   return true;
3146 }
3147 
3148 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3149                                                 MachineRegisterInfo &MRI,
3150                                                 MachineIRBuilder &B) const {
3151   const LLT S64 = LLT::scalar(64);
3152   const LLT S32 = LLT::scalar(32);
3153 
3154   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3155   if (Ty != S32 && Ty != S64)
3156     return false;
3157 
3158   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3159   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3160   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3161 
3162   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3163   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3164   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3165 
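  // Take absolute values via (x + sign) ^ sign, with sign = x >> (bits - 1).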
3166   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3167   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3168 
3169   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3170   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3171 
3172   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3173   switch (MI.getOpcode()) {
3174   default:
3175     llvm_unreachable("Unexpected opcode!");
3176   case AMDGPU::G_SDIV: {
3177     DstDivReg = MI.getOperand(0).getReg();
3178     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3179     break;
3180   }
3181   case AMDGPU::G_SREM: {
3182     DstRemReg = MI.getOperand(0).getReg();
3183     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3184     break;
3185   }
3186   case AMDGPU::G_SDIVREM: {
3187     DstDivReg = MI.getOperand(0).getReg();
3188     DstRemReg = MI.getOperand(1).getReg();
3189     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3190     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3191     break;
3192   }
3193   }
3194 
3195   if (Ty == S32)
3196     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3197   else
3198     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3199 
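  // Apply the expected result signs: (v ^ sign) - sign negates v when the sign
  // mask is all ones and is a no-op when it is zero.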
3200   if (DstDivReg) {
3201     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3202     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3203     B.buildSub(DstDivReg, SignXor, Sign);
3204   }
3205 
3206   if (DstRemReg) {
3207     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3208     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3209     B.buildSub(DstRemReg, SignXor, Sign);
3210   }
3211 
3212   MI.eraseFromParent();
3213   return true;
3214 }
3215 
3216 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3217                                                  MachineRegisterInfo &MRI,
3218                                                  MachineIRBuilder &B) const {
3219   Register Res = MI.getOperand(0).getReg();
3220   Register LHS = MI.getOperand(1).getReg();
3221   Register RHS = MI.getOperand(2).getReg();
3222   uint16_t Flags = MI.getFlags();
3223   LLT ResTy = MRI.getType(Res);
3224 
3225   const MachineFunction &MF = B.getMF();
3226   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3227                             MI.getFlag(MachineInstr::FmAfn);
3228 
3229   if (!AllowInaccurateRcp)
3230     return false;
3231 
3232   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3233     // 1 / x -> RCP(x)
3234     if (CLHS->isExactlyValue(1.0)) {
3235       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3236         .addUse(RHS)
3237         .setMIFlags(Flags);
3238 
3239       MI.eraseFromParent();
3240       return true;
3241     }
3242 
3243     // -1 / x -> RCP( FNEG(x) )
3244     if (CLHS->isExactlyValue(-1.0)) {
3245       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3246       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3247         .addUse(FNeg.getReg(0))
3248         .setMIFlags(Flags);
3249 
3250       MI.eraseFromParent();
3251       return true;
3252     }
3253   }
3254 
3255   // x / y -> x * (1.0 / y)
3256   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3257     .addUse(RHS)
3258     .setMIFlags(Flags);
3259   B.buildFMul(Res, LHS, RCP, Flags);
3260 
3261   MI.eraseFromParent();
3262   return true;
3263 }
3264 
3265 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3266                                                    MachineRegisterInfo &MRI,
3267                                                    MachineIRBuilder &B) const {
3268   Register Res = MI.getOperand(0).getReg();
3269   Register X = MI.getOperand(1).getReg();
3270   Register Y = MI.getOperand(2).getReg();
3271   uint16_t Flags = MI.getFlags();
3272   LLT ResTy = MRI.getType(Res);
3273 
3274   const MachineFunction &MF = B.getMF();
3275   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3276                             MI.getFlag(MachineInstr::FmAfn);
3277 
3278   if (!AllowInaccurateRcp)
3279     return false;
3280 
3281   auto NegY = B.buildFNeg(ResTy, Y);
3282   auto One = B.buildFConstant(ResTy, 1.0);
3283 
3284   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3285     .addUse(Y)
3286     .setMIFlags(Flags);
3287 
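  // Two Newton-Raphson iterations on the reciprocal, r = r + r * (1 - y * r),
  // followed by one refinement of the quotient, q = q + r * (x - y * q).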
3288   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3289   R = B.buildFMA(ResTy, Tmp0, R, R);
3290 
3291   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3292   R = B.buildFMA(ResTy, Tmp1, R, R);
3293 
3294   auto Ret = B.buildFMul(ResTy, X, R);
3295   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3296 
3297   B.buildFMA(Res, Tmp2, R, Ret);
3298   MI.eraseFromParent();
3299   return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3303                                          MachineRegisterInfo &MRI,
3304                                          MachineIRBuilder &B) const {
3305   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3306     return true;
3307 
3308   Register Res = MI.getOperand(0).getReg();
3309   Register LHS = MI.getOperand(1).getReg();
3310   Register RHS = MI.getOperand(2).getReg();
3311 
3312   uint16_t Flags = MI.getFlags();
3313 
3314   LLT S16 = LLT::scalar(16);
3315   LLT S32 = LLT::scalar(32);
3316 
3317   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3318   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3319 
3320   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3321     .addUse(RHSExt.getReg(0))
3322     .setMIFlags(Flags);
3323 
3324   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3325   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3326 
3327   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3328     .addUse(RDst.getReg(0))
3329     .addUse(RHS)
3330     .addUse(LHS)
3331     .setMIFlags(Flags);
3332 
3333   MI.eraseFromParent();
3334   return true;
3335 }
3336 
3337 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3338 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3339 static void toggleSPDenormMode(bool Enable,
3340                                MachineIRBuilder &B,
3341                                const GCNSubtarget &ST,
3342                                AMDGPU::SIModeRegisterDefaults Mode) {
3343   // Set SP denorm mode to this value.
3344   unsigned SPDenormMode =
3345     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3346 
3347   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3349     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3350 
3351     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3352     B.buildInstr(AMDGPU::S_DENORM_MODE)
3353       .addImm(NewDenormModeValue);
3354 
3355   } else {
3356     // Select FP32 bit field in mode register.
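    // i.e. hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm bits of MODE.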
3357     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3358                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3359                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3360 
3361     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3362       .addImm(SPDenormMode)
3363       .addImm(SPDenormModeBitField);
3364   }
3365 }
3366 
3367 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3368                                          MachineRegisterInfo &MRI,
3369                                          MachineIRBuilder &B) const {
3370   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3371     return true;
3372 
3373   Register Res = MI.getOperand(0).getReg();
3374   Register LHS = MI.getOperand(1).getReg();
3375   Register RHS = MI.getOperand(2).getReg();
3376   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3377   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3378 
3379   uint16_t Flags = MI.getFlags();
3380 
3381   LLT S32 = LLT::scalar(32);
3382   LLT S1 = LLT::scalar(1);
3383 
3384   auto One = B.buildFConstant(S32, 1.0f);
3385 
3386   auto DenominatorScaled =
3387     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3388       .addUse(LHS)
3389       .addUse(RHS)
3390       .addImm(0)
3391       .setMIFlags(Flags);
3392   auto NumeratorScaled =
3393     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3394       .addUse(LHS)
3395       .addUse(RHS)
3396       .addImm(1)
3397       .setMIFlags(Flags);
3398 
3399   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3400     .addUse(DenominatorScaled.getReg(0))
3401     .setMIFlags(Flags);
3402   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3403 
3404   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3405   // aren't modeled as reading it.
3406   if (!Mode.allFP32Denormals())
3407     toggleSPDenormMode(true, B, ST, Mode);
3408 
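  // One Newton-Raphson step on the scaled reciprocal (Fma0/Fma1), then the
  // scaled quotient (Mul), a correction step (Fma2/Fma3), and a final residual
  // (Fma4) for the div_fmas intrinsic below.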
3409   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3410   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3411   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3412   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3413   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3414   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3415 
3416   if (!Mode.allFP32Denormals())
3417     toggleSPDenormMode(false, B, ST, Mode);
3418 
3419   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3420     .addUse(Fma4.getReg(0))
3421     .addUse(Fma1.getReg(0))
3422     .addUse(Fma3.getReg(0))
3423     .addUse(NumeratorScaled.getReg(1))
3424     .setMIFlags(Flags);
3425 
3426   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3427     .addUse(Fmas.getReg(0))
3428     .addUse(RHS)
3429     .addUse(LHS)
3430     .setMIFlags(Flags);
3431 
3432   MI.eraseFromParent();
3433   return true;
3434 }
3435 
3436 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3437                                          MachineRegisterInfo &MRI,
3438                                          MachineIRBuilder &B) const {
3439   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3440     return true;
3441 
3442   Register Res = MI.getOperand(0).getReg();
3443   Register LHS = MI.getOperand(1).getReg();
3444   Register RHS = MI.getOperand(2).getReg();
3445 
3446   uint16_t Flags = MI.getFlags();
3447 
3448   LLT S64 = LLT::scalar(64);
3449   LLT S1 = LLT::scalar(1);
3450 
3451   auto One = B.buildFConstant(S64, 1.0);
3452 
3453   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3454     .addUse(LHS)
3455     .addUse(RHS)
3456     .addImm(0)
3457     .setMIFlags(Flags);
3458 
3459   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3460 
3461   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3462     .addUse(DivScale0.getReg(0))
3463     .setMIFlags(Flags);
3464 
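  // Two Newton-Raphson refinements of the scaled reciprocal (Fma0-Fma3), then
  // the scaled quotient (Mul) and its residual (Fma4) for the div_fmas
  // intrinsic below.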
3465   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3466   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3467   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3468 
3469   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3470     .addUse(LHS)
3471     .addUse(RHS)
3472     .addImm(1)
3473     .setMIFlags(Flags);
3474 
3475   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3476   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3477   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3478 
3479   Register Scale;
3480   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3483 
3484     LLT S32 = LLT::scalar(32);
3485 
3486     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3487     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3488     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3489     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3490 
3491     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3492                               Scale1Unmerge.getReg(1));
3493     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3494                               Scale0Unmerge.getReg(1));
3495     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3496   } else {
3497     Scale = DivScale1.getReg(1);
3498   }
3499 
3500   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3501     .addUse(Fma4.getReg(0))
3502     .addUse(Fma3.getReg(0))
3503     .addUse(Mul.getReg(0))
3504     .addUse(Scale)
3505     .setMIFlags(Flags);
3506 
3507   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3508     .addUse(Fmas.getReg(0))
3509     .addUse(RHS)
3510     .addUse(LHS)
3511     .setMIFlags(Flags);
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3518                                                  MachineRegisterInfo &MRI,
3519                                                  MachineIRBuilder &B) const {
3520   Register Res = MI.getOperand(0).getReg();
3521   Register LHS = MI.getOperand(2).getReg();
3522   Register RHS = MI.getOperand(3).getReg();
3523   uint16_t Flags = MI.getFlags();
3524 
3525   LLT S32 = LLT::scalar(32);
3526   LLT S1 = LLT::scalar(1);
3527 
3528   auto Abs = B.buildFAbs(S32, RHS, Flags);
3529   const APFloat C0Val(1.0f);
3530 
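  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: for very large denominators,
  // pre-scale by 2^-32 so the reciprocal stays well clear of the denormal
  // range, then multiply the result by the same factor to compensate.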
3531   auto C0 = B.buildConstant(S32, 0x6f800000);
3532   auto C1 = B.buildConstant(S32, 0x2f800000);
3533   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3534 
3535   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3536   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3537 
3538   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3539 
3540   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3541     .addUse(Mul0.getReg(0))
3542     .setMIFlags(Flags);
3543 
3544   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3545 
3546   B.buildFMul(Res, Sel, Mul1, Flags);
3547 
3548   MI.eraseFromParent();
3549   return true;
3550 }
3551 
3552 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3553 // FIXME: Why do we handle this one but not other removed instructions?
3554 //
3555 // Reciprocal square root.  The clamp prevents infinite results, clamping
3556 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3557 // +-max_float.
3558 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3559                                                     MachineRegisterInfo &MRI,
3560                                                     MachineIRBuilder &B) const {
3561   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3562     return true;
3563 
3564   Register Dst = MI.getOperand(0).getReg();
3565   Register Src = MI.getOperand(2).getReg();
3566   auto Flags = MI.getFlags();
3567 
3568   LLT Ty = MRI.getType(Dst);
3569 
3570   const fltSemantics *FltSemantics;
3571   if (Ty == LLT::scalar(32))
3572     FltSemantics = &APFloat::IEEEsingle();
3573   else if (Ty == LLT::scalar(64))
3574     FltSemantics = &APFloat::IEEEdouble();
3575   else
3576     return false;
3577 
3578   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3579     .addUse(Src)
3580     .setMIFlags(Flags);
3581 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq result is already quieted (or not), so use the variant which will
  // directly select.
3584   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3585   const bool UseIEEE = MFI->getMode().IEEE;
3586 
3587   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3588   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3589                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3590 
3591   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3592 
3593   if (UseIEEE)
3594     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3595   else
3596     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3597   MI.eraseFromParent();
3598   return true;
3599 }
3600 
3601 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3602   switch (IID) {
3603   case Intrinsic::amdgcn_ds_fadd:
3604     return AMDGPU::G_ATOMICRMW_FADD;
3605   case Intrinsic::amdgcn_ds_fmin:
3606     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3607   case Intrinsic::amdgcn_ds_fmax:
3608     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3609   default:
3610     llvm_unreachable("not a DS FP intrinsic");
3611   }
3612 }
3613 
3614 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3615                                                       MachineInstr &MI,
3616                                                       Intrinsic::ID IID) const {
3617   GISelChangeObserver &Observer = Helper.Observer;
3618   Observer.changingInstr(MI);
3619 
3620   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3621 
3622   // The remaining operands were used to set fields in the MemOperand on
3623   // construction.
3624   for (int I = 6; I > 3; --I)
3625     MI.RemoveOperand(I);
3626 
3627   MI.RemoveOperand(1); // Remove the intrinsic ID.
3628   Observer.changedInstr(MI);
3629   return true;
3630 }
3631 
3632 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3633                                             MachineRegisterInfo &MRI,
3634                                             MachineIRBuilder &B) const {
3635   uint64_t Offset =
3636     ST.getTargetLowering()->getImplicitParameterOffset(
3637       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3638   LLT DstTy = MRI.getType(DstReg);
3639   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3640 
3641   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3642   if (!loadInputValue(KernargPtrReg, B,
3643                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3644     return false;
3645 
3646   // FIXME: This should be nuw
3647   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3648   return true;
3649 }
3650 
3651 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3652                                                  MachineRegisterInfo &MRI,
3653                                                  MachineIRBuilder &B) const {
3654   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3655   if (!MFI->isEntryFunction()) {
3656     return legalizePreloadedArgIntrin(MI, MRI, B,
3657                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3658   }
3659 
3660   Register DstReg = MI.getOperand(0).getReg();
3661   if (!getImplicitArgPtr(DstReg, MRI, B))
3662     return false;
3663 
3664   MI.eraseFromParent();
3665   return true;
3666 }
3667 
3668 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3669                                               MachineRegisterInfo &MRI,
3670                                               MachineIRBuilder &B,
3671                                               unsigned AddrSpace) const {
3672   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3673   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3674   Register Hi32 = Unmerge.getReg(1);
3675 
3676   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3677   MI.eraseFromParent();
3678   return true;
3679 }
3680 
3681 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3682 // offset (the offset that is included in bounds checking and swizzling, to be
3683 // split between the instruction's voffset and immoffset fields) and soffset
3684 // (the offset that is excluded from bounds checking and swizzling, to go in
3685 // the instruction's soffset field).  This function takes the first kind of
3686 // offset and figures out how to split it between voffset and immoffset.
3687 std::pair<Register, unsigned>
3688 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3689                                         Register OrigOffset) const {
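  // The buffer instructions' immediate offset field is 12 bits, hence 4095.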
3690   const unsigned MaxImm = 4095;
3691   Register BaseReg;
3692   unsigned ImmOffset;
3693   const LLT S32 = LLT::scalar(32);
3694   MachineRegisterInfo &MRI = *B.getMRI();
3695 
3696   std::tie(BaseReg, ImmOffset) =
3697       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3698 
3699   // If BaseReg is a pointer, convert it to int.
3700   if (MRI.getType(BaseReg).isPointer())
3701     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3702 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits in the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
3707   // However, do not do that rounding down to a multiple of 4096 if that is a
3708   // negative number, as it appears to be illegal to have a negative offset
3709   // in the vgpr, even if adding the immediate offset makes it positive.
3710   unsigned Overflow = ImmOffset & ~MaxImm;
3711   ImmOffset -= Overflow;
3712   if ((int32_t)Overflow < 0) {
3713     Overflow += ImmOffset;
3714     ImmOffset = 0;
3715   }
3716 
3717   if (Overflow != 0) {
3718     if (!BaseReg) {
3719       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3720     } else {
3721       auto OverflowVal = B.buildConstant(S32, Overflow);
3722       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3723     }
3724   }
3725 
3726   if (!BaseReg)
3727     BaseReg = B.buildConstant(S32, 0).getReg(0);
3728 
3729   return std::make_pair(BaseReg, ImmOffset);
3730 }
3731 
3732 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3733 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3734                                           Register VOffset, Register SOffset,
3735                                           unsigned ImmOffset, Register VIndex,
3736                                           MachineRegisterInfo &MRI) const {
3737   Optional<ValueAndVReg> MaybeVOffsetVal =
3738       getConstantVRegValWithLookThrough(VOffset, MRI);
3739   Optional<ValueAndVReg> MaybeSOffsetVal =
3740       getConstantVRegValWithLookThrough(SOffset, MRI);
3741   Optional<ValueAndVReg> MaybeVIndexVal =
3742       getConstantVRegValWithLookThrough(VIndex, MRI);
3743   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3744   // update the MMO with that offset. The stride is unknown so we can only do
3745   // this if VIndex is constant 0.
3746   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3747       MaybeVIndexVal->Value == 0) {
3748     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3749                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3750     MMO->setOffset(TotalOffset);
3751   } else {
3752     // We don't have a constant combined offset to use in the MMO. Give up.
3753     MMO->setValue((Value *)nullptr);
3754   }
3755 }
3756 
3757 /// Handle register layout difference for f16 images for some subtargets.
3758 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3759                                              MachineRegisterInfo &MRI,
3760                                              Register Reg,
3761                                              bool ImageStore) const {
3762   const LLT S16 = LLT::scalar(16);
3763   const LLT S32 = LLT::scalar(32);
3764   LLT StoreVT = MRI.getType(Reg);
3765   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3766 
3767   if (ST.hasUnpackedD16VMem()) {
3768     auto Unmerge = B.buildUnmerge(S16, Reg);
3769 
3770     SmallVector<Register, 4> WideRegs;
3771     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3772       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3773 
3774     int NumElts = StoreVT.getNumElements();
3775 
3776     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3777         .getReg(0);
3778   }
3779 
3780   if (ImageStore && ST.hasImageStoreD16Bug()) {
3781     if (StoreVT.getNumElements() == 2) {
3782       SmallVector<Register, 4> PackedRegs;
3783       Reg = B.buildBitcast(S32, Reg).getReg(0);
3784       PackedRegs.push_back(Reg);
3785       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3786       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3787           .getReg(0);
3788     }
3789 
3790     if (StoreVT.getNumElements() == 3) {
3791       SmallVector<Register, 4> PackedRegs;
3792       auto Unmerge = B.buildUnmerge(S16, Reg);
3793       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3794         PackedRegs.push_back(Unmerge.getReg(I));
3795       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3796       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3797       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3798     }
3799 
3800     if (StoreVT.getNumElements() == 4) {
3801       SmallVector<Register, 4> PackedRegs;
3802       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3803       auto Unmerge = B.buildUnmerge(S32, Reg);
3804       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3805         PackedRegs.push_back(Unmerge.getReg(I));
3806       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3807       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3808           .getReg(0);
3809     }
3810 
3811     llvm_unreachable("invalid data type");
3812   }
3813 
3814   return Reg;
3815 }
3816 
3817 Register AMDGPULegalizerInfo::fixStoreSourceType(
3818   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3819   MachineRegisterInfo *MRI = B.getMRI();
3820   LLT Ty = MRI->getType(VData);
3821 
3822   const LLT S16 = LLT::scalar(16);
3823 
  // Fixup illegal register types for i8 and i16 stores.
3825   if (Ty == LLT::scalar(8) || Ty == S16) {
3826     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3827     return AnyExt;
3828   }
3829 
3830   if (Ty.isVector()) {
3831     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3832       if (IsFormat)
3833         return handleD16VData(B, *MRI, VData);
3834     }
3835   }
3836 
3837   return VData;
3838 }
3839 
3840 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3841                                               MachineRegisterInfo &MRI,
3842                                               MachineIRBuilder &B,
3843                                               bool IsTyped,
3844                                               bool IsFormat) const {
3845   Register VData = MI.getOperand(1).getReg();
3846   LLT Ty = MRI.getType(VData);
3847   LLT EltTy = Ty.getScalarType();
3848   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3849   const LLT S32 = LLT::scalar(32);
3850 
3851   VData = fixStoreSourceType(B, VData, IsFormat);
3852   Register RSrc = MI.getOperand(2).getReg();
3853 
3854   MachineMemOperand *MMO = *MI.memoperands_begin();
3855   const int MemSize = MMO->getSize();
3856 
3857   unsigned ImmOffset;
3858 
3859   // The typed intrinsics add an immediate after the registers.
3860   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3861 
3862   // The struct intrinsic variants add one additional operand over raw.
3863   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3864   Register VIndex;
3865   int OpOffset = 0;
3866   if (HasVIndex) {
3867     VIndex = MI.getOperand(3).getReg();
3868     OpOffset = 1;
3869   } else {
3870     VIndex = B.buildConstant(S32, 0).getReg(0);
3871   }
3872 
3873   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3874   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3875 
3876   unsigned Format = 0;
3877   if (IsTyped) {
3878     Format = MI.getOperand(5 + OpOffset).getImm();
3879     ++OpOffset;
3880   }
3881 
3882   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3883 
3884   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3885   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3886 
3887   unsigned Opc;
3888   if (IsTyped) {
3889     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3890                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3891   } else if (IsFormat) {
3892     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3893                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3894   } else {
3895     switch (MemSize) {
3896     case 1:
3897       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3898       break;
3899     case 2:
3900       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3901       break;
3902     default:
3903       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3904       break;
3905     }
3906   }
3907 
3908   auto MIB = B.buildInstr(Opc)
3909     .addUse(VData)              // vdata
3910     .addUse(RSrc)               // rsrc
3911     .addUse(VIndex)             // vindex
3912     .addUse(VOffset)            // voffset
3913     .addUse(SOffset)            // soffset
3914     .addImm(ImmOffset);         // offset(imm)
3915 
3916   if (IsTyped)
3917     MIB.addImm(Format);
3918 
3919   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3920      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3921      .addMemOperand(MMO);
3922 
3923   MI.eraseFromParent();
3924   return true;
3925 }
3926 
3927 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3928                                              MachineRegisterInfo &MRI,
3929                                              MachineIRBuilder &B,
3930                                              bool IsFormat,
3931                                              bool IsTyped) const {
3932   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3933   MachineMemOperand *MMO = *MI.memoperands_begin();
3934   const LLT MemTy = MMO->getMemoryType();
3935   const LLT S32 = LLT::scalar(32);
3936 
3937   Register Dst = MI.getOperand(0).getReg();
3938   Register RSrc = MI.getOperand(2).getReg();
3939 
3940   // The typed intrinsics add an immediate after the registers.
3941   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3942 
3943   // The struct intrinsic variants add one additional operand over raw.
3944   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3945   Register VIndex;
3946   int OpOffset = 0;
3947   if (HasVIndex) {
3948     VIndex = MI.getOperand(3).getReg();
3949     OpOffset = 1;
3950   } else {
3951     VIndex = B.buildConstant(S32, 0).getReg(0);
3952   }
3953 
3954   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3955   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3956 
3957   unsigned Format = 0;
3958   if (IsTyped) {
3959     Format = MI.getOperand(5 + OpOffset).getImm();
3960     ++OpOffset;
3961   }
3962 
3963   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3964   unsigned ImmOffset;
3965 
3966   LLT Ty = MRI.getType(Dst);
3967   LLT EltTy = Ty.getScalarType();
3968   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3969   const bool Unpacked = ST.hasUnpackedD16VMem();
3970 
3971   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3972   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3973 
3974   unsigned Opc;
3975 
3976   if (IsTyped) {
3977     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3978                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3979   } else if (IsFormat) {
3980     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3981                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3982   } else {
3983     switch (MemTy.getSizeInBits()) {
3984     case 8:
3985       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3986       break;
3987     case 16:
3988       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3989       break;
3990     default:
3991       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3992       break;
3993     }
3994   }
3995 
3996   Register LoadDstReg;
3997 
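  // Loads narrower than 32 bits (and scalar d16 loads) are widened to a 32-bit
  // result which is truncated back to the original type afterwards.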
3998   bool IsExtLoad =
3999       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4000   LLT UnpackedTy = Ty.changeElementSize(32);
4001 
4002   if (IsExtLoad)
4003     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4004   else if (Unpacked && IsD16 && Ty.isVector())
4005     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4006   else
4007     LoadDstReg = Dst;
4008 
4009   auto MIB = B.buildInstr(Opc)
4010     .addDef(LoadDstReg)         // vdata
4011     .addUse(RSrc)               // rsrc
4012     .addUse(VIndex)             // vindex
4013     .addUse(VOffset)            // voffset
4014     .addUse(SOffset)            // soffset
4015     .addImm(ImmOffset);         // offset(imm)
4016 
4017   if (IsTyped)
4018     MIB.addImm(Format);
4019 
4020   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4021      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4022      .addMemOperand(MMO);
4023 
4024   if (LoadDstReg != Dst) {
4025     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4026 
    // The result of an extending load was widened; truncate it back down.
4028     if (IsExtLoad)
4029       B.buildTrunc(Dst, LoadDstReg);
4030     else {
4031       // Repack to original 16-bit vector result
4032       // FIXME: G_TRUNC should work, but legalization currently fails
4033       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4034       SmallVector<Register, 4> Repack;
4035       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4036         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4037       B.buildMerge(Dst, Repack);
4038     }
4039   }
4040 
4041   MI.eraseFromParent();
4042   return true;
4043 }
4044 
4045 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4046                                                MachineIRBuilder &B,
4047                                                bool IsInc) const {
4048   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4049                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4050   B.buildInstr(Opc)
4051     .addDef(MI.getOperand(0).getReg())
4052     .addUse(MI.getOperand(2).getReg())
4053     .addUse(MI.getOperand(3).getReg())
4054     .cloneMemRefs(MI);
4055   MI.eraseFromParent();
4056   return true;
4057 }
4058 
4059 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4060   switch (IntrID) {
4061   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4062   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4063     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4064   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4065   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4066     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4067   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4068   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4069     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4070   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4071   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4072     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4073   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4074   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4075     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4076   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4077   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4078     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4079   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4080   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4081     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4082   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4083   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4085   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4100   case Intrinsic::amdgcn_buffer_atomic_fadd:
4101   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4102   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4103     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4104   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4105   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4106     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4107   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4108   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4109     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4110   default:
4111     llvm_unreachable("unhandled atomic opcode");
4112   }
4113 }
4114 
4115 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4116                                                MachineIRBuilder &B,
4117                                                Intrinsic::ID IID) const {
4118   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4119                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4120   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4121 
4122   Register Dst;
4123 
4124   int OpOffset = 0;
4125   if (HasReturn) {
4126     // A few FP atomics do not support return values.
4127     Dst = MI.getOperand(0).getReg();
4128   } else {
4129     OpOffset = -1;
4130   }
4131 
4132   Register VData = MI.getOperand(2 + OpOffset).getReg();
4133   Register CmpVal;
4134 
4135   if (IsCmpSwap) {
4136     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4137     ++OpOffset;
4138   }
4139 
4140   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4141   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4142 
4143   // The struct intrinsic variants add one additional operand over raw.
4144   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4145   Register VIndex;
4146   if (HasVIndex) {
4147     VIndex = MI.getOperand(4 + OpOffset).getReg();
4148     ++OpOffset;
4149   } else {
4150     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4151   }
4152 
4153   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4154   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4155   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4156 
4157   MachineMemOperand *MMO = *MI.memoperands_begin();
4158 
4159   unsigned ImmOffset;
4160   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4161   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4162 
4163   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4164 
4165   if (HasReturn)
4166     MIB.addDef(Dst);
4167 
4168   MIB.addUse(VData); // vdata
4169 
4170   if (IsCmpSwap)
4171     MIB.addReg(CmpVal);
4172 
4173   MIB.addUse(RSrc)               // rsrc
4174      .addUse(VIndex)             // vindex
4175      .addUse(VOffset)            // voffset
4176      .addUse(SOffset)            // soffset
4177      .addImm(ImmOffset)          // offset(imm)
4178      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4179      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4180      .addMemOperand(MMO);
4181 
4182   MI.eraseFromParent();
4183   return true;
4184 }
4185 
/// Turn the s16 typed address operands of \p MI into dword sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
4188 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4189                                       SmallVectorImpl<Register> &PackedAddrs,
4190                                       unsigned ArgOffset,
4191                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4192                                       bool IsA16, bool IsG16) {
4193   const LLT S16 = LLT::scalar(16);
4194   const LLT V2S16 = LLT::fixed_vector(2, 16);
4195   auto EndIdx = Intr->VAddrEnd;
4196 
4197   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4198     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4199     if (!SrcOp.isReg())
4200       continue; // _L to _LZ may have eliminated this.
4201 
4202     Register AddrReg = SrcOp.getReg();
4203 
4204     if ((I < Intr->GradientStart) ||
4205         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4206         (I >= Intr->CoordStart && !IsA16)) {
4207       // Handle any gradient or coordinate operands that should not be packed
4208       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4209       PackedAddrs.push_back(AddrReg);
4210     } else {
4211       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4212       // derivatives dx/dh and dx/dv are packed with undef.
4213       if (((I + 1) >= EndIdx) ||
4214           ((Intr->NumGradients / 2) % 2 == 1 &&
4215            (I == static_cast<unsigned>(Intr->GradientStart +
4216                                        (Intr->NumGradients / 2) - 1) ||
4217             I == static_cast<unsigned>(Intr->GradientStart +
4218                                        Intr->NumGradients - 1))) ||
4219           // Check for _L to _LZ optimization
4220           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4221         PackedAddrs.push_back(
4222             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4223                 .getReg(0));
4224       } else {
4225         PackedAddrs.push_back(
4226             B.buildBuildVector(
4227                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4228                 .getReg(0));
4229         ++I;
4230       }
4231     }
4232   }
4233 }
4234 
4235 /// Convert from separate vaddr components to a single vector address register,
4236 /// and replace the remaining operands with $noreg.
4237 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4238                                      int DimIdx, int NumVAddrs) {
4239   const LLT S32 = LLT::scalar(32);
4240 
4241   SmallVector<Register, 8> AddrRegs;
4242   for (int I = 0; I != NumVAddrs; ++I) {
4243     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4244     if (SrcOp.isReg()) {
4245       AddrRegs.push_back(SrcOp.getReg());
4246       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4247     }
4248   }
4249 
4250   int NumAddrRegs = AddrRegs.size();
4251   if (NumAddrRegs != 1) {
4252     // Above 8 elements round up to next power of 2 (i.e. 16).
4253     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4254       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4255       auto Undef = B.buildUndef(S32);
4256       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4257       NumAddrRegs = RoundedNumRegs;
4258     }
4259 
4260     auto VAddr =
4261         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4262     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4263   }
4264 
4265   for (int I = 1; I != NumVAddrs; ++I) {
4266     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4267     if (SrcOp.isReg())
4268       SrcOp.setReg(AMDGPU::NoRegister);
4269   }
4270 }
4271 
4272 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4273 ///
4274 /// Depending on the subtarget, load/store with 16-bit element data need to be
4275 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4276 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4277 /// registers.
4278 ///
4279 /// We don't want to directly select image instructions just yet, but also want
4280 /// to expose all register repacking to the legalizer/combiners. We also don't
4281 /// want a selected instruction entering RegBankSelect. In order to avoid
4282 /// defining a multitude of intermediate image instructions, directly hack on
4283 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4284 /// padding the now unnecessary arguments with $noreg.
4285 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4286     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4287     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4288 
4289   const unsigned NumDefs = MI.getNumExplicitDefs();
4290   const unsigned ArgOffset = NumDefs + 1;
4291   bool IsTFE = NumDefs == 2;
4292   // We are only processing the operands of d16 image operations on subtargets
4293   // that use the unpacked register layout, or need to repack the TFE result.
4294 
4295   // TODO: Do we need to guard against already legalized intrinsics?
4296   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4297       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4298 
4299   MachineRegisterInfo *MRI = B.getMRI();
4300   const LLT S32 = LLT::scalar(32);
4301   const LLT S16 = LLT::scalar(16);
4302   const LLT V2S16 = LLT::fixed_vector(2, 16);
4303 
4304   unsigned DMask = 0;
4305 
4306   // Check for 16 bit addresses and gradients, which may need to be packed.
4307   LLT GradTy =
4308       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4309   LLT AddrTy =
4310       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4311   const bool IsG16 = GradTy == S16;
4312   const bool IsA16 = AddrTy == S16;
4313 
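  // Compute the number of result components implied by the dmask. Gather4
  // always returns four components, and a zero dmask on a plain load folds the
  // result to undef.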
4314   int DMaskLanes = 0;
4315   if (!BaseOpcode->Atomic) {
4316     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4317     if (BaseOpcode->Gather4) {
4318       DMaskLanes = 4;
4319     } else if (DMask != 0) {
4320       DMaskLanes = countPopulation(DMask);
4321     } else if (!IsTFE && !BaseOpcode->Store) {
4322       // If dmask is 0, this is a no-op load. This can be eliminated.
4323       B.buildUndef(MI.getOperand(0));
4324       MI.eraseFromParent();
4325       return true;
4326     }
4327   }
4328 
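  // Everything below mutates MI in place; the scope_exit guarantees that
  // changedInstr is reported on every return path.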
4329   Observer.changingInstr(MI);
4330   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4331 
4332   unsigned NewOpcode = NumDefs == 0 ?
4333     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4334 
4335   // Track that we legalized this
4336   MI.setDesc(B.getTII().get(NewOpcode));
4337 
4338   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4339   // dmask to be at least 1, otherwise the instruction will fail.
4340   if (IsTFE && DMask == 0) {
4341     DMask = 0x1;
4342     DMaskLanes = 1;
4343     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4344   }
4345 
4346   if (BaseOpcode->Atomic) {
4347     Register VData0 = MI.getOperand(2).getReg();
4348     LLT Ty = MRI->getType(VData0);
4349 
4350     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4351     if (Ty.isVector())
4352       return false;
4353 
4354     if (BaseOpcode->AtomicX2) {
4355       Register VData1 = MI.getOperand(3).getReg();
4356       // The two values are packed in one register.
4357       LLT PackedTy = LLT::fixed_vector(2, Ty);
4358       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4359       MI.getOperand(2).setReg(Concat.getReg(0));
4360       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4361     }
4362   }
4363 
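  // Track the number of vaddr operands left after the _L and _mip
  // optimizations below; this count feeds the NSA encoding decision later.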
4364   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4365 
4366   // Optimize _L to _LZ when _L is zero
4367   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4368           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4369     const ConstantFP *ConstantLod;
4370 
4371     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4372                  m_GFCst(ConstantLod))) {
4373       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4374         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4375         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4376             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4377                                                       Intr->Dim);
4378 
4379         // The starting indexes should remain in the same place.
4380         --CorrectedNumVAddrs;
4381 
4382         MI.getOperand(MI.getNumExplicitDefs())
4383             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4384         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4385         Intr = NewImageDimIntr;
4386       }
4387     }
4388   }
4389 
4390   // Optimize _mip away when 'lod' is zero.
4391   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4392     int64_t ConstantLod;
4393     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4394                  m_ICst(ConstantLod))) {
4395       if (ConstantLod == 0) {
4396         // TODO: Change the intrinsic opcode and remove the operand instead
4397         // of replacing it with 0, as is done for the _L to _LZ handling above.
4398         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4399         --CorrectedNumVAddrs;
4400       }
4401     }
4402   }
4403 
4404   // Rewrite the addressing register layout before doing anything else.
4405   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4406     // 16 bit gradients are supported, but are tied to the A16 control, so
4407     // both gradients and addresses must be 16 bit.
4408     return false;
4409   }
4410 
4411   if (IsA16 && !ST.hasA16()) {
4412     // A16 not supported
4413     return false;
4414   }
4415 
4416   if (IsA16 || IsG16) {
4417     if (Intr->NumVAddrs > 1) {
4418       SmallVector<Register, 4> PackedRegs;
4419 
4420       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4421                                 IsG16);
4422 
4423       // See also below in the non-a16 branch
4424       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4425                           PackedRegs.size() <= ST.getNSAMaxSize();
4426 
4427       if (!UseNSA && PackedRegs.size() > 1) {
4428         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4429         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4430         PackedRegs[0] = Concat.getReg(0);
4431         PackedRegs.resize(1);
4432       }
4433 
4434       const unsigned NumPacked = PackedRegs.size();
4435       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4436         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4437         if (!SrcOp.isReg()) {
4438           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4439           continue;
4440         }
4441 
4442         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4443 
4444         if (I - Intr->VAddrStart < NumPacked)
4445           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4446         else
4447           SrcOp.setReg(AMDGPU::NoRegister);
4448       }
4449     }
4450   } else {
4451     // If the register allocator cannot place the address registers contiguously
4452     // without introducing moves, then using the non-sequential address encoding
4453     // is always preferable, since it saves VALU instructions and is usually a
4454     // wash in terms of code size or even better.
4455     //
4456     // However, we currently have no way of hinting to the register allocator
4457     // that MIMG addresses should be placed contiguously when it is possible to
4458     // do so, so force non-NSA for the common 2-address case as a heuristic.
4459     //
4460     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4461     // allocation when possible.
4462     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4463                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4464 
4465     if (!UseNSA && Intr->NumVAddrs > 1)
4466       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4467                                Intr->NumVAddrs);
4468   }
4469 
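  // Record the 16-bit address modes in an immediate operand: bit 0 for A16
  // addresses, bit 1 for G16 gradients.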
4470   int Flags = 0;
4471   if (IsA16)
4472     Flags |= 1;
4473   if (IsG16)
4474     Flags |= 2;
4475   MI.addOperand(MachineOperand::CreateImm(Flags));
4476 
4477   if (BaseOpcode->Store) { // No TFE for stores?
4478     // TODO: Handle dmask trim
4479     Register VData = MI.getOperand(1).getReg();
4480     LLT Ty = MRI->getType(VData);
4481     if (!Ty.isVector() || Ty.getElementType() != S16)
4482       return true;
4483 
4484     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4485     if (RepackedReg != VData) {
4486       MI.getOperand(1).setReg(RepackedReg);
4487     }
4488 
4489     return true;
4490   }
4491 
4492   Register DstReg = MI.getOperand(0).getReg();
4493   LLT Ty = MRI->getType(DstReg);
4494   const LLT EltTy = Ty.getScalarType();
4495   const bool IsD16 = Ty.getScalarType() == S16;
4496   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4497 
4498   // Confirm that the return type is large enough for the dmask specified
4499   if (NumElts < DMaskLanes)
4500     return false;
4501 
4502   if (NumElts > 4 || DMaskLanes > 4)
4503     return false;
4504 
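  // A dmask lane count of 0 (e.g. for atomics) still corresponds to a single
  // data element.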
4505   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4506   const LLT AdjustedTy =
4507       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4508 
4509   // The raw dword aligned data component of the load. The only legal cases
4510   // where this matters should be when using the packed D16 format, for
4511   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4512   LLT RoundedTy;
4513 
4514   // S32 vector to cover all data, plus the TFE result element.
4515   LLT TFETy;
4516 
4517   // Register type to use for each loaded component. Will be S32 or V2S16.
4518   LLT RegTy;
4519 
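  // With unpacked D16, each 16-bit component lives in the low half of its own
  // 32-bit register, so every component takes a full dword.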
4520   if (IsD16 && ST.hasUnpackedD16VMem()) {
4521     RoundedTy =
4522         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4523     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4524     RegTy = S32;
4525   } else {
4526     unsigned EltSize = EltTy.getSizeInBits();
4527     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4528     unsigned RoundedSize = 32 * RoundedElts;
4529     RoundedTy = LLT::scalarOrVector(
4530         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4531     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4532     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4533   }
4534 
4535   // The return type does not need adjustment.
4536   // TODO: Should we change s16 case to s32 or <2 x s16>?
4537   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4538     return true;
4539 
4540   Register Dst1Reg;
4541 
4542   // Insert after the instruction.
4543   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4544 
4545   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4546   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4547   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4548   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4549 
4550   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4551 
4552   MI.getOperand(0).setReg(NewResultReg);
4553 
4554   // In the IR, TFE is supposed to be used with a 2 element struct return
4555   // type. The instruction really returns these two values in one contiguous
4556   // register, with one additional dword beyond the loaded data. Rewrite the
4557   // return type to use a single register result.
4558 
4559   if (IsTFE) {
4560     Dst1Reg = MI.getOperand(1).getReg();
4561     if (MRI->getType(Dst1Reg) != S32)
4562       return false;
4563 
4564     // TODO: Make sure the TFE operand bit is set.
4565     MI.RemoveOperand(1);
4566 
4567     // Handle the easy case that requires no repack instructions.
4568     if (Ty == S32) {
4569       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4570       return true;
4571     }
4572   }
4573 
4574   // Now figure out how to copy the new result register back into the old
4575   // result.
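  // Seed every slot with Dst1Reg so the trailing TFE element of the unmerge
  // below is written directly into the TFE destination.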
4576   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4577 
4578   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4579 
4580   if (ResultNumRegs == 1) {
4581     assert(!IsTFE);
4582     ResultRegs[0] = NewResultReg;
4583   } else {
4584     // We have to repack into a new vector of some kind.
4585     for (int I = 0; I != NumDataRegs; ++I)
4586       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4587     B.buildUnmerge(ResultRegs, NewResultReg);
4588 
4589     // Drop the final TFE element to get the data part. The TFE result is
4590     // directly written to the right place already.
4591     if (IsTFE)
4592       ResultRegs.resize(NumDataRegs);
4593   }
4594 
4595   // For an s16 scalar result, we form an s32 result with a truncate regardless
4596   // of packed vs. unpacked.
4597   if (IsD16 && !Ty.isVector()) {
4598     B.buildTrunc(DstReg, ResultRegs[0]);
4599     return true;
4600   }
4601 
4602   // Avoid a build/concat_vector of 1 entry.
4603   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4604     B.buildBitcast(DstReg, ResultRegs[0]);
4605     return true;
4606   }
4607 
4608   assert(Ty.isVector());
4609 
4610   if (IsD16) {
4611     // For packed D16 results with TFE enabled, all the data components are
4612     // S32. Cast back to the expected type.
4613     //
4614     // TODO: We don't really need to load s32 elements. We would only need
4615     // one cast for the TFE result if a multiple of v2s16 was used.
4616     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4617       for (Register &Reg : ResultRegs)
4618         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4619     } else if (ST.hasUnpackedD16VMem()) {
4620       for (Register &Reg : ResultRegs)
4621         Reg = B.buildTrunc(S16, Reg).getReg(0);
4622     }
4623   }
4624 
4625   auto padWithUndef = [&](LLT Ty, int NumElts) {
4626     if (NumElts == 0)
4627       return;
4628     Register Undef = B.buildUndef(Ty).getReg(0);
4629     for (int I = 0; I != NumElts; ++I)
4630       ResultRegs.push_back(Undef);
4631   };
4632 
4633   // Pad out any elements eliminated due to the dmask.
4634   LLT ResTy = MRI->getType(ResultRegs[0]);
4635   if (!ResTy.isVector()) {
4636     padWithUndef(ResTy, NumElts - ResultRegs.size());
4637     B.buildBuildVector(DstReg, ResultRegs);
4638     return true;
4639   }
4640 
4641   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4642   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4643 
4644   // Deal with the one annoying legal case.
4645   const LLT V3S16 = LLT::fixed_vector(3, 16);
4646   if (Ty == V3S16) {
4647     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4648     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4649     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4650     return true;
4651   }
4652 
4653   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4654   B.buildConcatVectors(DstReg, ResultRegs);
4655   return true;
4656 }
4657 
4658 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4659   LegalizerHelper &Helper, MachineInstr &MI) const {
4660   MachineIRBuilder &B = Helper.MIRBuilder;
4661   GISelChangeObserver &Observer = Helper.Observer;
4662 
4663   Register Dst = MI.getOperand(0).getReg();
4664   LLT Ty = B.getMRI()->getType(Dst);
4665   unsigned Size = Ty.getSizeInBits();
4666   MachineFunction &MF = B.getMF();
4667 
4668   Observer.changingInstr(MI);
4669 
4670   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4671     Ty = getBitcastRegisterType(Ty);
4672     Helper.bitcastDst(MI, Ty, 0);
4673     Dst = MI.getOperand(0).getReg();
4674     B.setInsertPt(B.getMBB(), MI);
4675   }
4676 
4677   // FIXME: We don't really need this intermediate instruction. The intrinsic
4678   // should be fixed to have a memory operand. Since it's readnone, we're not
4679   // allowed to add one.
4680   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4681   MI.RemoveOperand(1); // Remove intrinsic ID
4682 
4683   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4684   // TODO: Should this use datalayout alignment?
4685   const unsigned MemSize = (Size + 7) / 8;
4686   const Align MemAlign(4);
4687   MachineMemOperand *MMO = MF.getMachineMemOperand(
4688       MachinePointerInfo(),
4689       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4690           MachineMemOperand::MOInvariant,
4691       MemSize, MemAlign);
4692   MI.addMemOperand(MF, MMO);
4693 
4694   // There are no 96-bit result scalar loads, but widening to 128-bit should
4695   // always be legal. We may need to restore this to a 96-bit result if it turns
4696   // out this needs to be converted to a vector load during RegBankSelect.
4697   if (!isPowerOf2_32(Size)) {
4698     if (Ty.isVector())
4699       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4700     else
4701       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4702   }
4703 
4704   Observer.changedInstr(MI);
4705   return true;
4706 }
4707 
4708 // TODO: Move to selection
4709 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4710                                                 MachineRegisterInfo &MRI,
4711                                                 MachineIRBuilder &B) const {
4712   if (!ST.isTrapHandlerEnabled() ||
4713       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4714     return legalizeTrapEndpgm(MI, MRI, B);
4715 
4716   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4717     switch (*HsaAbiVer) {
4718     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4719     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4720       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4721     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4722       return ST.supportsGetDoorbellID() ?
4723           legalizeTrapHsa(MI, MRI, B) :
4724           legalizeTrapHsaQueuePtr(MI, MRI, B);
4725     }
4726   }
4727 
4728   llvm_unreachable("Unknown trap handler");
4729 }
4730 
4731 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4732     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4733   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4734   MI.eraseFromParent();
4735   return true;
4736 }
4737 
4738 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4739     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4740   // Pass queue pointer to trap handler as input, and insert trap instruction
4741   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4742   Register LiveIn =
4743     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4744   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4745     return false;
4746 
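  // The HSA trap handler ABI expects the queue pointer in SGPR0_SGPR1.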
4747   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4748   B.buildCopy(SGPR01, LiveIn);
4749   B.buildInstr(AMDGPU::S_TRAP)
4750       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4751       .addReg(SGPR01, RegState::Implicit);
4752 
4753   MI.eraseFromParent();
4754   return true;
4755 }
4756 
4757 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4758     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4759   B.buildInstr(AMDGPU::S_TRAP)
4760       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4761   MI.eraseFromParent();
4762   return true;
4763 }
4764 
4765 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4766     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4767   // If this is a non-HSA path or the trap handler is disabled, report a
4768   // warning.
4769   if (!ST.isTrapHandlerEnabled() ||
4770       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4771     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4772                                      "debugtrap handler not supported",
4773                                      MI.getDebugLoc(), DS_Warning);
4774     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4775     Ctx.diagnose(NoTrap);
4776   } else {
4777     // Insert debug-trap instruction
4778     B.buildInstr(AMDGPU::S_TRAP)
4779         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4780   }
4781 
4782   MI.eraseFromParent();
4783   return true;
4784 }
4785 
4786 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4787                                                MachineIRBuilder &B) const {
4788   MachineRegisterInfo &MRI = *B.getMRI();
4789   const LLT S16 = LLT::scalar(16);
4790   const LLT S32 = LLT::scalar(32);
4791 
4792   Register DstReg = MI.getOperand(0).getReg();
4793   Register NodePtr = MI.getOperand(2).getReg();
4794   Register RayExtent = MI.getOperand(3).getReg();
4795   Register RayOrigin = MI.getOperand(4).getReg();
4796   Register RayDir = MI.getOperand(5).getReg();
4797   Register RayInvDir = MI.getOperand(6).getReg();
4798   Register TDescr = MI.getOperand(7).getReg();
4799 
4800   if (!ST.hasGFX10_AEncoding()) {
4801     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4802                                         "intrinsic not supported on subtarget",
4803                                         MI.getDebugLoc());
4804     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4805     return false;
4806   }
4807 
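  // The node pointer may be 32 or 64 bits and the ray vectors may use 16-bit
  // components; together these select the opcode variant and the number of
  // vaddr operands.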
4808   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4809   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4810   const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4811   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
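  // Opcode table indexed as [UseNSA][IsA16][Is64].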
4812   const unsigned Opcodes[2][2][2] = {
4813       {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
4814         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
4815        {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
4816         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
4817       {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
4818         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
4819        {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
4820         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
4821   const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
4822 
4823   SmallVector<Register, 12> Ops;
4824   if (Is64) {
4825     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4826     Ops.push_back(Unmerge.getReg(0));
4827     Ops.push_back(Unmerge.getReg(1));
4828   } else {
4829     Ops.push_back(NodePtr);
4830   }
4831   Ops.push_back(RayExtent);
4832 
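  // Split a 128-bit vector operand into dwords and append only the x, y and z
  // lanes; the fourth lane is unused.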
4833   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4834     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4835     Ops.push_back(Unmerge.getReg(0));
4836     Ops.push_back(Unmerge.getReg(1));
4837     Ops.push_back(Unmerge.getReg(2));
4838   };
4839 
4840   packLanes(RayOrigin);
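  // With 16-bit components, repack the six halves of RayDir and RayInvDir into
  // three dwords.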
4841   if (IsA16) {
4842     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4843     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4844     Register R1 = MRI.createGenericVirtualRegister(S32);
4845     Register R2 = MRI.createGenericVirtualRegister(S32);
4846     Register R3 = MRI.createGenericVirtualRegister(S32);
4847     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4848     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4849     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4850     Ops.push_back(R1);
4851     Ops.push_back(R2);
4852     Ops.push_back(R3);
4853   } else {
4854     packLanes(RayDir);
4855     packLanes(RayInvDir);
4856   }
4857 
4858   if (!UseNSA) {
4859     // Build a single vector containing all the operands prepared so far.
4860     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4861     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4862     Ops.clear();
4863     Ops.push_back(MergedOps);
4864   }
4865 
4866   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4867     .addDef(DstReg)
4868     .addImm(Opcode);
4869 
4870   for (Register R : Ops) {
4871     MIB.addUse(R);
4872   }
4873 
4874   MIB.addUse(TDescr)
4875      .addImm(IsA16 ? 1 : 0)
4876      .cloneMemRefs(MI);
4877 
4878   MI.eraseFromParent();
4879   return true;
4880 }
4881 
4882 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4883                                             MachineInstr &MI) const {
4884   MachineIRBuilder &B = Helper.MIRBuilder;
4885   MachineRegisterInfo &MRI = *B.getMRI();
4886 
4887   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4888   auto IntrID = MI.getIntrinsicID();
4889   switch (IntrID) {
4890   case Intrinsic::amdgcn_if:
4891   case Intrinsic::amdgcn_else: {
4892     MachineInstr *Br = nullptr;
4893     MachineBasicBlock *UncondBrTarget = nullptr;
4894     bool Negated = false;
4895     if (MachineInstr *BrCond =
4896             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4897       const SIRegisterInfo *TRI
4898         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4899 
4900       Register Def = MI.getOperand(1).getReg();
4901       Register Use = MI.getOperand(3).getReg();
4902 
4903       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4904 
4905       if (Negated)
4906         std::swap(CondBrTarget, UncondBrTarget);
4907 
4908       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4909       if (IntrID == Intrinsic::amdgcn_if) {
4910         B.buildInstr(AMDGPU::SI_IF)
4911           .addDef(Def)
4912           .addUse(Use)
4913           .addMBB(UncondBrTarget);
4914       } else {
4915         B.buildInstr(AMDGPU::SI_ELSE)
4916             .addDef(Def)
4917             .addUse(Use)
4918             .addMBB(UncondBrTarget);
4919       }
4920 
4921       if (Br) {
4922         Br->getOperand(0).setMBB(CondBrTarget);
4923       } else {
4924         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4925         // since we're swapping branch targets it needs to be reinserted.
4926         // FIXME: IRTranslator should probably not do this
4927         B.buildBr(*CondBrTarget);
4928       }
4929 
4930       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4931       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4932       MI.eraseFromParent();
4933       BrCond->eraseFromParent();
4934       return true;
4935     }
4936 
4937     return false;
4938   }
4939   case Intrinsic::amdgcn_loop: {
4940     MachineInstr *Br = nullptr;
4941     MachineBasicBlock *UncondBrTarget = nullptr;
4942     bool Negated = false;
4943     if (MachineInstr *BrCond =
4944             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4945       const SIRegisterInfo *TRI
4946         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4947 
4948       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4949       Register Reg = MI.getOperand(2).getReg();
4950 
4951       if (Negated)
4952         std::swap(CondBrTarget, UncondBrTarget);
4953 
4954       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4955       B.buildInstr(AMDGPU::SI_LOOP)
4956         .addUse(Reg)
4957         .addMBB(UncondBrTarget);
4958 
4959       if (Br)
4960         Br->getOperand(0).setMBB(CondBrTarget);
4961       else
4962         B.buildBr(*CondBrTarget);
4963 
4964       MI.eraseFromParent();
4965       BrCond->eraseFromParent();
4966       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4967       return true;
4968     }
4969 
4970     return false;
4971   }
4972   case Intrinsic::amdgcn_kernarg_segment_ptr:
4973     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4974       // This only makes sense to call in a kernel, so just lower to null.
4975       B.buildConstant(MI.getOperand(0).getReg(), 0);
4976       MI.eraseFromParent();
4977       return true;
4978     }
4979 
4980     return legalizePreloadedArgIntrin(
4981       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4982   case Intrinsic::amdgcn_implicitarg_ptr:
4983     return legalizeImplicitArgPtr(MI, MRI, B);
4984   case Intrinsic::amdgcn_workitem_id_x:
4985     return legalizePreloadedArgIntrin(MI, MRI, B,
4986                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4987   case Intrinsic::amdgcn_workitem_id_y:
4988     return legalizePreloadedArgIntrin(MI, MRI, B,
4989                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4990   case Intrinsic::amdgcn_workitem_id_z:
4991     return legalizePreloadedArgIntrin(MI, MRI, B,
4992                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4993   case Intrinsic::amdgcn_workgroup_id_x:
4994     return legalizePreloadedArgIntrin(MI, MRI, B,
4995                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4996   case Intrinsic::amdgcn_workgroup_id_y:
4997     return legalizePreloadedArgIntrin(MI, MRI, B,
4998                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4999   case Intrinsic::amdgcn_workgroup_id_z:
5000     return legalizePreloadedArgIntrin(MI, MRI, B,
5001                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5002   case Intrinsic::amdgcn_dispatch_ptr:
5003     return legalizePreloadedArgIntrin(MI, MRI, B,
5004                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5005   case Intrinsic::amdgcn_queue_ptr:
5006     return legalizePreloadedArgIntrin(MI, MRI, B,
5007                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5008   case Intrinsic::amdgcn_implicit_buffer_ptr:
5009     return legalizePreloadedArgIntrin(
5010       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5011   case Intrinsic::amdgcn_dispatch_id:
5012     return legalizePreloadedArgIntrin(MI, MRI, B,
5013                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5014   case Intrinsic::amdgcn_fdiv_fast:
5015     return legalizeFDIVFastIntrin(MI, MRI, B);
5016   case Intrinsic::amdgcn_is_shared:
5017     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5018   case Intrinsic::amdgcn_is_private:
5019     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5020   case Intrinsic::amdgcn_wavefrontsize: {
5021     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5022     MI.eraseFromParent();
5023     return true;
5024   }
5025   case Intrinsic::amdgcn_s_buffer_load:
5026     return legalizeSBufferLoad(Helper, MI);
5027   case Intrinsic::amdgcn_raw_buffer_store:
5028   case Intrinsic::amdgcn_struct_buffer_store:
5029     return legalizeBufferStore(MI, MRI, B, false, false);
5030   case Intrinsic::amdgcn_raw_buffer_store_format:
5031   case Intrinsic::amdgcn_struct_buffer_store_format:
5032     return legalizeBufferStore(MI, MRI, B, false, true);
5033   case Intrinsic::amdgcn_raw_tbuffer_store:
5034   case Intrinsic::amdgcn_struct_tbuffer_store:
5035     return legalizeBufferStore(MI, MRI, B, true, true);
5036   case Intrinsic::amdgcn_raw_buffer_load:
5037   case Intrinsic::amdgcn_struct_buffer_load:
5038     return legalizeBufferLoad(MI, MRI, B, false, false);
5039   case Intrinsic::amdgcn_raw_buffer_load_format:
5040   case Intrinsic::amdgcn_struct_buffer_load_format:
5041     return legalizeBufferLoad(MI, MRI, B, true, false);
5042   case Intrinsic::amdgcn_raw_tbuffer_load:
5043   case Intrinsic::amdgcn_struct_tbuffer_load:
5044     return legalizeBufferLoad(MI, MRI, B, true, true);
5045   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5046   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5047   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5048   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5049   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5050   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5051   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5052   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5053   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5054   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5055   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5056   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5057   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5058   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5059   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5060   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5061   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5062   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5063   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5064   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5065   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5066   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5067   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5068   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5069   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5070   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5071   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5072   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5073   case Intrinsic::amdgcn_buffer_atomic_fadd:
5074   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5075   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5076   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5077   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5078     return legalizeBufferAtomic(MI, B, IntrID);
5079   case Intrinsic::amdgcn_atomic_inc:
5080     return legalizeAtomicIncDec(MI, B, true);
5081   case Intrinsic::amdgcn_atomic_dec:
5082     return legalizeAtomicIncDec(MI, B, false);
5083   case Intrinsic::trap:
5084     return legalizeTrapIntrinsic(MI, MRI, B);
5085   case Intrinsic::debugtrap:
5086     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5087   case Intrinsic::amdgcn_rsq_clamp:
5088     return legalizeRsqClampIntrinsic(MI, MRI, B);
5089   case Intrinsic::amdgcn_ds_fadd:
5090   case Intrinsic::amdgcn_ds_fmin:
5091   case Intrinsic::amdgcn_ds_fmax:
5092     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5093   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5094     return legalizeBVHIntrinsic(MI, B);
5095   default: {
5096     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5097             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5098       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5099     return true;
5100   }
5101   }
5102 
5103   return true;
5104 }
5105