1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
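// Matches types whose total size in bits is a multiple of 32.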
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
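// Matches vectors with 16-bit elements and more than two elements.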
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
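// Widen the vector by appending one more element of the same element type.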
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
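// Reduce the number of vector elements so that each resulting piece is at
// most 64 bits wide, e.g. <5 x s32> -> <2 x s32>.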
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits, e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits).
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
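// Mutation wrapper around getBitcastRegisterType for the given type index.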
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
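// Bitcast to a scalar or vector with 32-bit elements; the total size must
// already be a multiple of 32 bits.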
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
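// True if this size is a multiple of 32 bits and no larger than the maximum
// register size.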
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
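// True for element types that can appear in a register vector: 16-bit or a
// multiple of 32 bits.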
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
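// True for vector types that map directly onto registers: 32, 64, 128 or
// 256-bit elements, or an even number of 16-bit elements.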
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
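// Matches vectors whose element type is s16 or at least 32 bits wide.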
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally the constant address space should be
    // eliminated) depending on the context. Legality cannot be context
    // dependent, but RegBankSelect can split the load as necessary depending
    // on the pointer register bank/uniformity and if the memory is invariant
    // or not written in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
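// Return true if the register type, memory size, address space and alignment
// of this load/store are something we can handle directly.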
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
#endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
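// Combined legality check for loads and stores.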
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
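// Query-based overload; atomic accesses are never widened.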
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
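  // Build the LLT pointer type for an address space using the target's
  // pointer size.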
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop forever
    // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816                     .lowerIf(typeIs(1, S1))
817                     .customFor({{S32, S64}, {S64, S64}});
818   if (ST.has16BitInsts())
819     IToFP.legalFor({{S16, S16}});
820   IToFP.clampScalar(1, S32, S64)
821        .minScalar(0, S32)
822        .scalarize(0)
823        .widenScalarToNextPow2(1);
824 
825   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827     .customFor({{S64, S32}, {S64, S64}})
828     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829   if (ST.has16BitInsts())
830     FPToI.legalFor({{S16, S16}});
831   else
832     FPToI.minScalar(1, S32);
833 
834   FPToI.minScalar(0, S32)
835        .widenScalarToNextPow2(0, 32)
836        .scalarize(0)
837        .lower();
838 
839   // Lower roundeven into G_FRINT
840   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841     .scalarize(0)
842     .lower();
843 
844   if (ST.has16BitInsts()) {
845     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846       .legalFor({S16, S32, S64})
847       .clampScalar(0, S16, S64)
848       .scalarize(0);
849   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851       .legalFor({S32, S64})
852       .clampScalar(0, S32, S64)
853       .scalarize(0);
854   } else {
855     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856       .legalFor({S32})
857       .customFor({S64})
858       .clampScalar(0, S32, S64)
859       .scalarize(0);
860   }
861 
862   getActionDefinitionsBuilder(G_PTR_ADD)
863     .legalIf(all(isPointer(0), sameSize(0, 1)))
864     .scalarize(0)
865     .scalarSameSizeAs(1, 0);
866 
867   getActionDefinitionsBuilder(G_PTRMASK)
868     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869     .scalarSameSizeAs(1, 0)
870     .scalarize(0);
871 
872   auto &CmpBuilder =
873     getActionDefinitionsBuilder(G_ICMP)
874     // The compare output type differs based on the register bank of the output,
875     // so make both s1 and s32 legal.
876     //
877     // Scalar compares producing output in scc will be promoted to s32, as that
878     // is the allocatable register type that will be needed for the copy from
879     // scc. This will be promoted during RegBankSelect, and we assume something
880     // before that won't try to use s32 result types.
881     //
882     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883     // bank.
884     .legalForCartesianProduct(
885       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886     .legalForCartesianProduct(
887       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888   if (ST.has16BitInsts()) {
889     CmpBuilder.legalFor({{S1, S16}});
890   }
891 
892   CmpBuilder
893     .widenScalarToNextPow2(1)
894     .clampScalar(1, S32, S64)
895     .scalarize(0)
896     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
897 
898   getActionDefinitionsBuilder(G_FCMP)
899     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900     .widenScalarToNextPow2(1)
901     .clampScalar(1, S32, S64)
902     .scalarize(0);
903 
904   // FIXME: fpow has a selection pattern that should move to custom lowering.
905   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906   if (ST.has16BitInsts())
907     Exp2Ops.legalFor({S32, S16});
908   else
909     Exp2Ops.legalFor({S32});
910   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911   Exp2Ops.scalarize(0);
912 
913   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914   if (ST.has16BitInsts())
915     ExpOps.customFor({{S32}, {S16}});
916   else
917     ExpOps.customFor({S32});
918   ExpOps.clampScalar(0, MinScalarFPTy, S32)
919         .scalarize(0);
920 
921   getActionDefinitionsBuilder(G_FPOWI)
922     .clampScalar(0, MinScalarFPTy, S32)
923     .lower();
924 
925   // The 64-bit versions produce 32-bit results, but only on the SALU.
926   getActionDefinitionsBuilder(G_CTPOP)
927     .legalFor({{S32, S32}, {S32, S64}})
928     .clampScalar(0, S32, S32)
929     .clampScalar(1, S32, S64)
930     .scalarize(0)
931     .widenScalarToNextPow2(0, 32)
932     .widenScalarToNextPow2(1, 32);
933 
934   // The hardware instructions return a different result on 0 than the generic
935   // instructions expect. The hardware produces -1, but these produce the
936   // bitwidth.
937   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938     .scalarize(0)
939     .clampScalar(0, S32, S32)
940     .clampScalar(1, S32, S64)
941     .widenScalarToNextPow2(0, 32)
942     .widenScalarToNextPow2(1, 32)
943     .custom();
944 
945   // The 64-bit versions produce 32-bit results, but only on the SALU.
946   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947     .legalFor({{S32, S32}, {S32, S64}})
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .scalarize(0)
951     .widenScalarToNextPow2(0, 32)
952     .widenScalarToNextPow2(1, 32);
953 
954   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955   // RegBankSelect.
956   getActionDefinitionsBuilder(G_BITREVERSE)
957     .legalFor({S32, S64})
958     .clampScalar(0, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0);
961 
962   if (ST.has16BitInsts()) {
963     getActionDefinitionsBuilder(G_BSWAP)
964       .legalFor({S16, S32, V2S16})
965       .clampMaxNumElements(0, S16, 2)
966       // FIXME: Fixing non-power-of-2 before clamp is workaround for
967       // narrowScalar limitation.
968       .widenScalarToNextPow2(0)
969       .clampScalar(0, S16, S32)
970       .scalarize(0);
971 
972     if (ST.hasVOP3PInsts()) {
973       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974         .legalFor({S32, S16, V2S16})
975         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976         .clampMaxNumElements(0, S16, 2)
977         .minScalar(0, S16)
978         .widenScalarToNextPow2(0)
979         .scalarize(0)
980         .lower();
981     } else {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16})
984         .widenScalarToNextPow2(0)
985         .minScalar(0, S16)
986         .scalarize(0)
987         .lower();
988     }
989   } else {
990     // TODO: Should have same legality without v_perm_b32
991     getActionDefinitionsBuilder(G_BSWAP)
992       .legalFor({S32})
993       .lowerIf(scalarNarrowerThan(0, 32))
994       // FIXME: Fixing non-power-of-2 before clamp is workaround for
995       // narrowScalar limitation.
996       .widenScalarToNextPow2(0)
997       .maxScalar(0, S32)
998       .scalarize(0)
999       .lower();
1000 
1001     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002       .legalFor({S32})
1003       .minScalar(0, S32)
1004       .widenScalarToNextPow2(0)
1005       .scalarize(0)
1006       .lower();
1007   }
1008 
1009   getActionDefinitionsBuilder(G_INTTOPTR)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       })
1020     .narrowScalarIf(largerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       });
1024 
1025   getActionDefinitionsBuilder(G_PTRTOINT)
1026     // List the common cases
1027     .legalForCartesianProduct(AddrSpaces64, {S64})
1028     .legalForCartesianProduct(AddrSpaces32, {S32})
1029     .scalarize(0)
1030     // Accept any address space as long as the size matches
1031     .legalIf(sameSize(0, 1))
1032     .widenScalarIf(smallerThan(0, 1),
1033       [](const LegalityQuery &Query) {
1034         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035       })
1036     .narrowScalarIf(
1037       largerThan(0, 1),
1038       [](const LegalityQuery &Query) {
1039         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040       });
1041 
1042   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043     .scalarize(0)
1044     .custom();
1045 
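  // Return true if this memory access must be split into multiple operations:
  // vector extloads, accesses wider than the address space allows, an
  // unsupported number of 32-bit registers, or insufficient alignment.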
1046   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047                                     bool IsLoad) -> bool {
1048     const LLT DstTy = Query.Types[0];
1049 
1050     // Split vector extloads.
1051     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053 
1054     if (MemSize < DstTy.getSizeInBits())
1055       MemSize = std::max(MemSize, AlignBits);
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     if (AlignBits < MemSize) {
1078       const SITargetLowering *TLI = ST.getTargetLowering();
1079       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080                                                       Align(AlignBits / 8));
1081     }
1082 
1083     return false;
1084   };
1085 
1086   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089 
1090   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091   // LDS
1092   // TODO: Unsupported flat for SI.
1093 
1094   for (unsigned Op : {G_LOAD, G_STORE}) {
1095     const bool IsStore = Op == G_STORE;
1096 
1097     auto &Actions = getActionDefinitionsBuilder(Op);
1098     // Explicitly list some common cases.
1099     // TODO: Does this help compile time at all?
1100     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103                                       {S64, GlobalPtr, S64, GlobalAlign32},
1104                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106                                       {S32, GlobalPtr, S8, GlobalAlign8},
1107                                       {S32, GlobalPtr, S16, GlobalAlign16},
1108 
1109                                       {S32, LocalPtr, S32, 32},
1110                                       {S64, LocalPtr, S64, 32},
1111                                       {V2S32, LocalPtr, V2S32, 32},
1112                                       {S32, LocalPtr, S8, 8},
1113                                       {S32, LocalPtr, S16, 16},
1114                                       {V2S16, LocalPtr, S32, 32},
1115 
1116                                       {S32, PrivatePtr, S32, 32},
1117                                       {S32, PrivatePtr, S8, 8},
1118                                       {S32, PrivatePtr, S16, 16},
1119                                       {V2S16, PrivatePtr, S32, 32},
1120 
1121                                       {S32, ConstantPtr, S32, GlobalAlign32},
1122                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124                                       {S64, ConstantPtr, S64, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126     Actions.legalIf(
1127       [=](const LegalityQuery &Query) -> bool {
1128         return isLoadStoreLegal(ST, Query);
1129       });
1130 
1131     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132     // 64-bits.
1133     //
1134     // TODO: Should generalize bitcast action into coerce, which will also cover
1135     // inserting addrspacecasts.
1136     Actions.customIf(typeIs(1, Constant32Ptr));
1137 
1138     // Turn any illegal element vectors into something easier to deal
1139     // with. These will ultimately produce 32-bit scalar shifts to extract the
1140     // parts anyway.
1141     //
1142     // For odd 16-bit element vectors, prefer to split those into pieces with
1143     // 16-bit vector parts.
1144     Actions.bitcastIf(
1145       [=](const LegalityQuery &Query) -> bool {
1146         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147                                           Query.MMODescrs[0].MemoryTy);
1148       }, bitcastToRegisterType(0));
1149 
1150     if (!IsStore) {
1151       // Widen suitably aligned loads by loading extra bytes. The standard
1152       // legalization actions can't properly express widening memory operands.
1153       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154         return shouldWidenLoad(ST, Query, G_LOAD);
1155       });
1156     }
1157 
1158     // FIXME: load/store narrowing should be moved to lower action
1159     Actions
1160         .narrowScalarIf(
1161             [=](const LegalityQuery &Query) -> bool {
1162               return !Query.Types[0].isVector() &&
1163                      needToSplitMemOp(Query, Op == G_LOAD);
1164             },
1165             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166               const LLT DstTy = Query.Types[0];
1167               const LLT PtrTy = Query.Types[1];
1168 
1169               const unsigned DstSize = DstTy.getSizeInBits();
1170               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171 
1172               // Split extloads.
1173               if (DstSize > MemSize)
1174                 return std::make_pair(0, LLT::scalar(MemSize));
1175 
1176               if (!isPowerOf2_32(DstSize)) {
1177                 // We're probably decomposing an odd sized store. Try to split
1178                 // to the widest type. TODO: Account for alignment. As-is it
1179                 // should be OK, since the new parts will be further legalized.
1180                 unsigned FloorSize = PowerOf2Floor(DstSize);
1181                 return std::make_pair(0, LLT::scalar(FloorSize));
1182               }
1183 
1184               if (DstSize > 32 && (DstSize % 32 != 0)) {
1185                 // FIXME: Need a way to specify non-extload of larger size if
1186                 // suitably aligned.
1187                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188               }
1189 
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193               if (MemSize > MaxSize)
1194                 return std::make_pair(0, LLT::scalar(MaxSize));
1195 
1196               unsigned Align = Query.MMODescrs[0].AlignInBits;
1197               return std::make_pair(0, LLT::scalar(Align));
1198             })
1199         .fewerElementsIf(
1200             [=](const LegalityQuery &Query) -> bool {
1201               return Query.Types[0].isVector() &&
1202                      needToSplitMemOp(Query, Op == G_LOAD);
1203             },
1204             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205               const LLT DstTy = Query.Types[0];
1206               const LLT PtrTy = Query.Types[1];
1207 
1208               LLT EltTy = DstTy.getElementType();
1209               unsigned MaxSize = maxSizeForAddrSpace(ST,
1210                                                      PtrTy.getAddressSpace(),
1211                                                      Op == G_LOAD);
1212 
1213               // FIXME: Handle widened to power of 2 results better. This ends
1214               // up scalarizing.
1215               // FIXME: 3 element stores scalarized on SI
1216 
1217               // Split if it's too large for the address space.
1218               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219               if (MemSize > MaxSize) {
1220                 unsigned NumElts = DstTy.getNumElements();
1221                 unsigned EltSize = EltTy.getSizeInBits();
1222 
1223                 if (MaxSize % EltSize == 0) {
1224                   return std::make_pair(
1225                       0, LLT::scalarOrVector(
1226                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227                 }
1228 
1229                 unsigned NumPieces = MemSize / MaxSize;
1230 
1231                 // FIXME: Refine when odd breakdowns handled
1232                 // The scalars will need to be re-legalized.
1233                 if (NumPieces == 1 || NumPieces >= NumElts ||
1234                     NumElts % NumPieces != 0)
1235                   return std::make_pair(0, EltTy);
1236 
1237                 return std::make_pair(
1238                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239               }
1240 
1241               // FIXME: We could probably handle weird extending loads better.
1242               if (DstTy.getSizeInBits() > MemSize)
1243                 return std::make_pair(0, EltTy);
1244 
1245               unsigned EltSize = EltTy.getSizeInBits();
1246               unsigned DstSize = DstTy.getSizeInBits();
1247               if (!isPowerOf2_32(DstSize)) {
1248                 // We're probably decomposing an odd sized store. Try to split
1249                 // to the widest type. TODO: Account for alignment. As-is it
1250                 // should be OK, since the new parts will be further legalized.
1251                 unsigned FloorSize = PowerOf2Floor(DstSize);
1252                 return std::make_pair(
1253                     0, LLT::scalarOrVector(
1254                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255               }
1256 
1257               // Need to split because of alignment.
1258               unsigned Align = Query.MMODescrs[0].AlignInBits;
1259               if (EltSize > Align &&
1260                   (EltSize / Align < DstTy.getNumElements())) {
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1263               }
1264 
1265               // May need relegalization for the scalars.
1266               return std::make_pair(0, EltTy);
1267             })
1268     .minScalar(0, S32)
1269     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270     .widenScalarToNextPow2(0)
1271     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272     .lower();
1273   }
1274 
1275   // FIXME: Unaligned accesses not lowered.
1276   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278                                                   {S32, GlobalPtr, S16, 2 * 8},
1279                                                   {S32, LocalPtr, S8, 8},
1280                                                   {S32, LocalPtr, S16, 16},
1281                                                   {S32, PrivatePtr, S8, 8},
1282                                                   {S32, PrivatePtr, S16, 16},
1283                                                   {S32, ConstantPtr, S8, 8},
1284                                                   {S32, ConstantPtr, S16, 2 * 8}})
1285                        .legalIf(
1286                          [=](const LegalityQuery &Query) -> bool {
1287                            return isLoadStoreLegal(ST, Query);
1288                          });
1289 
1290   if (ST.hasFlatAddressSpace()) {
1291     ExtLoads.legalForTypesWithMemDesc(
1292         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293   }
1294 
1295   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296   // 64-bits.
1297   //
1298   // TODO: Should generalize bitcast action into coerce, which will also cover
1299   // inserting addrspacecasts.
1300   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301 
1302   ExtLoads.clampScalar(0, S32, S32)
1303           .widenScalarToNextPow2(0)
1304           .lower();
1305 
1306   auto &Atomics = getActionDefinitionsBuilder(
1307     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310      G_ATOMICRMW_UMIN})
1311     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312                {S64, GlobalPtr}, {S64, LocalPtr},
1313                {S32, RegionPtr}, {S64, RegionPtr}});
1314   if (ST.hasFlatAddressSpace()) {
1315     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316   }
1317 
1318   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319   if (ST.hasLDSFPAtomics()) {
1320     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321     if (ST.hasGFX90AInsts())
1322       Atomic.legalFor({{S64, LocalPtr}});
1323   }
1324   if (ST.hasAtomicFaddInsts())
1325     Atomic.legalFor({{S32, GlobalPtr}});
1326 
1327   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1328   // demarshalling
1329   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331                 {S32, FlatPtr}, {S64, FlatPtr}})
1332     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333                {S32, RegionPtr}, {S64, RegionPtr}});
1334   // TODO: Pointer types, any 32-bit or 64-bit vector
1335 
1336   // Condition should be s32 for scalar, s1 for vector.
1337   getActionDefinitionsBuilder(G_SELECT)
1338       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339                                  LocalPtr, FlatPtr, PrivatePtr,
1340                                  LLT::fixed_vector(2, LocalPtr),
1341                                  LLT::fixed_vector(2, PrivatePtr)},
1342                                 {S1, S32})
1343       .clampScalar(0, S16, S64)
1344       .scalarize(1)
1345       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347       .clampMaxNumElements(0, S32, 2)
1348       .clampMaxNumElements(0, LocalPtr, 2)
1349       .clampMaxNumElements(0, PrivatePtr, 2)
1350       .scalarize(0)
1351       .widenScalarToNextPow2(0)
1352       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353 
1354   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355   // be more flexible with the shift amount type.
1356   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357     .legalFor({{S32, S32}, {S64, S32}});
1358   if (ST.has16BitInsts()) {
1359     if (ST.hasVOP3PInsts()) {
1360       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361             .clampMaxNumElements(0, S16, 2);
1362     } else
1363       Shifts.legalFor({{S16, S16}});
1364 
1365     // TODO: Support 16-bit shift amounts for all types
1366     Shifts.widenScalarIf(
1367       [=](const LegalityQuery &Query) {
1368         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369         // 32-bit amount.
1370         const LLT ValTy = Query.Types[0];
1371         const LLT AmountTy = Query.Types[1];
1372         return ValTy.getSizeInBits() <= 16 &&
1373                AmountTy.getSizeInBits() < 16;
1374       }, changeTo(1, S16));
1375     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S16, S64);
1378     Shifts.widenScalarToNextPow2(0, 16);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S16)
1382       .scalarize(0)
1383       .lower();
1384   } else {
1385     // Make sure we legalize the shift amount type first, as the general
1386     // expansion for the shifted type will produce much worse code if it hasn't
1387     // been truncated already.
1388     Shifts.clampScalar(1, S32, S32);
1389     Shifts.clampScalar(0, S32, S64);
1390     Shifts.widenScalarToNextPow2(0, 32);
1391 
1392     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393       .minScalar(0, S32)
1394       .scalarize(0)
1395       .lower();
1396   }
1397   Shifts.scalarize(0);
1398 
1399   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402     unsigned IdxTypeIdx = 2;
1403 
1404     getActionDefinitionsBuilder(Op)
1405       .customIf([=](const LegalityQuery &Query) {
1406           const LLT EltTy = Query.Types[EltTypeIdx];
1407           const LLT VecTy = Query.Types[VecTypeIdx];
1408           const LLT IdxTy = Query.Types[IdxTypeIdx];
1409           const unsigned EltSize = EltTy.getSizeInBits();
1410           return (EltSize == 32 || EltSize == 64) &&
1411                   VecTy.getSizeInBits() % 32 == 0 &&
1412                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1413                   IdxTy.getSizeInBits() == 32;
1414         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1417       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418       .bitcastIf(
1419         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420         [=](const LegalityQuery &Query) {
1421           // For > 64-bit element types, try to turn this into a 64-bit
1422           // element vector since we may be able to do better indexing
1423           // if this is scalar. If not, fall back to 32.
1424           const LLT EltTy = Query.Types[EltTypeIdx];
1425           const LLT VecTy = Query.Types[VecTypeIdx];
1426           const unsigned DstEltSize = EltTy.getSizeInBits();
1427           const unsigned VecSize = VecTy.getSizeInBits();
1428 
1429           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430           return std::make_pair(
1431               VecTypeIdx,
1432               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433         })
1434       .clampScalar(EltTypeIdx, S32, S64)
1435       .clampScalar(VecTypeIdx, S32, S64)
1436       .clampScalar(IdxTypeIdx, S32, S32)
1437       .clampMaxNumElements(VecTypeIdx, S32, 32)
1438       // TODO: Clamp elements for 64-bit vectors?
1439       // It should only be necessary with variable indexes.
1440       // As a last resort, lower to the stack
1441       .lower();
1442   }
1443 
1444   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445     .unsupportedIf([=](const LegalityQuery &Query) {
1446         const LLT &EltTy = Query.Types[1].getElementType();
1447         return Query.Types[0] != EltTy;
1448       });
1449 
1450   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453 
1454     // FIXME: Doesn't handle extract of illegal sizes.
1455     getActionDefinitionsBuilder(Op)
1456       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457       // FIXME: Multiples of 16 should not be legal.
1458       .legalIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           const LLT LitTy = Query.Types[LitTyIdx];
1461           return (BigTy.getSizeInBits() % 32 == 0) &&
1462                  (LitTy.getSizeInBits() % 16 == 0);
1463         })
1464       .widenScalarIf(
1465         [=](const LegalityQuery &Query) {
1466           const LLT BigTy = Query.Types[BigTyIdx];
1467           return (BigTy.getScalarSizeInBits() < 16);
1468         },
1469         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470       .widenScalarIf(
1471         [=](const LegalityQuery &Query) {
1472           const LLT LitTy = Query.Types[LitTyIdx];
1473           return (LitTy.getScalarSizeInBits() < 16);
1474         },
1475         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477       .widenScalarToNextPow2(BigTyIdx, 32);
1478 
1479   }
1480 
1481   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482     .legalForCartesianProduct(AllS32Vectors, {S32})
1483     .legalForCartesianProduct(AllS64Vectors, {S64})
1484     .clampNumElements(0, V16S32, V32S32)
1485     .clampNumElements(0, V2S64, V16S64)
1486     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487 
1488   if (ST.hasScalarPackInsts()) {
1489     BuildVector
1490       // FIXME: Should probably widen s1 vectors straight to s32
1491       .minScalarOrElt(0, S16)
1492       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493       .minScalar(1, S32);
1494 
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .legalFor({V2S16, S32})
1497       .lower();
1498     BuildVector.minScalarOrElt(0, S32);
1499   } else {
1500     BuildVector.customFor({V2S16, S16});
1501     BuildVector.minScalarOrElt(0, S32);
1502 
1503     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504       .customFor({V2S16, S32})
1505       .lower();
1506   }
1507 
1508   BuildVector.legalIf(isRegisterType(0));
1509 
1510   // FIXME: Clamp maximum size
1511   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513     .clampMaxNumElements(0, S32, 32)
1514     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515     .clampMaxNumElements(0, S16, 64);
1516 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1519   if (ST.hasVOP3PInsts()) {
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521       .customFor({V2S16, V2S16})
1522       .lower();
1523   } else
1524     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525 
1526   // Merge/Unmerge
1527   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530 
1531     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532       const LLT Ty = Query.Types[TypeIdx];
1533       if (Ty.isVector()) {
1534         const LLT &EltTy = Ty.getElementType();
1535         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536           return true;
1537         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538           return true;
1539       }
1540       return false;
1541     };
1542 
1543     auto &Builder = getActionDefinitionsBuilder(Op)
1544       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545       .lowerFor({{S16, V2S16}})
1546       .lowerIf([=](const LegalityQuery &Query) {
1547           const LLT BigTy = Query.Types[BigTyIdx];
1548           return BigTy.getSizeInBits() == 32;
1549         })
1550       // Try to widen to s16 first for small types.
1551       // TODO: Only do this on targets with legal s16 shifts
1552       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556                            elementTypeIs(1, S16)),
1557                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1561       .clampScalar(LitTyIdx, S32, S512)
1562       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563       // Break up vectors with weird elements into scalars
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566         scalarize(0))
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569         scalarize(1))
1570       .clampScalar(BigTyIdx, S32, MaxScalar);
1571 
1572     if (Op == G_MERGE_VALUES) {
1573       Builder.widenScalarIf(
1574         // TODO: Use 16-bit shifts if legal for 8-bit values?
1575         [=](const LegalityQuery &Query) {
1576           const LLT Ty = Query.Types[LitTyIdx];
1577           return Ty.getSizeInBits() < 32;
1578         },
1579         changeTo(LitTyIdx, S32));
1580     }
1581 
1582     Builder.widenScalarIf(
1583       [=](const LegalityQuery &Query) {
1584         const LLT Ty = Query.Types[BigTyIdx];
1585         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586           Ty.getSizeInBits() % 16 != 0;
1587       },
1588       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
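        // For example, s65 widens to s128, while s257 widens to s320 rather
        // than s512.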
1591         const LLT &Ty = Query.Types[BigTyIdx];
1592         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593         if (NewSizeInBits >= 256) {
1594           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595           if (RoundedTo < NewSizeInBits)
1596             NewSizeInBits = RoundedTo;
1597         }
1598         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599       })
1600       // Any vectors left are the wrong size. Scalarize them.
1601       .scalarize(0)
1602       .scalarize(1);
1603   }
1604 
1605   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606   // RegBankSelect.
1607   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608     .legalFor({{S32}, {S64}});
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     SextInReg.lowerFor({{V2S16}})
1612       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613       // get more vector shift opportunities, since we'll get those when
1614       // expanded.
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616   } else if (ST.has16BitInsts()) {
1617     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1621     SextInReg.lowerFor({{S32}, {S64}});
1622   }
1623 
1624   SextInReg
1625     .scalarize(0)
1626     .clampScalar(0, S32, S64)
1627     .lower();
1628 
  // TODO: Only try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634     .scalarize(0)
1635     .lower();
1636 
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648 
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651 
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654 
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659 
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666 
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670 
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676 
1677       G_SADDO, G_SSUBO,
1678 
      // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681 
1682   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685     .unsupported();
1686 
1687   getLegacyLegalizerInfo().computeTables();
1688   verify(*ST.getInstrInfo());
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692                                          MachineInstr &MI) const {
1693   MachineIRBuilder &B = Helper.MIRBuilder;
1694   MachineRegisterInfo &MRI = *B.getMRI();
1695 
1696   switch (MI.getOpcode()) {
1697   case TargetOpcode::G_ADDRSPACE_CAST:
1698     return legalizeAddrSpaceCast(MI, MRI, B);
1699   case TargetOpcode::G_FRINT:
1700     return legalizeFrint(MI, MRI, B);
1701   case TargetOpcode::G_FCEIL:
1702     return legalizeFceil(MI, MRI, B);
1703   case TargetOpcode::G_FREM:
1704     return legalizeFrem(MI, MRI, B);
1705   case TargetOpcode::G_INTRINSIC_TRUNC:
1706     return legalizeIntrinsicTrunc(MI, MRI, B);
1707   case TargetOpcode::G_SITOFP:
1708     return legalizeITOFP(MI, MRI, B, true);
1709   case TargetOpcode::G_UITOFP:
1710     return legalizeITOFP(MI, MRI, B, false);
1711   case TargetOpcode::G_FPTOSI:
1712     return legalizeFPTOI(MI, MRI, B, true);
1713   case TargetOpcode::G_FPTOUI:
1714     return legalizeFPTOI(MI, MRI, B, false);
1715   case TargetOpcode::G_FMINNUM:
1716   case TargetOpcode::G_FMAXNUM:
1717   case TargetOpcode::G_FMINNUM_IEEE:
1718   case TargetOpcode::G_FMAXNUM_IEEE:
1719     return legalizeMinNumMaxNum(Helper, MI);
1720   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721     return legalizeExtractVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_INSERT_VECTOR_ELT:
1723     return legalizeInsertVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_SHUFFLE_VECTOR:
1725     return legalizeShuffleVector(MI, MRI, B);
1726   case TargetOpcode::G_FSIN:
1727   case TargetOpcode::G_FCOS:
1728     return legalizeSinCos(MI, MRI, B);
1729   case TargetOpcode::G_GLOBAL_VALUE:
1730     return legalizeGlobalValue(MI, MRI, B);
1731   case TargetOpcode::G_LOAD:
1732   case TargetOpcode::G_SEXTLOAD:
1733   case TargetOpcode::G_ZEXTLOAD:
1734     return legalizeLoad(Helper, MI);
1735   case TargetOpcode::G_FMAD:
1736     return legalizeFMad(MI, MRI, B);
1737   case TargetOpcode::G_FDIV:
1738     return legalizeFDIV(MI, MRI, B);
1739   case TargetOpcode::G_UDIV:
1740   case TargetOpcode::G_UREM:
1741   case TargetOpcode::G_UDIVREM:
1742     return legalizeUnsignedDIV_REM(MI, MRI, B);
1743   case TargetOpcode::G_SDIV:
1744   case TargetOpcode::G_SREM:
1745   case TargetOpcode::G_SDIVREM:
1746     return legalizeSignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_ATOMIC_CMPXCHG:
1748     return legalizeAtomicCmpXChg(MI, MRI, B);
1749   case TargetOpcode::G_FLOG:
1750     return legalizeFlog(MI, B, numbers::ln2f);
1751   case TargetOpcode::G_FLOG10:
1752     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753   case TargetOpcode::G_FEXP:
1754     return legalizeFExp(MI, B);
1755   case TargetOpcode::G_FPOW:
1756     return legalizeFPow(MI, B);
1757   case TargetOpcode::G_FFLOOR:
1758     return legalizeFFloor(MI, MRI, B);
1759   case TargetOpcode::G_BUILD_VECTOR:
1760     return legalizeBuildVector(MI, MRI, B);
1761   case TargetOpcode::G_CTLZ:
1762   case TargetOpcode::G_CTTZ:
1763     return legalizeCTLZ_CTTZ(MI, MRI, B);
1764   default:
1765     return false;
1766   }
1767 
1768   llvm_unreachable("expected switch to return");
1769 }
1770 
1771 Register AMDGPULegalizerInfo::getSegmentAperture(
1772   unsigned AS,
1773   MachineRegisterInfo &MRI,
1774   MachineIRBuilder &B) const {
1775   MachineFunction &MF = B.getMF();
1776   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1777   const LLT S32 = LLT::scalar(32);
1778 
1779   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1780 
1781   if (ST.hasApertureRegs()) {
1782     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1783     // getreg.
1784     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1786         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1787     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1789         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1790     unsigned Encoding =
1791         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1792         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1793         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1794 
1795     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1796 
1797     B.buildInstr(AMDGPU::S_GETREG_B32)
1798       .addDef(GetReg)
1799       .addImm(Encoding);
1800     MRI.setType(GetReg, S32);
1801 
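    // The aperture is the high half of the 64-bit segment base address. The
    // getreg above returns a 16-bit field of it, so shifting left by
    // WidthM1 + 1 (i.e. 16) rebuilds the 32-bit aperture value; the discarded
    // low bits are assumed to be zero because the base is sufficiently
    // aligned.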
1802     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1803     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1804   }
1805 
1806   Register QueuePtr = MRI.createGenericVirtualRegister(
1807     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1808 
1809   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1810     return Register();
1811 
1812   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1813   // private_segment_aperture_base_hi.
1814   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1815 
1816   // TODO: can we be smarter about machine pointer info?
1817   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1818   MachineMemOperand *MMO = MF.getMachineMemOperand(
1819       PtrInfo,
1820       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1821           MachineMemOperand::MOInvariant,
1822       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1823 
1824   Register LoadAddr;
1825 
1826   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1827   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   MachineFunction &MF = B.getMF();
1834 
1835   const LLT S32 = LLT::scalar(32);
1836   Register Dst = MI.getOperand(0).getReg();
1837   Register Src = MI.getOperand(1).getReg();
1838 
1839   LLT DstTy = MRI.getType(Dst);
1840   LLT SrcTy = MRI.getType(Src);
1841   unsigned DestAS = DstTy.getAddressSpace();
1842   unsigned SrcAS = SrcTy.getAddressSpace();
1843 
1844   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1845   // vector element.
1846   assert(!DstTy.isVector());
1847 
1848   const AMDGPUTargetMachine &TM
1849     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1850 
1851   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1852     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1853     return true;
1854   }
1855 
1856   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1857     // Truncate.
1858     B.buildExtract(Dst, Src, 0);
1859     MI.eraseFromParent();
1860     return true;
1861   }
1862 
1863   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1865     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1866 
    // FIXME: This is a bit ugly because it merges 2 pointers to produce
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1870     auto HighAddr = B.buildConstant(
1871       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1872     B.buildMerge(Dst, {Src, HighAddr});
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1878     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1879            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1880     unsigned NullVal = TM.getNullPointerValue(DestAS);
1881 
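    // A flat pointer is turned into a segment pointer by taking its low 32
    // bits, but the flat null pointer (0) must map to the segment's null
    // value, hence the compare and select below.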
1882     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1883     auto FlatNull = B.buildConstant(SrcTy, 0);
1884 
1885     // Extract low 32-bits of the pointer.
1886     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1887 
1888     auto CmpRes =
1889         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1890     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1891 
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895 
1896   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1897     return false;
1898 
1899   if (!ST.hasFlatAddressSpace())
1900     return false;
1901 
1902   auto SegmentNull =
1903       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1904   auto FlatNull =
1905       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1906 
1907   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908   if (!ApertureReg.isValid())
1909     return false;
1910 
1911   auto CmpRes =
1912       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1913 
1914   // Coerce the type of the low half of the result so we can use merge_values.
1915   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1916 
1917   // TODO: Should we allow mismatched types but matching sizes in merges to
1918   // avoid the ptrtoint?
1919   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1920   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1921 
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeFrint(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register Src = MI.getOperand(1).getReg();
1930   LLT Ty = MRI.getType(Src);
1931   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1932 
1933   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
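  // Adding a copysign'd 2^52 and subtracting it again rounds the value to an
  // integer, since doubles of magnitude >= 2^52 have no fraction bits. Inputs
  // with |x| > 0x1.fffffffffffffp+51 are already integral (or inf/nan), so the
  // original value is selected for those below.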
1935 
1936   auto C1 = B.buildFConstant(Ty, C1Val);
1937   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1938 
1939   // TODO: Should this propagate fast-math-flags?
1940   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1942 
1943   auto C2 = B.buildFConstant(Ty, C2Val);
1944   auto Fabs = B.buildFAbs(Ty, Src);
1945 
1946   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948   MI.eraseFromParent();
1949   return true;
1950 }
1951 
1952 bool AMDGPULegalizerInfo::legalizeFceil(
1953   MachineInstr &MI, MachineRegisterInfo &MRI,
1954   MachineIRBuilder &B) const {
1955 
1956   const LLT S1 = LLT::scalar(1);
1957   const LLT S64 = LLT::scalar(64);
1958 
1959   Register Src = MI.getOperand(1).getReg();
1960   assert(MRI.getType(Src) == S64);
1961 
1962   // result = trunc(src)
1963   // if (src > 0.0 && src != result)
1964   //   result += 1.0
1965 
1966   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1967 
1968   const auto Zero = B.buildFConstant(S64, 0.0);
1969   const auto One = B.buildFConstant(S64, 1.0);
1970   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1973   auto Add = B.buildSelect(S64, And, One, Zero);
1974 
1975   // TODO: Should this propagate fast-math-flags?
1976   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeFrem(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   MachineIRBuilder &B) const {
1983     Register DstReg = MI.getOperand(0).getReg();
1984     Register Src0Reg = MI.getOperand(1).getReg();
1985     Register Src1Reg = MI.getOperand(2).getReg();
1986     auto Flags = MI.getFlags();
1987     LLT Ty = MRI.getType(DstReg);
1988 
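    // frem(x, y) == x - trunc(x / y) * y, computed here as
    // fma(-trunc(x / y), y, x).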
1989     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993     MI.eraseFromParent();
1994     return true;
1995 }
1996 
1997 static MachineInstrBuilder extractF64Exponent(Register Hi,
1998                                               MachineIRBuilder &B) {
1999   const unsigned FractBits = 52;
2000   const unsigned ExpBits = 11;
2001   LLT S32 = LLT::scalar(32);
2002 
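  // An IEEE double is laid out as [63] sign, [62:52] biased exponent and
  // [51:0] fraction, so the exponent field starts at bit 52 - 32 == 20 of the
  // high 32-bit word. The bias of 1023 is subtracted to get the unbiased
  // exponent.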
2003   auto Const0 = B.buildConstant(S32, FractBits - 32);
2004   auto Const1 = B.buildConstant(S32, ExpBits);
2005 
2006   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2007     .addUse(Hi)
2008     .addUse(Const0.getReg(0))
2009     .addUse(Const1.getReg(0));
2010 
2011   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   const LLT S1 = LLT::scalar(1);
2018   const LLT S32 = LLT::scalar(32);
2019   const LLT S64 = LLT::scalar(64);
2020 
2021   Register Src = MI.getOperand(1).getReg();
2022   assert(MRI.getType(Src) == S64);
2023 
2024   // TODO: Should this use extract since the low half is unused?
2025   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2026   Register Hi = Unmerge.getReg(1);
2027 
2028   // Extract the upper half, since this is where we will find the sign and
2029   // exponent.
2030   auto Exp = extractF64Exponent(Hi, B);
2031 
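  // Three cases based on the unbiased exponent:
  //   Exp < 0        -> |Src| < 1, so the result is just the signed zero.
  //   0 <= Exp <= 51 -> clear the low (52 - Exp) fraction bits of Src.
  //   Exp > 51       -> Src is already integral (or inf/nan); keep it as-is.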
2032   const unsigned FractBits = 52;
2033 
2034   // Extract the sign bit.
2035   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2036   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2037 
2038   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2039 
2040   const auto Zero32 = B.buildConstant(S32, 0);
2041 
2042   // Extend back to 64-bits.
2043   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2044 
2045   auto Shr = B.buildAShr(S64, FractMask, Exp);
2046   auto Not = B.buildNot(S64, Shr);
2047   auto Tmp0 = B.buildAnd(S64, Src, Not);
2048   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2049 
2050   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2051   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2052 
2053   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2054   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2055   MI.eraseFromParent();
2056   return true;
2057 }
2058 
2059 bool AMDGPULegalizerInfo::legalizeITOFP(
2060   MachineInstr &MI, MachineRegisterInfo &MRI,
2061   MachineIRBuilder &B, bool Signed) const {
2062 
2063   Register Dst = MI.getOperand(0).getReg();
2064   Register Src = MI.getOperand(1).getReg();
2065 
2066   const LLT S64 = LLT::scalar(64);
2067   const LLT S32 = LLT::scalar(32);
2068   const LLT S1 = LLT::scalar(1);
2069 
2070   assert(MRI.getType(Src) == S64);
2071 
2072   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2073   auto ThirtyTwo = B.buildConstant(S32, 32);
2074 
2075   if (MRI.getType(Dst) == S64) {
2076     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2077                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2078 
2079     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2080     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2081                      .addUse(CvtHi.getReg(0))
2082                      .addUse(ThirtyTwo.getReg(0));
2083 
2084     // TODO: Should this propagate fast-math-flags?
2085     B.buildFAdd(Dst, LdExp, CvtLo);
2086     MI.eraseFromParent();
2087     return true;
2088   }
2089 
2090   assert(MRI.getType(Dst) == S32);
2091 
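  // Converting a 64-bit integer to f32: shift the value left so its
  // significant bits land in the high word, fold the discarded low word into a
  // sticky bit so rounding stays correct, convert the adjusted high word, and
  // rescale the result with ldexp by (32 - shift amount).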
2092   auto Zero = B.buildConstant(S32, 0);
2093   auto One = B.buildConstant(S32, 1);
2094   auto AllOnes = B.buildConstant(S32, -1);
2095 
2096   MachineInstrBuilder ShAmt;
2097   if (Signed) {
2098     auto ThirtyThree = B.buildConstant(S32, 33);
2099     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2100     auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero);
2101     auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo);
2102     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2103                                /*HasSideEffects=*/false)
2104                   .addUse(Unmerge.getReg(1));
2105     auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes);
2106     auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt);
2107     ShAmt = B.buildSub(S32, LS2, One);
2108   } else
2109     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2110   auto Norm = B.buildShl(S64, Src, ShAmt);
2111   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2112   auto NotAllZeros =
2113       B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero);
2114   auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero);
2115   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2116   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2117   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2118   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2119                    /*HasSideEffects=*/false)
2120       .addUse(FVal.getReg(0))
2121       .addUse(Scale.getReg(0));
2122   MI.eraseFromParent();
2123   return true;
2124 }
2125 
2126 // TODO: Copied from DAG implementation. Verify logic and document how this
2127 // actually works.
2128 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2129                                         MachineRegisterInfo &MRI,
2130                                         MachineIRBuilder &B,
2131                                         bool Signed) const {
2132 
2133   Register Dst = MI.getOperand(0).getReg();
2134   Register Src = MI.getOperand(1).getReg();
2135 
2136   const LLT S64 = LLT::scalar(64);
2137   const LLT S32 = LLT::scalar(32);
2138 
2139   const LLT SrcLT = MRI.getType(Src);
2140   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2141 
2142   unsigned Flags = MI.getFlags();
2143 
2144   // The basic idea of converting a floating point number into a pair of 32-bit
2145   // integers is illustrated as follows:
2146   //
2147   //     tf := trunc(val);
2148   //    hif := floor(tf * 2^-32);
2149   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2150   //     hi := fptoi(hif);
2151   //     lo := fptoi(lof);
2152   //
2153   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2154   MachineInstrBuilder Sign;
2155   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2161     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2162     Trunc = B.buildFAbs(S32, Trunc, Flags);
2163   }
2164   MachineInstrBuilder K0, K1;
2165   if (SrcLT == S64) {
2166     K0 = B.buildFConstant(S64,
2167                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2168     K1 = B.buildFConstant(S64,
2169                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2170   } else {
2171     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2172     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2173   }
2174 
2175   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2176   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2177   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2178 
2179   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2180                                      : B.buildFPTOUI(S32, FloorMul);
2181   auto Lo = B.buildFPTOUI(S32, Fma);
2182 
2183   if (Signed && SrcLT == S32) {
2184     // Flip the result based on the signedness, which is either all 0s or 1s.
2185     Sign = B.buildMerge(S64, {Sign, Sign});
2186     // r := xor({lo, hi}, sign) - sign;
2187     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2188   } else
2189     B.buildMerge(Dst, {Lo, Hi});
2190   MI.eraseFromParent();
2191 
2192   return true;
2193 }
2194 
2195 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2196                                                MachineInstr &MI) const {
2197   MachineFunction &MF = Helper.MIRBuilder.getMF();
2198   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2199 
2200   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2201                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2202 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
2205   if (!MFI->getMode().IEEE)
2206     return !IsIEEEOp;
2207 
2208   if (IsIEEEOp)
2209     return true;
2210 
2211   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2212 }
2213 
2214 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2215   MachineInstr &MI, MachineRegisterInfo &MRI,
2216   MachineIRBuilder &B) const {
2217   // TODO: Should move some of this into LegalizerHelper.
2218 
2219   // TODO: Promote dynamic indexing of s16 to s32
2220 
2221   // FIXME: Artifact combiner probably should have replaced the truncated
2222   // constant before this, so we shouldn't need
2223   // getConstantVRegValWithLookThrough.
2224   Optional<ValueAndVReg> MaybeIdxVal =
2225       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2226   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2227     return true;
2228   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2229 
2230   Register Dst = MI.getOperand(0).getReg();
2231   Register Vec = MI.getOperand(1).getReg();
2232 
2233   LLT VecTy = MRI.getType(Vec);
2234   LLT EltTy = VecTy.getElementType();
2235   assert(EltTy == MRI.getType(Dst));
2236 
2237   if (IdxVal < VecTy.getNumElements())
2238     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2239   else
2240     B.buildUndef(Dst);
2241 
2242   MI.eraseFromParent();
2243   return true;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2247   MachineInstr &MI, MachineRegisterInfo &MRI,
2248   MachineIRBuilder &B) const {
2249   // TODO: Should move some of this into LegalizerHelper.
2250 
2251   // TODO: Promote dynamic indexing of s16 to s32
2252 
2253   // FIXME: Artifact combiner probably should have replaced the truncated
2254   // constant before this, so we shouldn't need
2255   // getConstantVRegValWithLookThrough.
2256   Optional<ValueAndVReg> MaybeIdxVal =
2257       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2258   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2259     return true;
2260 
2261   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2262   Register Dst = MI.getOperand(0).getReg();
2263   Register Vec = MI.getOperand(1).getReg();
2264   Register Ins = MI.getOperand(2).getReg();
2265 
2266   LLT VecTy = MRI.getType(Vec);
2267   LLT EltTy = VecTy.getElementType();
2268   assert(EltTy == MRI.getType(Ins));
2269 
2270   if (IdxVal < VecTy.getNumElements())
2271     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2272   else
2273     B.buildUndef(Dst);
2274 
2275   MI.eraseFromParent();
2276   return true;
2277 }
2278 
2279 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2280   MachineInstr &MI, MachineRegisterInfo &MRI,
2281   MachineIRBuilder &B) const {
2282   const LLT V2S16 = LLT::fixed_vector(2, 16);
2283 
2284   Register Dst = MI.getOperand(0).getReg();
2285   Register Src0 = MI.getOperand(1).getReg();
2286   LLT DstTy = MRI.getType(Dst);
2287   LLT SrcTy = MRI.getType(Src0);
2288 
2289   if (SrcTy == V2S16 && DstTy == V2S16 &&
2290       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2291     return true;
2292 
2293   MachineIRBuilder HelperBuilder(MI);
2294   GISelObserverWrapper DummyObserver;
2295   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2296   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2297 }
2298 
2299 bool AMDGPULegalizerInfo::legalizeSinCos(
2300   MachineInstr &MI, MachineRegisterInfo &MRI,
2301   MachineIRBuilder &B) const {
2302 
2303   Register DstReg = MI.getOperand(0).getReg();
2304   Register SrcReg = MI.getOperand(1).getReg();
2305   LLT Ty = MRI.getType(DstReg);
2306   unsigned Flags = MI.getFlags();
2307 
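  // The hardware sin/cos expect their input scaled so that 1.0 corresponds to
  // a full period, so multiply by 1/(2*pi) first. On subtargets with a reduced
  // valid input range, the scaled value is additionally wrapped into [0, 1)
  // with fract before the trig intrinsic.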
2308   Register TrigVal;
2309   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2310   if (ST.hasTrigReducedRange()) {
2311     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2312     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2313       .addUse(MulVal.getReg(0))
2314       .setMIFlags(Flags).getReg(0);
2315   } else
2316     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2317 
2318   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2319     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2320   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2321     .addUse(TrigVal)
2322     .setMIFlags(Flags);
2323   MI.eraseFromParent();
2324   return true;
2325 }
2326 
2327 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2328                                                   MachineIRBuilder &B,
2329                                                   const GlobalValue *GV,
2330                                                   int64_t Offset,
2331                                                   unsigned GAFlags) const {
2332   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2333   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2334   // to the following code sequence:
2335   //
2336   // For constant address space:
2337   //   s_getpc_b64 s[0:1]
2338   //   s_add_u32 s0, s0, $symbol
2339   //   s_addc_u32 s1, s1, 0
2340   //
2341   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2342   //   a fixup or relocation is emitted to replace $symbol with a literal
2343   //   constant, which is a pc-relative offset from the encoding of the $symbol
2344   //   operand to the global variable.
2345   //
2346   // For global address space:
2347   //   s_getpc_b64 s[0:1]
2348   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2349   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2350   //
2351   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2352   //   fixups or relocations are emitted to replace $symbol@*@lo and
2353   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2354   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2355   //   operand to the global variable.
2356   //
2357   // What we want here is an offset from the value returned by s_getpc
2358   // (which is the address of the s_add_u32 instruction) to the global
2359   // variable, but since the encoding of $symbol starts 4 bytes after the start
2360   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2361   // small. This requires us to add 4 to the global variable offset in order to
2362   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2363   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2364   // instruction.
2365 
2366   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2367 
2368   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2369     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2370 
2371   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2372     .addDef(PCReg);
2373 
2374   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2375   if (GAFlags == SIInstrInfo::MO_NONE)
2376     MIB.addImm(0);
2377   else
2378     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2379 
2380   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2381 
2382   if (PtrTy.getSizeInBits() == 32)
2383     B.buildExtract(DstReg, PCReg, 0);
2384   return true;
}
2386 
2387 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2388   MachineInstr &MI, MachineRegisterInfo &MRI,
2389   MachineIRBuilder &B) const {
2390   Register DstReg = MI.getOperand(0).getReg();
2391   LLT Ty = MRI.getType(DstReg);
2392   unsigned AS = Ty.getAddressSpace();
2393 
2394   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2395   MachineFunction &MF = B.getMF();
2396   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2397 
2398   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2399     if (!MFI->isModuleEntryFunction() &&
2400         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2401       const Function &Fn = MF.getFunction();
2402       DiagnosticInfoUnsupported BadLDSDecl(
2403         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2404         DS_Warning);
2405       Fn.getContext().diagnose(BadLDSDecl);
2406 
2407       // We currently don't have a way to correctly allocate LDS objects that
2408       // aren't directly associated with a kernel. We do force inlining of
2409       // functions that use local objects. However, if these dead functions are
2410       // not eliminated, we don't want a compile time error. Just emit a warning
2411       // and a trap, since there should be no callable path here.
2412       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2413       B.buildUndef(DstReg);
2414       MI.eraseFromParent();
2415       return true;
2416     }
2417 
2418     // TODO: We could emit code to handle the initialization somewhere.
2419     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2420       const SITargetLowering *TLI = ST.getTargetLowering();
2421       if (!TLI->shouldUseLDSConstAddress(GV)) {
2422         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2424       }
2425 
2426       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2427         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated ones, so
        // all such variables share the same offset.
2433         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2434           // Adjust alignment for that dynamic shared memory array.
2435           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2436           LLT S32 = LLT::scalar(32);
2437           auto Sz =
2438               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2439           B.buildIntToPtr(DstReg, Sz);
2440           MI.eraseFromParent();
2441           return true;
2442         }
2443       }
2444 
2445       B.buildConstant(
2446           DstReg,
2447           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2448       MI.eraseFromParent();
2449       return true;
2450     }
2451 
2452     const Function &Fn = MF.getFunction();
2453     DiagnosticInfoUnsupported BadInit(
2454       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2455     Fn.getContext().diagnose(BadInit);
2456     return true;
2457   }
2458 
2459   const SITargetLowering *TLI = ST.getTargetLowering();
2460 
2461   if (TLI->shouldEmitFixup(GV)) {
2462     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2463     MI.eraseFromParent();
2464     return true;
2465   }
2466 
2467   if (TLI->shouldEmitPCReloc(GV)) {
2468     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2469     MI.eraseFromParent();
2470     return true;
2471   }
2472 
2473   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2474   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2475 
2476   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2477   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2478       MachinePointerInfo::getGOT(MF),
2479       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2480           MachineMemOperand::MOInvariant,
2481       LoadTy, Align(8));
2482 
2483   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2484 
2485   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2487     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2488     B.buildExtract(DstReg, Load, 0);
2489   } else
2490     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2491 
2492   MI.eraseFromParent();
2493   return true;
2494 }
2495 
2496 static LLT widenToNextPowerOf2(LLT Ty) {
2497   if (Ty.isVector())
2498     return Ty.changeElementCount(
2499         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2500   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2501 }
2502 
2503 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2504                                        MachineInstr &MI) const {
2505   MachineIRBuilder &B = Helper.MIRBuilder;
2506   MachineRegisterInfo &MRI = *B.getMRI();
2507   GISelChangeObserver &Observer = Helper.Observer;
2508 
2509   Register PtrReg = MI.getOperand(1).getReg();
2510   LLT PtrTy = MRI.getType(PtrReg);
2511   unsigned AddrSpace = PtrTy.getAddressSpace();
2512 
2513   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2514     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2515     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2516     Observer.changingInstr(MI);
2517     MI.getOperand(1).setReg(Cast.getReg(0));
2518     Observer.changedInstr(MI);
2519     return true;
2520   }
2521 
2522   if (MI.getOpcode() != AMDGPU::G_LOAD)
2523     return false;
2524 
2525   Register ValReg = MI.getOperand(0).getReg();
2526   LLT ValTy = MRI.getType(ValReg);
2527 
2528   MachineMemOperand *MMO = *MI.memoperands_begin();
2529   const unsigned ValSize = ValTy.getSizeInBits();
2530   const LLT MemTy = MMO->getMemoryType();
2531   const Align MemAlign = MMO->getAlign();
2532   const unsigned MemSize = MemTy.getSizeInBits();
2533   const unsigned AlignInBits = 8 * MemAlign.value();
2534 
2535   // Widen non-power-of-2 loads to the alignment if needed
2536   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2537     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2538 
2539     // This was already the correct extending load result type, so just adjust
2540     // the memory type.
2541     if (WideMemSize == ValSize) {
2542       MachineFunction &MF = B.getMF();
2543 
2544       MachineMemOperand *WideMMO =
2545           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2546       Observer.changingInstr(MI);
2547       MI.setMemRefs(MF, {WideMMO});
2548       Observer.changedInstr(MI);
2549       return true;
2550     }
2551 
    // Don't bother handling an edge case that should probably never be
    // produced.
2553     if (ValSize > WideMemSize)
2554       return false;
2555 
2556     LLT WideTy = widenToNextPowerOf2(ValTy);
2557 
2558     Register WideLoad;
2559     if (!WideTy.isVector()) {
2560       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2561       B.buildTrunc(ValReg, WideLoad).getReg(0);
2562     } else {
2563       // Extract the subvector.
2564 
2565       if (isRegisterType(ValTy)) {
2566         // If this a case where G_EXTRACT is legal, use it.
2567         // (e.g. <3 x s32> -> <4 x s32>)
2568         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2569         B.buildExtract(ValReg, WideLoad, 0);
2570       } else {
2571         // For cases where the widened type isn't a nice register value, unmerge
2572         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2573         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2574         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2575         B.setInsertPt(B.getMBB(), MI.getIterator());
2576         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2577       }
2578     }
2579 
2580     MI.eraseFromParent();
2581     return true;
2582   }
2583 
2584   return false;
2585 }
2586 
2587 bool AMDGPULegalizerInfo::legalizeFMad(
2588   MachineInstr &MI, MachineRegisterInfo &MRI,
2589   MachineIRBuilder &B) const {
2590   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2591   assert(Ty.isScalar());
2592 
2593   MachineFunction &MF = B.getMF();
2594   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2595 
2596   // TODO: Always legal with future ftz flag.
2597   // FIXME: Do we need just output?
2598   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2599     return true;
2600   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2601     return true;
2602 
2603   MachineIRBuilder HelperBuilder(MI);
2604   GISelObserverWrapper DummyObserver;
2605   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2606   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2607 }
2608 
2609 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2610   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2611   Register DstReg = MI.getOperand(0).getReg();
2612   Register PtrReg = MI.getOperand(1).getReg();
2613   Register CmpVal = MI.getOperand(2).getReg();
2614   Register NewVal = MI.getOperand(3).getReg();
2615 
2616   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2617          "this should not have been custom lowered");
2618 
2619   LLT ValTy = MRI.getType(CmpVal);
2620   LLT VecTy = LLT::fixed_vector(2, ValTy);
2621 
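  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and compare value packed into
  // a two-element vector, with the new value in element 0 and the compare
  // value in element 1 (presumably mirroring the hardware cmpswap data/cmp
  // operand order).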
2622   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2623 
2624   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2625     .addDef(DstReg)
2626     .addUse(PtrReg)
2627     .addUse(PackedVal)
2628     .setMemRefs(MI.memoperands());
2629 
2630   MI.eraseFromParent();
2631   return true;
2632 }
2633 
2634 bool AMDGPULegalizerInfo::legalizeFlog(
2635   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2636   Register Dst = MI.getOperand(0).getReg();
2637   Register Src = MI.getOperand(1).getReg();
2638   LLT Ty = B.getMRI()->getType(Dst);
2639   unsigned Flags = MI.getFlags();
2640 
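  // log_b(x) == log2(x) * (1 / log2(b)) == log2(x) * (ln(2) / ln(b)), so the
  // callers pass ln2f for G_FLOG and ln2f / ln10f for G_FLOG10 as
  // Log2BaseInverted.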
2641   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2642   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2643 
2644   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2645   MI.eraseFromParent();
2646   return true;
2647 }
2648 
2649 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2650                                        MachineIRBuilder &B) const {
2651   Register Dst = MI.getOperand(0).getReg();
2652   Register Src = MI.getOperand(1).getReg();
2653   unsigned Flags = MI.getFlags();
2654   LLT Ty = B.getMRI()->getType(Dst);
2655 
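  // exp(x) == 2^(x * log2(e)), so scale the input by log2(e) and emit exp2.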
2656   auto K = B.buildFConstant(Ty, numbers::log2e);
2657   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2658   B.buildFExp2(Dst, Mul, Flags);
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2664                                        MachineIRBuilder &B) const {
2665   Register Dst = MI.getOperand(0).getReg();
2666   Register Src0 = MI.getOperand(1).getReg();
2667   Register Src1 = MI.getOperand(2).getReg();
2668   unsigned Flags = MI.getFlags();
2669   LLT Ty = B.getMRI()->getType(Dst);
2670   const LLT S16 = LLT::scalar(16);
2671   const LLT S32 = LLT::scalar(32);
2672 
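  // pow(x, y) == 2^(y * log2(x)). The multiply uses the fmul_legacy intrinsic
  // (where 0 * anything == 0), presumably to match the DAG lowering's handling
  // of special inputs.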
2673   if (Ty == S32) {
2674     auto Log = B.buildFLog2(S32, Src0, Flags);
2675     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2676       .addUse(Log.getReg(0))
2677       .addUse(Src1)
2678       .setMIFlags(Flags);
2679     B.buildFExp2(Dst, Mul, Flags);
2680   } else if (Ty == S16) {
2681     // There's no f16 fmul_legacy, so we need to convert for it.
2682     auto Log = B.buildFLog2(S16, Src0, Flags);
2683     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2684     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2685     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2686       .addUse(Ext0.getReg(0))
2687       .addUse(Ext1.getReg(0))
2688       .setMIFlags(Flags);
2689 
2690     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2691   } else
2692     return false;
2693 
2694   MI.eraseFromParent();
2695   return true;
2696 }
2697 
2698 // Find a source register, ignoring any possible source modifiers.
2699 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2700   Register ModSrc = OrigSrc;
2701   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2702     ModSrc = SrcFNeg->getOperand(1).getReg();
2703     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2704       ModSrc = SrcFAbs->getOperand(1).getReg();
2705   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2706     ModSrc = SrcFAbs->getOperand(1).getReg();
2707   return ModSrc;
2708 }
2709 
2710 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2711                                          MachineRegisterInfo &MRI,
2712                                          MachineIRBuilder &B) const {
2713 
2714   const LLT S1 = LLT::scalar(1);
2715   const LLT S64 = LLT::scalar(64);
2716   Register Dst = MI.getOperand(0).getReg();
2717   Register OrigSrc = MI.getOperand(1).getReg();
2718   unsigned Flags = MI.getFlags();
2719   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2720          "this should not have been custom lowered");
2721 
2722   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2723   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2724   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2725   // V_FRACT bug is:
2726   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2727   //
2728   // Convert floor(x) to (x - fract(x))
2729 
2730   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2731     .addUse(OrigSrc)
2732     .setMIFlags(Flags);
2733 
2734   // Give source modifier matching some assistance before obscuring a foldable
2735   // pattern.
2736 
2737   // TODO: We can avoid the neg on the fract? The input sign to fract
2738   // shouldn't matter?
2739   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2740 
2741   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2742 
2743   Register Min = MRI.createGenericVirtualRegister(S64);
2744 
2745   // We don't need to concern ourselves with the snan handling difference, so
2746   // use the one which will directly select.
2747   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2748   if (MFI->getMode().IEEE)
2749     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2750   else
2751     B.buildFMinNum(Min, Fract, Const, Flags);
2752 
2753   Register CorrectedFract = Min;
2754   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2755     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2756     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2757   }
2758 
2759   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2760   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2761 
2762   MI.eraseFromParent();
2763   return true;
2764 }
2765 
2766 // Turn an illegal packed v2s16 build vector into bit operations.
2767 // TODO: This should probably be a bitcast action in LegalizerHelper.
2768 bool AMDGPULegalizerInfo::legalizeBuildVector(
2769   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2770   Register Dst = MI.getOperand(0).getReg();
2771   const LLT S32 = LLT::scalar(32);
2772   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2773 
2774   Register Src0 = MI.getOperand(1).getReg();
2775   Register Src1 = MI.getOperand(2).getReg();
2776   assert(MRI.getType(Src0) == LLT::scalar(16));
2777 
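  // Merge the two 16-bit sources into a single 32-bit scalar (Src0 in the low
  // half, Src1 in the high half) and bitcast that to the packed v2s16 result.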
2778   auto Merge = B.buildMerge(S32, {Src0, Src1});
2779   B.buildBitcast(Dst, Merge);
2780 
2781   MI.eraseFromParent();
2782   return true;
2783 }
2784 
2785 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2786 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2787 // case with a single min instruction instead of a compare+select.
2788 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2789                                             MachineRegisterInfo &MRI,
2790                                             MachineIRBuilder &B) const {
2791   Register Dst = MI.getOperand(0).getReg();
2792   Register Src = MI.getOperand(1).getReg();
2793   LLT DstTy = MRI.getType(Dst);
2794   LLT SrcTy = MRI.getType(Src);
2795 
2796   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2797                         ? AMDGPU::G_AMDGPU_FFBH_U32
2798                         : AMDGPU::G_AMDGPU_FFBL_B32;
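  // FFBH/FFBL return -1 for a zero input; taking the unsigned min with the
  // source bit width maps that to the value G_CTLZ/G_CTTZ expect for zero.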
2799   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2800   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2801 
2802   MI.eraseFromParent();
2803   return true;
2804 }
2805 
2806 // Check that this is a G_XOR x, -1
2807 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2808   if (MI.getOpcode() != TargetOpcode::G_XOR)
2809     return false;
2810   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2811   return ConstVal && *ConstVal == -1;
2812 }
2813 
// Return the use branch instruction, or null if the usage is invalid.
2815 static MachineInstr *
2816 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2817                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2818   Register CondDef = MI.getOperand(0).getReg();
2819   if (!MRI.hasOneNonDBGUse(CondDef))
2820     return nullptr;
2821 
2822   MachineBasicBlock *Parent = MI.getParent();
2823   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2824 
2825   if (isNot(MRI, *UseMI)) {
2826     Register NegatedCond = UseMI->getOperand(0).getReg();
2827     if (!MRI.hasOneNonDBGUse(NegatedCond))
2828       return nullptr;
2829 
2830     // The negation is folded into the branch handling below, making this G_XOR dead; erase it.
2831     UseMI->eraseFromParent();
2832 
2833     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2834     Negated = true;
2835   }
2836 
2837   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2838     return nullptr;
2839 
2840   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2841   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2842   if (Next == Parent->end()) {
2843     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2844     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2845       return nullptr;
2846     UncondBrTarget = &*NextMBB;
2847   } else {
2848     if (Next->getOpcode() != AMDGPU::G_BR)
2849       return nullptr;
2850     Br = &*Next;
2851     UncondBrTarget = Br->getOperand(0).getMBB();
2852   }
2853 
2854   return UseMI;
2855 }
2856 
2857 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2858                                          const ArgDescriptor *Arg,
2859                                          const TargetRegisterClass *ArgRC,
2860                                          LLT ArgTy) const {
2861   MCRegister SrcReg = Arg->getRegister();
2862   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2863   assert(DstReg.isVirtual() && "Virtual register expected");
2864 
2865   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2866                                              ArgTy);
2867   if (Arg->isMasked()) {
2868     // TODO: Should we try to emit this once in the entry block?
2869     const LLT S32 = LLT::scalar(32);
2870     const unsigned Mask = Arg->getMask();
2871     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2872 
2873     Register AndMaskSrc = LiveIn;
2874 
2875     if (Shift != 0) {
2876       auto ShiftAmt = B.buildConstant(S32, Shift);
2877       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2878     }
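    // The net effect is (LiveIn >> Shift) & (Mask >> Shift); e.g. with a
    // hypothetical Mask of 0x3ff << 20, the value is recovered as
    // (LiveIn >> 20) & 0x3ff.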
2879 
2880     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2881   } else {
2882     B.buildCopy(DstReg, LiveIn);
2883   }
2884 
2885   return true;
2886 }
2887 
2888 bool AMDGPULegalizerInfo::loadInputValue(
2889     Register DstReg, MachineIRBuilder &B,
2890     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2891   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2892   const ArgDescriptor *Arg;
2893   const TargetRegisterClass *ArgRC;
2894   LLT ArgTy;
2895   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2896 
2897   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2898     return false; // TODO: Handle these
2899   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2900 }
2901 
2902 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2903     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2904     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2905   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2906     return false;
2907 
2908   MI.eraseFromParent();
2909   return true;
2910 }
2911 
2912 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2913                                        MachineRegisterInfo &MRI,
2914                                        MachineIRBuilder &B) const {
2915   Register Dst = MI.getOperand(0).getReg();
2916   LLT DstTy = MRI.getType(Dst);
2917   LLT S16 = LLT::scalar(16);
2918   LLT S32 = LLT::scalar(32);
2919   LLT S64 = LLT::scalar(64);
2920 
2921   if (DstTy == S16)
2922     return legalizeFDIV16(MI, MRI, B);
2923   if (DstTy == S32)
2924     return legalizeFDIV32(MI, MRI, B);
2925   if (DstTy == S64)
2926     return legalizeFDIV64(MI, MRI, B);
2927 
2928   return false;
2929 }
2930 
2931 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2932                                                         Register DstDivReg,
2933                                                         Register DstRemReg,
2934                                                         Register X,
2935                                                         Register Y) const {
2936   const LLT S1 = LLT::scalar(1);
2937   const LLT S32 = LLT::scalar(32);
2938 
2939   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2940   // algorithm used here.
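  // In short (a sketch of that algorithm): compute Z ~= 2**32 / Y in floating
  // point, sharpen it with one Newton-Raphson step, and take umulh(X, Z) as the
  // quotient estimate. The estimate is biased low, which is why the two
  // refinement steps below only ever add 1 to the quotient.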
2941 
2942   // Initial estimate of inv(y).
2943   auto FloatY = B.buildUITOFP(S32, Y);
2944   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2945   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2946   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2947   auto Z = B.buildFPTOUI(S32, ScaledY);
2948 
2949   // One round of UNR (unsigned Newton-Raphson) refinement.
2950   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2951   auto NegYZ = B.buildMul(S32, NegY, Z);
2952   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2953 
2954   // Quotient/remainder estimate.
2955   auto Q = B.buildUMulH(S32, X, Z);
2956   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2957 
2958   // First quotient/remainder refinement.
2959   auto One = B.buildConstant(S32, 1);
2960   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2961   if (DstDivReg)
2962     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2963   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2964 
2965   // Second quotient/remainder refinement.
2966   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2967   if (DstDivReg)
2968     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2969 
2970   if (DstRemReg)
2971     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2972 }
2973 
2974 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2975 //
2976 // Return lo, hi of result
2977 //
2978 // %cvt.lo = G_UITOFP Val.lo
2979 // %cvt.hi = G_UITOFP Val.hi
2980 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2981 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2982 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2983 // %mul2 = G_FMUL %mul1, 2**(-32)
2984 // %trunc = G_INTRINSIC_TRUNC %mul2
2985 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2986 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
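//
// For reference, the f32 bit patterns used below decode approximately as
// 0x4f800000 = 2**32, 0x5f7ffffc = just under 2**64, 0x2f800000 = 2**(-32) and
// 0xcf800000 = -(2**32); the result is roughly 2**64 / Val split into 32-bit
// halves (a sketch of the intent, not a statement of the exact error bounds).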
2987 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2988                                                        Register Val) {
2989   const LLT S32 = LLT::scalar(32);
2990   auto Unmerge = B.buildUnmerge(S32, Val);
2991 
2992   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2993   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2994 
2995   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2996                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2997 
2998   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2999   auto Mul1 =
3000       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3001 
3002   // 2**(-32)
3003   auto Mul2 =
3004       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3005   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3006 
3007   // -(2**32)
3008   auto Mad2 = B.buildFMAD(S32, Trunc,
3009                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3010 
3011   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3012   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3013 
3014   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3015 }
3016 
3017 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3018                                                         Register DstDivReg,
3019                                                         Register DstRemReg,
3020                                                         Register Numer,
3021                                                         Register Denom) const {
3022   const LLT S32 = LLT::scalar(32);
3023   const LLT S64 = LLT::scalar(64);
3024   const LLT S1 = LLT::scalar(1);
3025   Register RcpLo, RcpHi;
3026 
3027   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3028 
3029   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3030 
3031   auto Zero64 = B.buildConstant(S64, 0);
3032   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3033 
3034   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3035   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3036 
3037   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3038   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3039   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3040 
3041   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3042   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3043   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3044   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3045 
3046   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3047   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3048   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3049   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3050   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3051 
3052   auto Zero32 = B.buildConstant(S32, 0);
3053   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3054   auto Add2_HiC =
3055       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3056   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3057   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3058 
3059   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3060   Register NumerLo = UnmergeNumer.getReg(0);
3061   Register NumerHi = UnmergeNumer.getReg(1);
3062 
3063   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3064   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3065   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3066   Register Mul3_Lo = UnmergeMul3.getReg(0);
3067   Register Mul3_Hi = UnmergeMul3.getReg(1);
3068   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3069   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3070   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3071   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3072 
3073   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3074   Register DenomLo = UnmergeDenom.getReg(0);
3075   Register DenomHi = UnmergeDenom.getReg(1);
3076 
3077   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3078   auto C1 = B.buildSExt(S32, CmpHi);
3079 
3080   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3081   auto C2 = B.buildSExt(S32, CmpLo);
3082 
3083   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3084   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3085 
3086   // TODO: Here and below, portions of the code can be enclosed in if/endif.
3087   // Currently the control flow is unconditional and we have 4 selects after
3088   // the potential endif to substitute PHIs.
3089 
3090   // if C3 != 0 ...
3091   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3092   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3093   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3094   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3095 
3096   auto One64 = B.buildConstant(S64, 1);
3097   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3098 
3099   auto C4 =
3100       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3101   auto C5 =
3102       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3103   auto C6 = B.buildSelect(
3104       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3105 
3106   // if (C6 != 0)
3107   auto Add4 = B.buildAdd(S64, Add3, One64);
3108   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3109 
3110   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3111   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3112   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3113 
3114   // endif C6
3115   // endif C3
3116 
3117   if (DstDivReg) {
3118     auto Sel1 = B.buildSelect(
3119         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3120     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3121                   Sel1, MulHi3);
3122   }
3123 
3124   if (DstRemReg) {
3125     auto Sel2 = B.buildSelect(
3126         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3127     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3128                   Sel2, Sub1);
3129   }
3130 }
3131 
3132 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3133                                                   MachineRegisterInfo &MRI,
3134                                                   MachineIRBuilder &B) const {
3135   Register DstDivReg, DstRemReg;
3136   switch (MI.getOpcode()) {
3137   default:
3138     llvm_unreachable("Unexpected opcode!");
3139   case AMDGPU::G_UDIV: {
3140     DstDivReg = MI.getOperand(0).getReg();
3141     break;
3142   }
3143   case AMDGPU::G_UREM: {
3144     DstRemReg = MI.getOperand(0).getReg();
3145     break;
3146   }
3147   case AMDGPU::G_UDIVREM: {
3148     DstDivReg = MI.getOperand(0).getReg();
3149     DstRemReg = MI.getOperand(1).getReg();
3150     break;
3151   }
3152   }
3153 
3154   const LLT S64 = LLT::scalar(64);
3155   const LLT S32 = LLT::scalar(32);
3156   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3157   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3158   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3159   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3160 
3161   if (Ty == S32)
3162     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3163   else if (Ty == S64)
3164     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3165   else
3166     return false;
3167 
3168   MI.eraseFromParent();
3169   return true;
3170 }
3171 
3172 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3173                                                 MachineRegisterInfo &MRI,
3174                                                 MachineIRBuilder &B) const {
3175   const LLT S64 = LLT::scalar(64);
3176   const LLT S32 = LLT::scalar(32);
3177 
3178   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3179   if (Ty != S32 && Ty != S64)
3180     return false;
3181 
3182   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3183   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3184   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3185 
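  // Take absolute values via (x + sign) ^ sign, where sign = x >> (bits - 1) is
  // 0 or -1, run the unsigned expansion, then re-apply the signs with the
  // inverse transform. For example (s32): -7 sdiv 3 gives |x| = 7, |y| = 3,
  // udiv = 2, and (2 ^ -1) - (-1) = -2.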
3186   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3187   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3188   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3189 
3190   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3191   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3192 
3193   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3194   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3195 
3196   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3197   switch (MI.getOpcode()) {
3198   default:
3199     llvm_unreachable("Unexpected opcode!");
3200   case AMDGPU::G_SDIV: {
3201     DstDivReg = MI.getOperand(0).getReg();
3202     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3203     break;
3204   }
3205   case AMDGPU::G_SREM: {
3206     DstRemReg = MI.getOperand(0).getReg();
3207     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3208     break;
3209   }
3210   case AMDGPU::G_SDIVREM: {
3211     DstDivReg = MI.getOperand(0).getReg();
3212     DstRemReg = MI.getOperand(1).getReg();
3213     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3214     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3215     break;
3216   }
3217   }
3218 
3219   if (Ty == S32)
3220     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3221   else
3222     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3223 
3224   if (DstDivReg) {
3225     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3226     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3227     B.buildSub(DstDivReg, SignXor, Sign);
3228   }
3229 
3230   if (DstRemReg) {
3231     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3232     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3233     B.buildSub(DstRemReg, SignXor, Sign);
3234   }
3235 
3236   MI.eraseFromParent();
3237   return true;
3238 }
3239 
3240 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3241                                                  MachineRegisterInfo &MRI,
3242                                                  MachineIRBuilder &B) const {
3243   Register Res = MI.getOperand(0).getReg();
3244   Register LHS = MI.getOperand(1).getReg();
3245   Register RHS = MI.getOperand(2).getReg();
3246   uint16_t Flags = MI.getFlags();
3247   LLT ResTy = MRI.getType(Res);
3248 
3249   const MachineFunction &MF = B.getMF();
3250   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3251                             MI.getFlag(MachineInstr::FmAfn);
3252 
3253   if (!AllowInaccurateRcp)
3254     return false;
3255 
3256   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3257     // 1 / x -> RCP(x)
3258     if (CLHS->isExactlyValue(1.0)) {
3259       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3260         .addUse(RHS)
3261         .setMIFlags(Flags);
3262 
3263       MI.eraseFromParent();
3264       return true;
3265     }
3266 
3267     // -1 / x -> RCP( FNEG(x) )
3268     if (CLHS->isExactlyValue(-1.0)) {
3269       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3270       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3271         .addUse(FNeg.getReg(0))
3272         .setMIFlags(Flags);
3273 
3274       MI.eraseFromParent();
3275       return true;
3276     }
3277   }
3278 
3279   // x / y -> x * (1.0 / y)
3280   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3281     .addUse(RHS)
3282     .setMIFlags(Flags);
3283   B.buildFMul(Res, LHS, RCP, Flags);
3284 
3285   MI.eraseFromParent();
3286   return true;
3287 }
3288 
3289 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3290                                                    MachineRegisterInfo &MRI,
3291                                                    MachineIRBuilder &B) const {
3292   Register Res = MI.getOperand(0).getReg();
3293   Register X = MI.getOperand(1).getReg();
3294   Register Y = MI.getOperand(2).getReg();
3295   uint16_t Flags = MI.getFlags();
3296   LLT ResTy = MRI.getType(Res);
3297 
3298   const MachineFunction &MF = B.getMF();
3299   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3300                             MI.getFlag(MachineInstr::FmAfn);
3301 
3302   if (!AllowInaccurateRcp)
3303     return false;
3304 
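  // x / y is approximated as x * r, where r starts as rcp(y) and each
  // fma(fma(-y, r, 1), r, r) below is a Newton-Raphson step that roughly
  // doubles the number of correct bits; the final fma corrects the product
  // (a sketch of the intent of the sequence).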
3305   auto NegY = B.buildFNeg(ResTy, Y);
3306   auto One = B.buildFConstant(ResTy, 1.0);
3307 
3308   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3309     .addUse(Y)
3310     .setMIFlags(Flags);
3311 
3312   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3313   R = B.buildFMA(ResTy, Tmp0, R, R);
3314 
3315   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3316   R = B.buildFMA(ResTy, Tmp1, R, R);
3317 
3318   auto Ret = B.buildFMul(ResTy, X, R);
3319   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3320 
3321   B.buildFMA(Res, Tmp2, R, Ret);
3322   MI.eraseFromParent();
3323   return true;
3324 }
3325 
3326 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3327                                          MachineRegisterInfo &MRI,
3328                                          MachineIRBuilder &B) const {
3329   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3330     return true;
3331 
3332   Register Res = MI.getOperand(0).getReg();
3333   Register LHS = MI.getOperand(1).getReg();
3334   Register RHS = MI.getOperand(2).getReg();
3335 
3336   uint16_t Flags = MI.getFlags();
3337 
3338   LLT S16 = LLT::scalar(16);
3339   LLT S32 = LLT::scalar(32);
3340 
3341   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3342   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3343 
3344   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3345     .addUse(RHSExt.getReg(0))
3346     .setMIFlags(Flags);
3347 
3348   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3349   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3350 
3351   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3352     .addUse(RDst.getReg(0))
3353     .addUse(RHS)
3354     .addUse(LHS)
3355     .setMIFlags(Flags);
3356 
3357   MI.eraseFromParent();
3358   return true;
3359 }
3360 
3361 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3362 // to enable denorm mode; otherwise, restore the function's default FP32 denorm mode.
3363 static void toggleSPDenormMode(bool Enable,
3364                                MachineIRBuilder &B,
3365                                const GCNSubtarget &ST,
3366                                AMDGPU::SIModeRegisterDefaults Mode) {
3367   // Set SP denorm mode to this value.
3368   unsigned SPDenormMode =
3369     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3370 
3371   if (ST.hasDenormModeInst()) {
3372     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3373     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3374 
3375     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3376     B.buildInstr(AMDGPU::S_DENORM_MODE)
3377       .addImm(NewDenormModeValue);
3378 
3379   } else {
3380     // Select FP32 bit field in mode register.
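    // The encoding is hwreg(HW_REG_MODE, 4, 2), i.e. offset 4 and width 2,
    // which is expected to cover the two FP32 denorm bits of the MODE register
    // while leaving the FP64/FP16 bits alone.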
3381     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3382                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3383                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3384 
3385     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3386       .addImm(SPDenormMode)
3387       .addImm(SPDenormModeBitField);
3388   }
3389 }
3390 
3391 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3392                                          MachineRegisterInfo &MRI,
3393                                          MachineIRBuilder &B) const {
3394   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3395     return true;
3396 
3397   Register Res = MI.getOperand(0).getReg();
3398   Register LHS = MI.getOperand(1).getReg();
3399   Register RHS = MI.getOperand(2).getReg();
3400   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3401   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3402 
3403   uint16_t Flags = MI.getFlags();
3404 
3405   LLT S32 = LLT::scalar(32);
3406   LLT S1 = LLT::scalar(1);
3407 
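  // Roughly: div_scale pre-scales the operands to keep intermediates out of the
  // denormal/overflow range, rcp plus the FMA chain refines the quotient,
  // div_fmas applies the scale-dependent correction using the div_scale flag,
  // and div_fixup handles signs and special cases (a summary only, not a
  // precise description of the intrinsic semantics).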
3408   auto One = B.buildFConstant(S32, 1.0f);
3409 
3410   auto DenominatorScaled =
3411     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3412       .addUse(LHS)
3413       .addUse(RHS)
3414       .addImm(0)
3415       .setMIFlags(Flags);
3416   auto NumeratorScaled =
3417     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3418       .addUse(LHS)
3419       .addUse(RHS)
3420       .addImm(1)
3421       .setMIFlags(Flags);
3422 
3423   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3424     .addUse(DenominatorScaled.getReg(0))
3425     .setMIFlags(Flags);
3426   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3427 
3428   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3429   // aren't modeled as reading it.
3430   if (!Mode.allFP32Denormals())
3431     toggleSPDenormMode(true, B, ST, Mode);
3432 
3433   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3434   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3435   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3436   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3437   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3438   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3439 
3440   if (!Mode.allFP32Denormals())
3441     toggleSPDenormMode(false, B, ST, Mode);
3442 
3443   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3444     .addUse(Fma4.getReg(0))
3445     .addUse(Fma1.getReg(0))
3446     .addUse(Fma3.getReg(0))
3447     .addUse(NumeratorScaled.getReg(1))
3448     .setMIFlags(Flags);
3449 
3450   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3451     .addUse(Fmas.getReg(0))
3452     .addUse(RHS)
3453     .addUse(LHS)
3454     .setMIFlags(Flags);
3455 
3456   MI.eraseFromParent();
3457   return true;
3458 }
3459 
3460 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3461                                          MachineRegisterInfo &MRI,
3462                                          MachineIRBuilder &B) const {
3463   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3464     return true;
3465 
3466   Register Res = MI.getOperand(0).getReg();
3467   Register LHS = MI.getOperand(1).getReg();
3468   Register RHS = MI.getOperand(2).getReg();
3469 
3470   uint16_t Flags = MI.getFlags();
3471 
3472   LLT S64 = LLT::scalar(64);
3473   LLT S1 = LLT::scalar(1);
3474 
3475   auto One = B.buildFConstant(S64, 1.0);
3476 
3477   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3478     .addUse(LHS)
3479     .addUse(RHS)
3480     .addImm(0)
3481     .setMIFlags(Flags);
3482 
3483   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3484 
3485   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3486     .addUse(DivScale0.getReg(0))
3487     .setMIFlags(Flags);
3488 
3489   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3490   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3491   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3492 
3493   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3494     .addUse(LHS)
3495     .addUse(RHS)
3496     .addImm(1)
3497     .setMIFlags(Flags);
3498 
3499   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3500   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3501   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3502 
3503   Register Scale;
3504   if (!ST.hasUsableDivScaleConditionOutput()) {
3505     // Workaround a hardware bug on SI where the condition output from div_scale
3506     // is not usable.
3507 
3508     LLT S32 = LLT::scalar(32);
3509 
3510     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3511     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3512     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3513     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3514 
3515     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3516                               Scale1Unmerge.getReg(1));
3517     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3518                               Scale0Unmerge.getReg(1));
3519     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3520   } else {
3521     Scale = DivScale1.getReg(1);
3522   }
3523 
3524   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3525     .addUse(Fma4.getReg(0))
3526     .addUse(Fma3.getReg(0))
3527     .addUse(Mul.getReg(0))
3528     .addUse(Scale)
3529     .setMIFlags(Flags);
3530 
3531   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3532     .addUse(Fmas.getReg(0))
3533     .addUse(RHS)
3534     .addUse(LHS)
3535     .setMIFlags(Flags);
3536 
3537   MI.eraseFromParent();
3538   return true;
3539 }
3540 
3541 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3542                                                  MachineRegisterInfo &MRI,
3543                                                  MachineIRBuilder &B) const {
3544   Register Res = MI.getOperand(0).getReg();
3545   Register LHS = MI.getOperand(2).getReg();
3546   Register RHS = MI.getOperand(3).getReg();
3547   uint16_t Flags = MI.getFlags();
3548 
3549   LLT S32 = LLT::scalar(32);
3550   LLT S1 = LLT::scalar(1);
3551 
3552   auto Abs = B.buildFAbs(S32, RHS, Flags);
3553   const APFloat C0Val(1.0f);
3554 
3555   auto C0 = B.buildConstant(S32, 0x6f800000);
3556   auto C1 = B.buildConstant(S32, 0x2f800000);
3557   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
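  // 0x6f800000 is 2**96 and 0x2f800000 is 2**(-32) as f32 bit patterns: when
  // |RHS| is very large, the denominator is pre-scaled by 2**(-32) so the
  // reciprocal stays in range, and the final multiply by Sel undoes the scaling.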
3558 
3559   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3560   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3561 
3562   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3563 
3564   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3565     .addUse(Mul0.getReg(0))
3566     .setMIFlags(Flags);
3567 
3568   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3569 
3570   B.buildFMul(Res, Sel, Mul1, Flags);
3571 
3572   MI.eraseFromParent();
3573   return true;
3574 }
3575 
3576 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3577 // FIXME: Why do we handle this one but not other removed instructions?
3578 //
3579 // Reciprocal square root.  The clamp prevents infinite results, clamping
3580 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3581 // +-max_float.
3582 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3583                                                     MachineRegisterInfo &MRI,
3584                                                     MachineIRBuilder &B) const {
3585   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3586     return true;
3587 
3588   Register Dst = MI.getOperand(0).getReg();
3589   Register Src = MI.getOperand(2).getReg();
3590   auto Flags = MI.getFlags();
3591 
3592   LLT Ty = MRI.getType(Dst);
3593 
3594   const fltSemantics *FltSemantics;
3595   if (Ty == LLT::scalar(32))
3596     FltSemantics = &APFloat::IEEEsingle();
3597   else if (Ty == LLT::scalar(64))
3598     FltSemantics = &APFloat::IEEEdouble();
3599   else
3600     return false;
3601 
3602   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3603     .addUse(Src)
3604     .setMIFlags(Flags);
3605 
3606   // We don't need to concern ourselves with the snan handling difference, since
3607   // the rsq already quieted it (or not), so use the variant that will directly select.
3608   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3609   const bool UseIEEE = MFI->getMode().IEEE;
3610 
3611   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3612   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3613                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3614 
3615   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3616 
3617   if (UseIEEE)
3618     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3619   else
3620     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3621   MI.eraseFromParent();
3622   return true;
3623 }
3624 
3625 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3626   switch (IID) {
3627   case Intrinsic::amdgcn_ds_fadd:
3628     return AMDGPU::G_ATOMICRMW_FADD;
3629   case Intrinsic::amdgcn_ds_fmin:
3630     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3631   case Intrinsic::amdgcn_ds_fmax:
3632     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3633   default:
3634     llvm_unreachable("not a DS FP intrinsic");
3635   }
3636 }
3637 
3638 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3639                                                       MachineInstr &MI,
3640                                                       Intrinsic::ID IID) const {
3641   GISelChangeObserver &Observer = Helper.Observer;
3642   Observer.changingInstr(MI);
3643 
3644   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3645 
3646   // The remaining operands were used to set fields in the MemOperand on
3647   // construction.
3648   for (int I = 6; I > 3; --I)
3649     MI.RemoveOperand(I);
3650 
3651   MI.RemoveOperand(1); // Remove the intrinsic ID.
3652   Observer.changedInstr(MI);
3653   return true;
3654 }
3655 
3656 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3657                                             MachineRegisterInfo &MRI,
3658                                             MachineIRBuilder &B) const {
3659   uint64_t Offset =
3660     ST.getTargetLowering()->getImplicitParameterOffset(
3661       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3662   LLT DstTy = MRI.getType(DstReg);
3663   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3664 
3665   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3666   if (!loadInputValue(KernargPtrReg, B,
3667                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3668     return false;
3669 
3670   // FIXME: This should be nuw
3671   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3672   return true;
3673 }
3674 
3675 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3676                                                  MachineRegisterInfo &MRI,
3677                                                  MachineIRBuilder &B) const {
3678   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3679   if (!MFI->isEntryFunction()) {
3680     return legalizePreloadedArgIntrin(MI, MRI, B,
3681                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3682   }
3683 
3684   Register DstReg = MI.getOperand(0).getReg();
3685   if (!getImplicitArgPtr(DstReg, MRI, B))
3686     return false;
3687 
3688   MI.eraseFromParent();
3689   return true;
3690 }
3691 
3692 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3693                                               MachineRegisterInfo &MRI,
3694                                               MachineIRBuilder &B,
3695                                               unsigned AddrSpace) const {
3696   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3697   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3698   Register Hi32 = Unmerge.getReg(1);
3699 
3700   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3701   MI.eraseFromParent();
3702   return true;
3703 }
3704 
3705 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3706 // offset (the offset that is included in bounds checking and swizzling, to be
3707 // split between the instruction's voffset and immoffset fields) and soffset
3708 // (the offset that is excluded from bounds checking and swizzling, to go in
3709 // the instruction's soffset field).  This function takes the first kind of
3710 // offset and figures out how to split it between voffset and immoffset.
3711 std::pair<Register, unsigned>
3712 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3713                                         Register OrigOffset) const {
3714   const unsigned MaxImm = 4095;
3715   Register BaseReg;
3716   unsigned ImmOffset;
3717   const LLT S32 = LLT::scalar(32);
3718   MachineRegisterInfo &MRI = *B.getMRI();
3719 
3720   std::tie(BaseReg, ImmOffset) =
3721       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3722 
3723   // If BaseReg is a pointer, convert it to int.
3724   if (MRI.getType(BaseReg).isPointer())
3725     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3726 
3727   // If the immediate value is too big for the immoffset field, keep only its
3728   // low 12 bits (the value modulo 4096) there, so that the value copied/added
3729   // for the voffset field is a multiple of 4096 and stands a better chance of
3730   // being CSEd with the copy/add for another similar load/store.
3731   // However, do not do that rounding down to a multiple of 4096 if that is a
3732   // negative number, as it appears to be illegal to have a negative offset
3733   // in the vgpr, even if adding the immediate offset makes it positive.
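  // For example (illustrative), a constant offset of 4100 is split into
  // ImmOffset = 4 with the remaining 4096 folded into the voffset add.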
3734   unsigned Overflow = ImmOffset & ~MaxImm;
3735   ImmOffset -= Overflow;
3736   if ((int32_t)Overflow < 0) {
3737     Overflow += ImmOffset;
3738     ImmOffset = 0;
3739   }
3740 
3741   if (Overflow != 0) {
3742     if (!BaseReg) {
3743       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3744     } else {
3745       auto OverflowVal = B.buildConstant(S32, Overflow);
3746       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3747     }
3748   }
3749 
3750   if (!BaseReg)
3751     BaseReg = B.buildConstant(S32, 0).getReg(0);
3752 
3753   return std::make_pair(BaseReg, ImmOffset);
3754 }
3755 
3756 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3757 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3758                                           Register VOffset, Register SOffset,
3759                                           unsigned ImmOffset, Register VIndex,
3760                                           MachineRegisterInfo &MRI) const {
3761   Optional<ValueAndVReg> MaybeVOffsetVal =
3762       getConstantVRegValWithLookThrough(VOffset, MRI);
3763   Optional<ValueAndVReg> MaybeSOffsetVal =
3764       getConstantVRegValWithLookThrough(SOffset, MRI);
3765   Optional<ValueAndVReg> MaybeVIndexVal =
3766       getConstantVRegValWithLookThrough(VIndex, MRI);
3767   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3768   // update the MMO with that offset. The stride is unknown so we can only do
3769   // this if VIndex is constant 0.
3770   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3771       MaybeVIndexVal->Value == 0) {
3772     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3773                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3774     MMO->setOffset(TotalOffset);
3775   } else {
3776     // We don't have a constant combined offset to use in the MMO. Give up.
3777     MMO->setValue((Value *)nullptr);
3778   }
3779 }
3780 
3781 /// Handle register layout difference for f16 images for some subtargets.
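/// On subtargets with unpacked d16 memory instructions each 16-bit element is
/// expected in the low half of a 32-bit register, so e.g. a <2 x s16> value is
/// widened to <2 x s32> by any-extending each element; subtargets with the
/// image store d16 bug instead get the data padded/repacked as done below.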
3782 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3783                                              MachineRegisterInfo &MRI,
3784                                              Register Reg,
3785                                              bool ImageStore) const {
3786   const LLT S16 = LLT::scalar(16);
3787   const LLT S32 = LLT::scalar(32);
3788   LLT StoreVT = MRI.getType(Reg);
3789   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3790 
3791   if (ST.hasUnpackedD16VMem()) {
3792     auto Unmerge = B.buildUnmerge(S16, Reg);
3793 
3794     SmallVector<Register, 4> WideRegs;
3795     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3796       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3797 
3798     int NumElts = StoreVT.getNumElements();
3799 
3800     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3801         .getReg(0);
3802   }
3803 
3804   if (ImageStore && ST.hasImageStoreD16Bug()) {
3805     if (StoreVT.getNumElements() == 2) {
3806       SmallVector<Register, 4> PackedRegs;
3807       Reg = B.buildBitcast(S32, Reg).getReg(0);
3808       PackedRegs.push_back(Reg);
3809       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3810       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3811           .getReg(0);
3812     }
3813 
3814     if (StoreVT.getNumElements() == 3) {
3815       SmallVector<Register, 4> PackedRegs;
3816       auto Unmerge = B.buildUnmerge(S16, Reg);
3817       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3818         PackedRegs.push_back(Unmerge.getReg(I));
3819       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3820       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3821       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3822     }
3823 
3824     if (StoreVT.getNumElements() == 4) {
3825       SmallVector<Register, 4> PackedRegs;
3826       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3827       auto Unmerge = B.buildUnmerge(S32, Reg);
3828       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3829         PackedRegs.push_back(Unmerge.getReg(I));
3830       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3831       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3832           .getReg(0);
3833     }
3834 
3835     llvm_unreachable("invalid data type");
3836   }
3837 
3838   return Reg;
3839 }
3840 
3841 Register AMDGPULegalizerInfo::fixStoreSourceType(
3842   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3843   MachineRegisterInfo *MRI = B.getMRI();
3844   LLT Ty = MRI->getType(VData);
3845 
3846   const LLT S16 = LLT::scalar(16);
3847 
3848   // Fixup illegal register types for i8 and i16 stores.
3849   if (Ty == LLT::scalar(8) || Ty == S16) {
3850     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3851     return AnyExt;
3852   }
3853 
3854   if (Ty.isVector()) {
3855     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3856       if (IsFormat)
3857         return handleD16VData(B, *MRI, VData);
3858     }
3859   }
3860 
3861   return VData;
3862 }
3863 
3864 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3865                                               MachineRegisterInfo &MRI,
3866                                               MachineIRBuilder &B,
3867                                               bool IsTyped,
3868                                               bool IsFormat) const {
3869   Register VData = MI.getOperand(1).getReg();
3870   LLT Ty = MRI.getType(VData);
3871   LLT EltTy = Ty.getScalarType();
3872   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3873   const LLT S32 = LLT::scalar(32);
3874 
3875   VData = fixStoreSourceType(B, VData, IsFormat);
3876   Register RSrc = MI.getOperand(2).getReg();
3877 
3878   MachineMemOperand *MMO = *MI.memoperands_begin();
3879   const int MemSize = MMO->getSize();
3880 
3881   unsigned ImmOffset;
3882 
3883   // The typed intrinsics add an immediate after the registers.
3884   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3885 
3886   // The struct intrinsic variants add one additional operand over raw.
3887   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3888   Register VIndex;
3889   int OpOffset = 0;
3890   if (HasVIndex) {
3891     VIndex = MI.getOperand(3).getReg();
3892     OpOffset = 1;
3893   } else {
3894     VIndex = B.buildConstant(S32, 0).getReg(0);
3895   }
3896 
3897   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3898   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3899 
3900   unsigned Format = 0;
3901   if (IsTyped) {
3902     Format = MI.getOperand(5 + OpOffset).getImm();
3903     ++OpOffset;
3904   }
3905 
3906   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3907 
3908   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3909   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3910 
3911   unsigned Opc;
3912   if (IsTyped) {
3913     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3914                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3915   } else if (IsFormat) {
3916     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3917                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3918   } else {
3919     switch (MemSize) {
3920     case 1:
3921       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3922       break;
3923     case 2:
3924       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3925       break;
3926     default:
3927       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3928       break;
3929     }
3930   }
3931 
3932   auto MIB = B.buildInstr(Opc)
3933     .addUse(VData)              // vdata
3934     .addUse(RSrc)               // rsrc
3935     .addUse(VIndex)             // vindex
3936     .addUse(VOffset)            // voffset
3937     .addUse(SOffset)            // soffset
3938     .addImm(ImmOffset);         // offset(imm)
3939 
3940   if (IsTyped)
3941     MIB.addImm(Format);
3942 
3943   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3944      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3945      .addMemOperand(MMO);
3946 
3947   MI.eraseFromParent();
3948   return true;
3949 }
3950 
3951 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3952                                              MachineRegisterInfo &MRI,
3953                                              MachineIRBuilder &B,
3954                                              bool IsFormat,
3955                                              bool IsTyped) const {
3956   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3957   MachineMemOperand *MMO = *MI.memoperands_begin();
3958   const LLT MemTy = MMO->getMemoryType();
3959   const LLT S32 = LLT::scalar(32);
3960 
3961   Register Dst = MI.getOperand(0).getReg();
3962   Register RSrc = MI.getOperand(2).getReg();
3963 
3964   // The typed intrinsics add an immediate after the registers.
3965   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3966 
3967   // The struct intrinsic variants add one additional operand over raw.
3968   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3969   Register VIndex;
3970   int OpOffset = 0;
3971   if (HasVIndex) {
3972     VIndex = MI.getOperand(3).getReg();
3973     OpOffset = 1;
3974   } else {
3975     VIndex = B.buildConstant(S32, 0).getReg(0);
3976   }
3977 
3978   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3979   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3980 
3981   unsigned Format = 0;
3982   if (IsTyped) {
3983     Format = MI.getOperand(5 + OpOffset).getImm();
3984     ++OpOffset;
3985   }
3986 
3987   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3988   unsigned ImmOffset;
3989 
3990   LLT Ty = MRI.getType(Dst);
3991   LLT EltTy = Ty.getScalarType();
3992   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3993   const bool Unpacked = ST.hasUnpackedD16VMem();
3994 
3995   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3996   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3997 
3998   unsigned Opc;
3999 
4000   if (IsTyped) {
4001     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4002                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4003   } else if (IsFormat) {
4004     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4005                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4006   } else {
4007     switch (MemTy.getSizeInBits()) {
4008     case 8:
4009       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4010       break;
4011     case 16:
4012       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4013       break;
4014     default:
4015       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4016       break;
4017     }
4018   }
4019 
4020   Register LoadDstReg;
4021 
4022   bool IsExtLoad =
4023       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4024   LLT UnpackedTy = Ty.changeElementSize(32);
4025 
4026   if (IsExtLoad)
4027     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4028   else if (Unpacked && IsD16 && Ty.isVector())
4029     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4030   else
4031     LoadDstReg = Dst;
4032 
4033   auto MIB = B.buildInstr(Opc)
4034     .addDef(LoadDstReg)         // vdata
4035     .addUse(RSrc)               // rsrc
4036     .addUse(VIndex)             // vindex
4037     .addUse(VOffset)            // voffset
4038     .addUse(SOffset)            // soffset
4039     .addImm(ImmOffset);         // offset(imm)
4040 
4041   if (IsTyped)
4042     MIB.addImm(Format);
4043 
4044   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4045      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4046      .addMemOperand(MMO);
4047 
4048   if (LoadDstReg != Dst) {
4049     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4050 
4051     // The result was widened for an extending load; truncate back to the original type.
4052     if (IsExtLoad)
4053       B.buildTrunc(Dst, LoadDstReg);
4054     else {
4055       // Repack to original 16-bit vector result
4056       // FIXME: G_TRUNC should work, but legalization currently fails
4057       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4058       SmallVector<Register, 4> Repack;
4059       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4060         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4061       B.buildMerge(Dst, Repack);
4062     }
4063   }
4064 
4065   MI.eraseFromParent();
4066   return true;
4067 }
4068 
4069 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4070                                                MachineIRBuilder &B,
4071                                                bool IsInc) const {
4072   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4073                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4074   B.buildInstr(Opc)
4075     .addDef(MI.getOperand(0).getReg())
4076     .addUse(MI.getOperand(2).getReg())
4077     .addUse(MI.getOperand(3).getReg())
4078     .cloneMemRefs(MI);
4079   MI.eraseFromParent();
4080   return true;
4081 }
4082 
4083 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4084   switch (IntrID) {
4085   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4086   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4087     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4088   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4089   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4090     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4091   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4092   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4093     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4094   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4095   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4096     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4097   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4098   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4100   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4101   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4102     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4103   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4104   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4105     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4106   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4107   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4108     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4109   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4110   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4111     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4112   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4113   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4114     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4115   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4116   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4117     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4118   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4119   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4120     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4121   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4122   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4123     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4124   case Intrinsic::amdgcn_buffer_atomic_fadd:
4125   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4126   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4127     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4128   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4129   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4130     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4131   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4132   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4133     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4134   default:
4135     llvm_unreachable("unhandled atomic opcode");
4136   }
4137 }
4138 
4139 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4140                                                MachineIRBuilder &B,
4141                                                Intrinsic::ID IID) const {
4142   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4143                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4144   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4145 
4146   Register Dst;
4147 
4148   int OpOffset = 0;
4149   if (HasReturn) {
4150     // A few FP atomics do not support return values.
4151     Dst = MI.getOperand(0).getReg();
4152   } else {
4153     OpOffset = -1;
4154   }
4155 
4156   Register VData = MI.getOperand(2 + OpOffset).getReg();
4157   Register CmpVal;
4158 
4159   if (IsCmpSwap) {
4160     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4161     ++OpOffset;
4162   }
4163 
4164   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4165   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4166 
4167   // The struct intrinsic variants add one additional operand over raw.
4168   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4169   Register VIndex;
4170   if (HasVIndex) {
4171     VIndex = MI.getOperand(4 + OpOffset).getReg();
4172     ++OpOffset;
4173   } else {
4174     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4175   }
4176 
4177   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4178   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4179   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4180 
4181   MachineMemOperand *MMO = *MI.memoperands_begin();
4182 
4183   unsigned ImmOffset;
4184   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4185   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4186 
4187   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4188 
4189   if (HasReturn)
4190     MIB.addDef(Dst);
4191 
4192   MIB.addUse(VData); // vdata
4193 
4194   if (IsCmpSwap)
4195     MIB.addReg(CmpVal);
4196 
4197   MIB.addUse(RSrc)               // rsrc
4198      .addUse(VIndex)             // vindex
4199      .addUse(VOffset)            // voffset
4200      .addUse(SOffset)            // soffset
4201      .addImm(ImmOffset)          // offset(imm)
4202      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4203      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4204      .addMemOperand(MMO);
4205 
4206   MI.eraseFromParent();
4207   return true;
4208 }
4209 
4210 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4211 /// vector with s16 typed elements.
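/// For example (illustrative), a 3-component a16 coordinate (u, v, w) becomes
/// the two dwords build_vector(u, v) and build_vector(w, undef).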
4212 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4213                                       SmallVectorImpl<Register> &PackedAddrs,
4214                                       unsigned ArgOffset,
4215                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4216                                       bool IsA16, bool IsG16) {
4217   const LLT S16 = LLT::scalar(16);
4218   const LLT V2S16 = LLT::fixed_vector(2, 16);
4219   auto EndIdx = Intr->VAddrEnd;
4220 
4221   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4222     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4223     if (!SrcOp.isReg())
4224       continue; // _L to _LZ may have eliminated this.
4225 
4226     Register AddrReg = SrcOp.getReg();
4227 
4228     if ((I < Intr->GradientStart) ||
4229         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4230         (I >= Intr->CoordStart && !IsA16)) {
4231       // Handle any gradient or coordinate operands that should not be packed
4232       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4233       PackedAddrs.push_back(AddrReg);
4234     } else {
4235       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4236       // derivatives dx/dh and dx/dv are packed with undef.
4237       if (((I + 1) >= EndIdx) ||
4238           ((Intr->NumGradients / 2) % 2 == 1 &&
4239            (I == static_cast<unsigned>(Intr->GradientStart +
4240                                        (Intr->NumGradients / 2) - 1) ||
4241             I == static_cast<unsigned>(Intr->GradientStart +
4242                                        Intr->NumGradients - 1))) ||
4243           // Check for _L to _LZ optimization
4244           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4245         PackedAddrs.push_back(
4246             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4247                 .getReg(0));
4248       } else {
4249         PackedAddrs.push_back(
4250             B.buildBuildVector(
4251                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4252                 .getReg(0));
4253         ++I;
4254       }
4255     }
4256   }
4257 }
4258 
4259 /// Convert from separate vaddr components to a single vector address register,
4260 /// and replace the remaining operands with $noreg.
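/// For example, three s32 address operands become a single <3 x s32>
/// G_BUILD_VECTOR in the first vaddr slot; more than 8 operands are first
/// padded with undef up to the next power of two elements.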
4261 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4262                                      int DimIdx, int NumVAddrs) {
4263   const LLT S32 = LLT::scalar(32);
4264 
4265   SmallVector<Register, 8> AddrRegs;
4266   for (int I = 0; I != NumVAddrs; ++I) {
4267     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4268     if (SrcOp.isReg()) {
4269       AddrRegs.push_back(SrcOp.getReg());
4270       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4271     }
4272   }
4273 
4274   int NumAddrRegs = AddrRegs.size();
4275   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4277     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4278       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4279       auto Undef = B.buildUndef(S32);
4280       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4281       NumAddrRegs = RoundedNumRegs;
4282     }
4283 
4284     auto VAddr =
4285         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4286     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4287   }
4288 
4289   for (int I = 1; I != NumVAddrs; ++I) {
4290     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4291     if (SrcOp.isReg())
4292       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4293   }
4294 }
4295 
4296 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4297 ///
4298 /// Depending on the subtarget, load/store with 16-bit element data need to be
4299 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4300 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4301 /// registers.
4302 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
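///
/// A rough sketch of the a16 case (operand layout simplified; not exact MIR):
///   ... = G_INTRINSIC intrinsic(...), ..., %u:_(s16), %v:_(s16), ...
/// becomes
///   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
///   ... = G_AMDGPU_INTRIN_IMAGE_LOAD ..., %uv:_(<2 x s16>), $noreg, ...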
4309 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4310     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4311     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4312 
4313   const unsigned NumDefs = MI.getNumExplicitDefs();
4314   const unsigned ArgOffset = NumDefs + 1;
4315   bool IsTFE = NumDefs == 2;
4316   // We are only processing the operands of d16 image operations on subtargets
4317   // that use the unpacked register layout, or need to repack the TFE result.
4318 
4319   // TODO: Do we need to guard against already legalized intrinsics?
4320   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4321       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4322 
4323   MachineRegisterInfo *MRI = B.getMRI();
4324   const LLT S32 = LLT::scalar(32);
4325   const LLT S16 = LLT::scalar(16);
4326   const LLT V2S16 = LLT::fixed_vector(2, 16);
4327 
4328   unsigned DMask = 0;
4329 
  // Check for 16 bit addresses and pack them if so.
4331   LLT GradTy =
4332       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4333   LLT AddrTy =
4334       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4335   const bool IsG16 = GradTy == S16;
4336   const bool IsA16 = AddrTy == S16;
4337 
4338   int DMaskLanes = 0;
4339   if (!BaseOpcode->Atomic) {
4340     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4341     if (BaseOpcode->Gather4) {
4342       DMaskLanes = 4;
4343     } else if (DMask != 0) {
4344       DMaskLanes = countPopulation(DMask);
4345     } else if (!IsTFE && !BaseOpcode->Store) {
4346       // If dmask is 0, this is a no-op load. This can be eliminated.
4347       B.buildUndef(MI.getOperand(0));
4348       MI.eraseFromParent();
4349       return true;
4350     }
4351   }
4352 
4353   Observer.changingInstr(MI);
4354   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4355 
4356   unsigned NewOpcode = NumDefs == 0 ?
4357     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4358 
4359   // Track that we legalized this
4360   MI.setDesc(B.getTII().get(NewOpcode));
4361 
  // We expect to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4364   if (IsTFE && DMask == 0) {
4365     DMask = 0x1;
4366     DMaskLanes = 1;
4367     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4368   }
4369 
4370   if (BaseOpcode->Atomic) {
4371     Register VData0 = MI.getOperand(2).getReg();
4372     LLT Ty = MRI->getType(VData0);
4373 
4374     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4375     if (Ty.isVector())
4376       return false;
4377 
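    // AtomicX2 operations (e.g. cmpswap) carry two data operands; pack them
    // into a single vector register and clear the second operand.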
4378     if (BaseOpcode->AtomicX2) {
4379       Register VData1 = MI.getOperand(3).getReg();
4380       // The two values are packed in one register.
4381       LLT PackedTy = LLT::fixed_vector(2, Ty);
4382       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4383       MI.getOperand(2).setReg(Concat.getReg(0));
4384       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4385     }
4386   }
4387 
4388   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4389 
4390   // Optimize _L to _LZ when _L is zero
4391   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4392           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4393     const ConstantFP *ConstantLod;
4394 
4395     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4396                  m_GFCst(ConstantLod))) {
4397       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4398         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4399         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4400             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4401                                                       Intr->Dim);
4402 
4403         // The starting indexes should remain in the same place.
4404         --CorrectedNumVAddrs;
4405 
4406         MI.getOperand(MI.getNumExplicitDefs())
4407             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4408         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4409         Intr = NewImageDimIntr;
4410       }
4411     }
4412   }
4413 
4414   // Optimize _mip away, when 'lod' is zero
4415   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4416     int64_t ConstantLod;
4417     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4418                  m_ICst(ConstantLod))) {
4419       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4422         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4423         --CorrectedNumVAddrs;
4424       }
4425     }
4426   }
4427 
4428   // Rewrite the addressing register layout before doing anything else.
4429   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4430     // 16 bit gradients are supported, but are tied to the A16 control
4431     // so both gradients and addresses must be 16 bit
4432     return false;
4433   }
4434 
4435   if (IsA16 && !ST.hasA16()) {
4436     // A16 not supported
4437     return false;
4438   }
4439 
4440   if (IsA16 || IsG16) {
4441     if (Intr->NumVAddrs > 1) {
4442       SmallVector<Register, 4> PackedRegs;
4443 
4444       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4445                                 IsG16);
4446 
4447       // See also below in the non-a16 branch
4448       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4449                           PackedRegs.size() <= ST.getNSAMaxSize();
4450 
4451       if (!UseNSA && PackedRegs.size() > 1) {
4452         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4453         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4454         PackedRegs[0] = Concat.getReg(0);
4455         PackedRegs.resize(1);
4456       }
4457 
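      // Rewrite the vaddr operands in place: the first NumPacked operands take
      // the packed registers and the rest are cleared to $noreg.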
4458       const unsigned NumPacked = PackedRegs.size();
4459       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4460         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4461         if (!SrcOp.isReg()) {
4462           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4463           continue;
4464         }
4465 
4466         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4467 
4468         if (I - Intr->VAddrStart < NumPacked)
4469           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4470         else
4471           SrcOp.setReg(AMDGPU::NoRegister);
4472       }
4473     }
4474   } else {
4475     // If the register allocator cannot place the address registers contiguously
4476     // without introducing moves, then using the non-sequential address encoding
4477     // is always preferable, since it saves VALU instructions and is usually a
4478     // wash in terms of code size or even better.
4479     //
4480     // However, we currently have no way of hinting to the register allocator
4481     // that MIMG addresses should be placed contiguously when it is possible to
4482     // do so, so force non-NSA for the common 2-address case as a heuristic.
4483     //
4484     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4485     // allocation when possible.
4486     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4487                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4488 
4489     if (!UseNSA && Intr->NumVAddrs > 1)
4490       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4491                                Intr->NumVAddrs);
4492   }
4493 
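  // Record the A16/G16 usage as an immediate flags operand:
  // bit 0 = A16, bit 1 = G16.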
4494   int Flags = 0;
4495   if (IsA16)
4496     Flags |= 1;
4497   if (IsG16)
4498     Flags |= 2;
4499   MI.addOperand(MachineOperand::CreateImm(Flags));
4500 
4501   if (BaseOpcode->Store) { // No TFE for stores?
4502     // TODO: Handle dmask trim
4503     Register VData = MI.getOperand(1).getReg();
4504     LLT Ty = MRI->getType(VData);
4505     if (!Ty.isVector() || Ty.getElementType() != S16)
4506       return true;
4507 
4508     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4509     if (RepackedReg != VData) {
4510       MI.getOperand(1).setReg(RepackedReg);
4511     }
4512 
4513     return true;
4514   }
4515 
4516   Register DstReg = MI.getOperand(0).getReg();
4517   LLT Ty = MRI->getType(DstReg);
4518   const LLT EltTy = Ty.getScalarType();
4519   const bool IsD16 = Ty.getScalarType() == S16;
4520   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4521 
4522   // Confirm that the return type is large enough for the dmask specified
4523   if (NumElts < DMaskLanes)
4524     return false;
4525 
4526   if (NumElts > 4 || DMaskLanes > 4)
4527     return false;
4528 
4529   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4530   const LLT AdjustedTy =
4531       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4532 
4533   // The raw dword aligned data component of the load. The only legal cases
4534   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4536   LLT RoundedTy;
4537 
  // S32 vector to cover all data, plus the TFE result element.
4539   LLT TFETy;
4540 
4541   // Register type to use for each loaded component. Will be S32 or V2S16.
4542   LLT RegTy;
4543 
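  // For example (packed D16, no TFE): a load with DMaskLanes == 3 has
  // AdjustedTy = <3 x s16>, which rounds to RoundedTy = <4 x s16>,
  // TFETy = <3 x s32> and RegTy = <2 x s16>.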
4544   if (IsD16 && ST.hasUnpackedD16VMem()) {
4545     RoundedTy =
4546         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4547     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4548     RegTy = S32;
4549   } else {
4550     unsigned EltSize = EltTy.getSizeInBits();
4551     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4552     unsigned RoundedSize = 32 * RoundedElts;
4553     RoundedTy = LLT::scalarOrVector(
4554         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4555     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4556     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4557   }
4558 
4559   // The return type does not need adjustment.
4560   // TODO: Should we change s16 case to s32 or <2 x s16>?
4561   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4562     return true;
4563 
4564   Register Dst1Reg;
4565 
4566   // Insert after the instruction.
4567   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4568 
4569   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4570   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4571   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4572   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4573 
4574   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4575 
4576   MI.getOperand(0).setReg(NewResultReg);
4577 
4578   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4580   // register, with one additional dword beyond the loaded data. Rewrite the
4581   // return type to use a single register result.
4582 
4583   if (IsTFE) {
4584     Dst1Reg = MI.getOperand(1).getReg();
4585     if (MRI->getType(Dst1Reg) != S32)
4586       return false;
4587 
4588     // TODO: Make sure the TFE operand bit is set.
4589     MI.RemoveOperand(1);
4590 
4591     // Handle the easy case that requires no repack instructions.
4592     if (Ty == S32) {
4593       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4594       return true;
4595     }
4596   }
4597 
4598   // Now figure out how to copy the new result register back into the old
4599   // result.
4600   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4601 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4603 
4604   if (ResultNumRegs == 1) {
4605     assert(!IsTFE);
4606     ResultRegs[0] = NewResultReg;
4607   } else {
4608     // We have to repack into a new vector of some kind.
4609     for (int I = 0; I != NumDataRegs; ++I)
4610       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4611     B.buildUnmerge(ResultRegs, NewResultReg);
4612 
4613     // Drop the final TFE element to get the data part. The TFE result is
4614     // directly written to the right place already.
4615     if (IsTFE)
4616       ResultRegs.resize(NumDataRegs);
4617   }
4618 
4619   // For an s16 scalar result, we form an s32 result with a truncate regardless
4620   // of packed vs. unpacked.
4621   if (IsD16 && !Ty.isVector()) {
4622     B.buildTrunc(DstReg, ResultRegs[0]);
4623     return true;
4624   }
4625 
4626   // Avoid a build/concat_vector of 1 entry.
4627   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4628     B.buildBitcast(DstReg, ResultRegs[0]);
4629     return true;
4630   }
4631 
4632   assert(Ty.isVector());
4633 
4634   if (IsD16) {
4635     // For packed D16 results with TFE enabled, all the data components are
4636     // S32. Cast back to the expected type.
4637     //
    // TODO: We don't really need to use s32 load elements. We would only need one
4639     // cast for the TFE result if a multiple of v2s16 was used.
4640     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4641       for (Register &Reg : ResultRegs)
4642         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4643     } else if (ST.hasUnpackedD16VMem()) {
4644       for (Register &Reg : ResultRegs)
4645         Reg = B.buildTrunc(S16, Reg).getReg(0);
4646     }
4647   }
4648 
4649   auto padWithUndef = [&](LLT Ty, int NumElts) {
4650     if (NumElts == 0)
4651       return;
4652     Register Undef = B.buildUndef(Ty).getReg(0);
4653     for (int I = 0; I != NumElts; ++I)
4654       ResultRegs.push_back(Undef);
4655   };
4656 
4657   // Pad out any elements eliminated due to the dmask.
4658   LLT ResTy = MRI->getType(ResultRegs[0]);
4659   if (!ResTy.isVector()) {
4660     padWithUndef(ResTy, NumElts - ResultRegs.size());
4661     B.buildBuildVector(DstReg, ResultRegs);
4662     return true;
4663   }
4664 
4665   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4666   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4667 
4668   // Deal with the one annoying legal case.
4669   const LLT V3S16 = LLT::fixed_vector(3, 16);
4670   if (Ty == V3S16) {
4671     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4672     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4673     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4674     return true;
4675   }
4676 
4677   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4678   B.buildConcatVectors(DstReg, ResultRegs);
4679   return true;
4680 }
4681 
4682 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4683   LegalizerHelper &Helper, MachineInstr &MI) const {
4684   MachineIRBuilder &B = Helper.MIRBuilder;
4685   GISelChangeObserver &Observer = Helper.Observer;
4686 
4687   Register Dst = MI.getOperand(0).getReg();
4688   LLT Ty = B.getMRI()->getType(Dst);
4689   unsigned Size = Ty.getSizeInBits();
4690   MachineFunction &MF = B.getMF();
4691 
4692   Observer.changingInstr(MI);
4693 
4694   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4695     Ty = getBitcastRegisterType(Ty);
4696     Helper.bitcastDst(MI, Ty, 0);
4697     Dst = MI.getOperand(0).getReg();
4698     B.setInsertPt(B.getMBB(), MI);
4699   }
4700 
4701   // FIXME: We don't really need this intermediate instruction. The intrinsic
4702   // should be fixed to have a memory operand. Since it's readnone, we're not
4703   // allowed to add one.
4704   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4705   MI.RemoveOperand(1); // Remove intrinsic ID
4706 
4707   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4708   // TODO: Should this use datalayout alignment?
4709   const unsigned MemSize = (Size + 7) / 8;
4710   const Align MemAlign(4);
4711   MachineMemOperand *MMO = MF.getMachineMemOperand(
4712       MachinePointerInfo(),
4713       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4714           MachineMemOperand::MOInvariant,
4715       MemSize, MemAlign);
4716   MI.addMemOperand(MF, MMO);
4717 
4718   // There are no 96-bit result scalar loads, but widening to 128-bit should
4719   // always be legal. We may need to restore this to a 96-bit result if it turns
4720   // out this needs to be converted to a vector load during RegBankSelect.
4721   if (!isPowerOf2_32(Size)) {
4722     if (Ty.isVector())
4723       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4724     else
4725       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4726   }
4727 
4728   Observer.changedInstr(MI);
4729   return true;
4730 }
4731 
4732 // TODO: Move to selection
4733 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4734                                                 MachineRegisterInfo &MRI,
4735                                                 MachineIRBuilder &B) const {
4736   if (!ST.isTrapHandlerEnabled() ||
4737       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4738     return legalizeTrapEndpgm(MI, MRI, B);
4739 
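  // The V2/V3 HSA trap handler ABI expects the queue pointer to be passed in;
  // with V4 the plain trap lowering is used when the subtarget can provide the
  // doorbell ID itself, otherwise fall back to passing the queue pointer.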
4740   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4741     switch (*HsaAbiVer) {
4742     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4743     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4744       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4745     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4746       return ST.supportsGetDoorbellID() ?
4747           legalizeTrapHsa(MI, MRI, B) :
4748           legalizeTrapHsaQueuePtr(MI, MRI, B);
4749     }
4750   }
4751 
4752   llvm_unreachable("Unknown trap handler");
4753 }
4754 
4755 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4756     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4757   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4758   MI.eraseFromParent();
4759   return true;
4760 }
4761 
4762 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4763     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4764   // Pass queue pointer to trap handler as input, and insert trap instruction
4765   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4766   Register LiveIn =
4767     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4768   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4769     return false;
4770 
4771   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4772   B.buildCopy(SGPR01, LiveIn);
4773   B.buildInstr(AMDGPU::S_TRAP)
4774       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4775       .addReg(SGPR01, RegState::Implicit);
4776 
4777   MI.eraseFromParent();
4778   return true;
4779 }
4780 
4781 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4782     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4783   B.buildInstr(AMDGPU::S_TRAP)
4784       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4785   MI.eraseFromParent();
4786   return true;
4787 }
4788 
4789 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4790     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4793   if (!ST.isTrapHandlerEnabled() ||
4794       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4795     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4796                                      "debugtrap handler not supported",
4797                                      MI.getDebugLoc(), DS_Warning);
4798     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4799     Ctx.diagnose(NoTrap);
4800   } else {
4801     // Insert debug-trap instruction
4802     B.buildInstr(AMDGPU::S_TRAP)
4803         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4804   }
4805 
4806   MI.eraseFromParent();
4807   return true;
4808 }
4809 
4810 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4811                                                MachineIRBuilder &B) const {
4812   MachineRegisterInfo &MRI = *B.getMRI();
4813   const LLT S16 = LLT::scalar(16);
4814   const LLT S32 = LLT::scalar(32);
4815 
4816   Register DstReg = MI.getOperand(0).getReg();
4817   Register NodePtr = MI.getOperand(2).getReg();
4818   Register RayExtent = MI.getOperand(3).getReg();
4819   Register RayOrigin = MI.getOperand(4).getReg();
4820   Register RayDir = MI.getOperand(5).getReg();
4821   Register RayInvDir = MI.getOperand(6).getReg();
4822   Register TDescr = MI.getOperand(7).getReg();
4823 
4824   if (!ST.hasGFX10_AEncoding()) {
4825     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4826                                         "intrinsic not supported on subtarget",
4827                                         MI.getDebugLoc());
4828     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4829     return false;
4830   }
4831 
4832   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4833   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4834   const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4835   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
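  // Opcode table indexed as [UseNSA][IsA16][Is64].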
4836   const unsigned Opcodes[2][2][2] = {
4837       {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
4838         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
4839        {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
4840         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
4841       {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
4842         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
4843        {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
4844         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
4845   const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
4846 
4847   SmallVector<Register, 12> Ops;
4848   if (Is64) {
4849     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4850     Ops.push_back(Unmerge.getReg(0));
4851     Ops.push_back(Unmerge.getReg(1));
4852   } else {
4853     Ops.push_back(NodePtr);
4854   }
4855   Ops.push_back(RayExtent);
4856 
4857   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4858     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4859     Ops.push_back(Unmerge.getReg(0));
4860     Ops.push_back(Unmerge.getReg(1));
4861     Ops.push_back(Unmerge.getReg(2));
4862   };
4863 
4864   packLanes(RayOrigin);
4865   if (IsA16) {
4866     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4867     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4868     Register R1 = MRI.createGenericVirtualRegister(S32);
4869     Register R2 = MRI.createGenericVirtualRegister(S32);
4870     Register R3 = MRI.createGenericVirtualRegister(S32);
4871     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4872     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4873     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4874     Ops.push_back(R1);
4875     Ops.push_back(R2);
4876     Ops.push_back(R3);
4877   } else {
4878     packLanes(RayDir);
4879     packLanes(RayInvDir);
4880   }
4881 
4882   if (!UseNSA) {
    // Build a single vector containing all of the operands prepared so far.
4884     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4885     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4886     Ops.clear();
4887     Ops.push_back(MergedOps);
4888   }
4889 
4890   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4891     .addDef(DstReg)
4892     .addImm(Opcode);
4893 
4894   for (Register R : Ops) {
4895     MIB.addUse(R);
4896   }
4897 
4898   MIB.addUse(TDescr)
4899      .addImm(IsA16 ? 1 : 0)
4900      .cloneMemRefs(MI);
4901 
4902   MI.eraseFromParent();
4903   return true;
4904 }
4905 
4906 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4907                                             MachineInstr &MI) const {
4908   MachineIRBuilder &B = Helper.MIRBuilder;
4909   MachineRegisterInfo &MRI = *B.getMRI();
4910 
  // Replace the G_BRCOND use with exec manipulation and branch pseudos.
4912   auto IntrID = MI.getIntrinsicID();
4913   switch (IntrID) {
4914   case Intrinsic::amdgcn_if:
4915   case Intrinsic::amdgcn_else: {
4916     MachineInstr *Br = nullptr;
4917     MachineBasicBlock *UncondBrTarget = nullptr;
4918     bool Negated = false;
4919     if (MachineInstr *BrCond =
4920             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4921       const SIRegisterInfo *TRI
4922         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4923 
4924       Register Def = MI.getOperand(1).getReg();
4925       Register Use = MI.getOperand(3).getReg();
4926 
4927       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4928 
4929       if (Negated)
4930         std::swap(CondBrTarget, UncondBrTarget);
4931 
4932       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4933       if (IntrID == Intrinsic::amdgcn_if) {
4934         B.buildInstr(AMDGPU::SI_IF)
4935           .addDef(Def)
4936           .addUse(Use)
4937           .addMBB(UncondBrTarget);
4938       } else {
4939         B.buildInstr(AMDGPU::SI_ELSE)
4940             .addDef(Def)
4941             .addUse(Use)
4942             .addMBB(UncondBrTarget);
4943       }
4944 
4945       if (Br) {
4946         Br->getOperand(0).setMBB(CondBrTarget);
4947       } else {
4948         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4949         // since we're swapping branch targets it needs to be reinserted.
4950         // FIXME: IRTranslator should probably not do this
4951         B.buildBr(*CondBrTarget);
4952       }
4953 
4954       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4955       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4956       MI.eraseFromParent();
4957       BrCond->eraseFromParent();
4958       return true;
4959     }
4960 
4961     return false;
4962   }
4963   case Intrinsic::amdgcn_loop: {
4964     MachineInstr *Br = nullptr;
4965     MachineBasicBlock *UncondBrTarget = nullptr;
4966     bool Negated = false;
4967     if (MachineInstr *BrCond =
4968             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4969       const SIRegisterInfo *TRI
4970         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4971 
4972       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4973       Register Reg = MI.getOperand(2).getReg();
4974 
4975       if (Negated)
4976         std::swap(CondBrTarget, UncondBrTarget);
4977 
4978       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4979       B.buildInstr(AMDGPU::SI_LOOP)
4980         .addUse(Reg)
4981         .addMBB(UncondBrTarget);
4982 
4983       if (Br)
4984         Br->getOperand(0).setMBB(CondBrTarget);
4985       else
4986         B.buildBr(*CondBrTarget);
4987 
4988       MI.eraseFromParent();
4989       BrCond->eraseFromParent();
4990       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4991       return true;
4992     }
4993 
4994     return false;
4995   }
4996   case Intrinsic::amdgcn_kernarg_segment_ptr:
4997     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4998       // This only makes sense to call in a kernel, so just lower to null.
4999       B.buildConstant(MI.getOperand(0).getReg(), 0);
5000       MI.eraseFromParent();
5001       return true;
5002     }
5003 
5004     return legalizePreloadedArgIntrin(
5005       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5006   case Intrinsic::amdgcn_implicitarg_ptr:
5007     return legalizeImplicitArgPtr(MI, MRI, B);
5008   case Intrinsic::amdgcn_workitem_id_x:
5009     return legalizePreloadedArgIntrin(MI, MRI, B,
5010                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5011   case Intrinsic::amdgcn_workitem_id_y:
5012     return legalizePreloadedArgIntrin(MI, MRI, B,
5013                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5014   case Intrinsic::amdgcn_workitem_id_z:
5015     return legalizePreloadedArgIntrin(MI, MRI, B,
5016                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5017   case Intrinsic::amdgcn_workgroup_id_x:
5018     return legalizePreloadedArgIntrin(MI, MRI, B,
5019                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5020   case Intrinsic::amdgcn_workgroup_id_y:
5021     return legalizePreloadedArgIntrin(MI, MRI, B,
5022                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5023   case Intrinsic::amdgcn_workgroup_id_z:
5024     return legalizePreloadedArgIntrin(MI, MRI, B,
5025                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5026   case Intrinsic::amdgcn_dispatch_ptr:
5027     return legalizePreloadedArgIntrin(MI, MRI, B,
5028                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5029   case Intrinsic::amdgcn_queue_ptr:
5030     return legalizePreloadedArgIntrin(MI, MRI, B,
5031                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5032   case Intrinsic::amdgcn_implicit_buffer_ptr:
5033     return legalizePreloadedArgIntrin(
5034       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5035   case Intrinsic::amdgcn_dispatch_id:
5036     return legalizePreloadedArgIntrin(MI, MRI, B,
5037                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5038   case Intrinsic::amdgcn_fdiv_fast:
5039     return legalizeFDIVFastIntrin(MI, MRI, B);
5040   case Intrinsic::amdgcn_is_shared:
5041     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5042   case Intrinsic::amdgcn_is_private:
5043     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5044   case Intrinsic::amdgcn_wavefrontsize: {
5045     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5046     MI.eraseFromParent();
5047     return true;
5048   }
5049   case Intrinsic::amdgcn_s_buffer_load:
5050     return legalizeSBufferLoad(Helper, MI);
5051   case Intrinsic::amdgcn_raw_buffer_store:
5052   case Intrinsic::amdgcn_struct_buffer_store:
5053     return legalizeBufferStore(MI, MRI, B, false, false);
5054   case Intrinsic::amdgcn_raw_buffer_store_format:
5055   case Intrinsic::amdgcn_struct_buffer_store_format:
5056     return legalizeBufferStore(MI, MRI, B, false, true);
5057   case Intrinsic::amdgcn_raw_tbuffer_store:
5058   case Intrinsic::amdgcn_struct_tbuffer_store:
5059     return legalizeBufferStore(MI, MRI, B, true, true);
5060   case Intrinsic::amdgcn_raw_buffer_load:
5061   case Intrinsic::amdgcn_struct_buffer_load:
5062     return legalizeBufferLoad(MI, MRI, B, false, false);
5063   case Intrinsic::amdgcn_raw_buffer_load_format:
5064   case Intrinsic::amdgcn_struct_buffer_load_format:
5065     return legalizeBufferLoad(MI, MRI, B, true, false);
5066   case Intrinsic::amdgcn_raw_tbuffer_load:
5067   case Intrinsic::amdgcn_struct_tbuffer_load:
5068     return legalizeBufferLoad(MI, MRI, B, true, true);
5069   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5070   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5071   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5072   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5073   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5074   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5075   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5076   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5077   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5078   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5079   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5080   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5081   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5082   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5083   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5084   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5085   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5086   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5087   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5088   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5089   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5090   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5091   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5092   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5093   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5094   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5095   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5096   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5097   case Intrinsic::amdgcn_buffer_atomic_fadd:
5098   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5099   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5100   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5101   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5102     return legalizeBufferAtomic(MI, B, IntrID);
5103   case Intrinsic::amdgcn_atomic_inc:
5104     return legalizeAtomicIncDec(MI, B, true);
5105   case Intrinsic::amdgcn_atomic_dec:
5106     return legalizeAtomicIncDec(MI, B, false);
5107   case Intrinsic::trap:
5108     return legalizeTrapIntrinsic(MI, MRI, B);
5109   case Intrinsic::debugtrap:
5110     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5111   case Intrinsic::amdgcn_rsq_clamp:
5112     return legalizeRsqClampIntrinsic(MI, MRI, B);
5113   case Intrinsic::amdgcn_ds_fadd:
5114   case Intrinsic::amdgcn_ds_fmin:
5115   case Intrinsic::amdgcn_ds_fmax:
5116     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5117   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5118     return legalizeBVHIntrinsic(MI, B);
5119   default: {
5120     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5121             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5122       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5123     return true;
5124   }
5125   }
5126 
5127   return true;
5128 }
5129