1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
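// Match vectors of 16-bit elements with more than two elements (for example
// <4 x s16>), which we prefer to break down into v2s16 pieces.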
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
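// Mutation that pads a vector with one additional element, e.g.
// <3 x s16> -> <4 x s16>.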
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
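// Mutation that reduces a vector's element count so the resulting type is
// roughly 64 bits wide.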
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
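// Return the register-friendly type to bitcast to: a plain scalar for sizes
// of 32 bits or less, otherwise a scalar or vector built out of 32-bit
// pieces.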
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
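// A legal register size is a multiple of 32 bits and no wider than the
// maximum register size (1024 bits).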
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
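  // Never widen atomic loads.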
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816                     .lowerIf(typeIs(1, S1))
817                     .customFor({{S32, S64}, {S64, S64}});
818   if (ST.has16BitInsts())
819     IToFP.legalFor({{S16, S16}});
820   IToFP.clampScalar(1, S32, S64)
821        .minScalar(0, S32)
822        .scalarize(0)
823        .widenScalarToNextPow2(1);
824 
825   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827     .customFor({{S64, S32}, {S64, S64}})
828     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829   if (ST.has16BitInsts())
830     FPToI.legalFor({{S16, S16}});
831   else
832     FPToI.minScalar(1, S32);
833 
834   FPToI.minScalar(0, S32)
835        .widenScalarToNextPow2(0, 32)
836        .scalarize(0)
837        .lower();
838 
839   // Lower roundeven into G_FRINT
840   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841     .scalarize(0)
842     .lower();
843 
844   if (ST.has16BitInsts()) {
845     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846       .legalFor({S16, S32, S64})
847       .clampScalar(0, S16, S64)
848       .scalarize(0);
849   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851       .legalFor({S32, S64})
852       .clampScalar(0, S32, S64)
853       .scalarize(0);
854   } else {
855     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856       .legalFor({S32})
857       .customFor({S64})
858       .clampScalar(0, S32, S64)
859       .scalarize(0);
860   }
861 
862   getActionDefinitionsBuilder(G_PTR_ADD)
863     .legalIf(all(isPointer(0), sameSize(0, 1)))
864     .scalarize(0)
865     .scalarSameSizeAs(1, 0);
866 
867   getActionDefinitionsBuilder(G_PTRMASK)
868     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869     .scalarSameSizeAs(1, 0)
870     .scalarize(0);
871 
872   auto &CmpBuilder =
873     getActionDefinitionsBuilder(G_ICMP)
874     // The compare output type differs based on the register bank of the output,
875     // so make both s1 and s32 legal.
876     //
877     // Scalar compares producing output in scc will be promoted to s32, as that
878     // is the allocatable register type that will be needed for the copy from
879     // scc. This will be promoted during RegBankSelect, and we assume something
880     // before that won't try to use s32 result types.
881     //
882     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883     // bank.
884     .legalForCartesianProduct(
885       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886     .legalForCartesianProduct(
887       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888   if (ST.has16BitInsts()) {
889     CmpBuilder.legalFor({{S1, S16}});
890   }
891 
892   CmpBuilder
893     .widenScalarToNextPow2(1)
894     .clampScalar(1, S32, S64)
895     .scalarize(0)
896     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
897 
898   getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1},
                              ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900     .widenScalarToNextPow2(1)
901     .clampScalar(1, S32, S64)
902     .scalarize(0);
903 
904   // FIXME: fpow has a selection pattern that should move to custom lowering.
905   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906   if (ST.has16BitInsts())
907     Exp2Ops.legalFor({S32, S16});
908   else
909     Exp2Ops.legalFor({S32});
910   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911   Exp2Ops.scalarize(0);
912 
913   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914   if (ST.has16BitInsts())
915     ExpOps.customFor({{S32}, {S16}});
916   else
917     ExpOps.customFor({S32});
918   ExpOps.clampScalar(0, MinScalarFPTy, S32)
919         .scalarize(0);
920 
921   getActionDefinitionsBuilder(G_FPOWI)
922     .clampScalar(0, MinScalarFPTy, S32)
923     .lower();
924 
925   // The 64-bit versions produce 32-bit results, but only on the SALU.
926   getActionDefinitionsBuilder(G_CTPOP)
927     .legalFor({{S32, S32}, {S32, S64}})
928     .clampScalar(0, S32, S32)
929     .clampScalar(1, S32, S64)
930     .scalarize(0)
931     .widenScalarToNextPow2(0, 32)
932     .widenScalarToNextPow2(1, 32);
933 
934   // The hardware instructions return a different result on 0 than the generic
935   // instructions expect. The hardware produces -1, but these produce the
936   // bitwidth.
937   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938     .scalarize(0)
939     .clampScalar(0, S32, S32)
940     .clampScalar(1, S32, S64)
941     .widenScalarToNextPow2(0, 32)
942     .widenScalarToNextPow2(1, 32)
943     .custom();
944 
945   // The 64-bit versions produce 32-bit results, but only on the SALU.
946   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947     .legalFor({{S32, S32}, {S32, S64}})
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .scalarize(0)
951     .widenScalarToNextPow2(0, 32)
952     .widenScalarToNextPow2(1, 32);
953 
954   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955   // RegBankSelect.
956   getActionDefinitionsBuilder(G_BITREVERSE)
957     .legalFor({S32, S64})
958     .clampScalar(0, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0);
961 
962   if (ST.has16BitInsts()) {
963     getActionDefinitionsBuilder(G_BSWAP)
964       .legalFor({S16, S32, V2S16})
965       .clampMaxNumElements(0, S16, 2)
966       // FIXME: Fixing non-power-of-2 before clamp is workaround for
967       // narrowScalar limitation.
968       .widenScalarToNextPow2(0)
969       .clampScalar(0, S16, S32)
970       .scalarize(0);
971 
972     if (ST.hasVOP3PInsts()) {
973       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974         .legalFor({S32, S16, V2S16})
975         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976         .clampMaxNumElements(0, S16, 2)
977         .minScalar(0, S16)
978         .widenScalarToNextPow2(0)
979         .scalarize(0)
980         .lower();
981     } else {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16})
984         .widenScalarToNextPow2(0)
985         .minScalar(0, S16)
986         .scalarize(0)
987         .lower();
988     }
989   } else {
990     // TODO: Should have same legality without v_perm_b32
991     getActionDefinitionsBuilder(G_BSWAP)
992       .legalFor({S32})
993       .lowerIf(scalarNarrowerThan(0, 32))
994       // FIXME: Fixing non-power-of-2 before clamp is workaround for
995       // narrowScalar limitation.
996       .widenScalarToNextPow2(0)
997       .maxScalar(0, S32)
998       .scalarize(0)
999       .lower();
1000 
1001     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002       .legalFor({S32})
1003       .minScalar(0, S32)
1004       .widenScalarToNextPow2(0)
1005       .scalarize(0)
1006       .lower();
1007   }
1008 
1009   getActionDefinitionsBuilder(G_INTTOPTR)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       })
1020     .narrowScalarIf(largerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       });
1024 
1025   getActionDefinitionsBuilder(G_PTRTOINT)
1026     // List the common cases
1027     .legalForCartesianProduct(AddrSpaces64, {S64})
1028     .legalForCartesianProduct(AddrSpaces32, {S32})
1029     .scalarize(0)
1030     // Accept any address space as long as the size matches
1031     .legalIf(sameSize(0, 1))
1032     .widenScalarIf(smallerThan(0, 1),
1033       [](const LegalityQuery &Query) {
1034         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1035       })
1036     .narrowScalarIf(
1037       largerThan(0, 1),
1038       [](const LegalityQuery &Query) {
1039         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1040       });
1041 
1042   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043     .scalarize(0)
1044     .custom();
1045 
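  // Return true if a load or store must be split into multiple memory
  // operations: vector extloads, accesses wider than the address space
  // supports, awkward register counts (e.g. 3 dwords without dwordx3
  // support), or accesses that are under-aligned for the subtarget.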
1046   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047                                     bool IsLoad) -> bool {
1048     const LLT DstTy = Query.Types[0];
1049 
1050     // Split vector extloads.
1051     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1053 
1054     if (MemSize < DstTy.getSizeInBits())
1055       MemSize = std::max(MemSize, AlignBits);
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     if (AlignBits < MemSize) {
1078       const SITargetLowering *TLI = ST.getTargetLowering();
1079       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080                                                       Align(AlignBits / 8));
1081     }
1082 
1083     return false;
1084   };
1085 
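  // When the subtarget supports unaligned buffer access, use a minimum
  // alignment of 0 so the memory descriptors below impose no extra alignment
  // requirement on global and constant accesses.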
1086   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1089 
1090   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091   // LDS
1092   // TODO: Unsupported flat for SI.
1093 
1094   for (unsigned Op : {G_LOAD, G_STORE}) {
1095     const bool IsStore = Op == G_STORE;
1096 
1097     auto &Actions = getActionDefinitionsBuilder(Op);
1098     // Explicitly list some common cases.
1099     // TODO: Does this help compile time at all?
1100     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103                                       {S64, GlobalPtr, S64, GlobalAlign32},
1104                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106                                       {S32, GlobalPtr, S8, GlobalAlign8},
1107                                       {S32, GlobalPtr, S16, GlobalAlign16},
1108 
1109                                       {S32, LocalPtr, S32, 32},
1110                                       {S64, LocalPtr, S64, 32},
1111                                       {V2S32, LocalPtr, V2S32, 32},
1112                                       {S32, LocalPtr, S8, 8},
1113                                       {S32, LocalPtr, S16, 16},
1114                                       {V2S16, LocalPtr, S32, 32},
1115 
1116                                       {S32, PrivatePtr, S32, 32},
1117                                       {S32, PrivatePtr, S8, 8},
1118                                       {S32, PrivatePtr, S16, 16},
1119                                       {V2S16, PrivatePtr, S32, 32},
1120 
1121                                       {S32, ConstantPtr, S32, GlobalAlign32},
1122                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124                                       {S64, ConstantPtr, S64, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126     Actions.legalIf(
1127       [=](const LegalityQuery &Query) -> bool {
1128         return isLoadStoreLegal(ST, Query);
1129       });
1130 
1131     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132     // 64-bits.
1133     //
1134     // TODO: Should generalize bitcast action into coerce, which will also cover
1135     // inserting addrspacecasts.
1136     Actions.customIf(typeIs(1, Constant32Ptr));
1137 
1138     // Turn any illegal element vectors into something easier to deal
1139     // with. These will ultimately produce 32-bit scalar shifts to extract the
1140     // parts anyway.
1141     //
1142     // For odd 16-bit element vectors, prefer to split those into pieces with
1143     // 16-bit vector parts.
1144     Actions.bitcastIf(
1145       [=](const LegalityQuery &Query) -> bool {
1146         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147                                           Query.MMODescrs[0].MemoryTy);
1148       }, bitcastToRegisterType(0));
1149 
1150     if (!IsStore) {
1151       // Widen suitably aligned loads by loading extra bytes. The standard
1152       // legalization actions can't properly express widening memory operands.
1153       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154         return shouldWidenLoad(ST, Query, G_LOAD);
1155       });
1156     }
1157 
1158     // FIXME: load/store narrowing should be moved to lower action
1159     Actions
1160         .narrowScalarIf(
1161             [=](const LegalityQuery &Query) -> bool {
1162               return !Query.Types[0].isVector() &&
1163                      needToSplitMemOp(Query, Op == G_LOAD);
1164             },
1165             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166               const LLT DstTy = Query.Types[0];
1167               const LLT PtrTy = Query.Types[1];
1168 
1169               const unsigned DstSize = DstTy.getSizeInBits();
1170               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1171 
1172               // Split extloads.
1173               if (DstSize > MemSize)
1174                 return std::make_pair(0, LLT::scalar(MemSize));
1175 
1176               if (!isPowerOf2_32(DstSize)) {
1177                 // We're probably decomposing an odd sized store. Try to split
1178                 // to the widest type. TODO: Account for alignment. As-is it
1179                 // should be OK, since the new parts will be further legalized.
1180                 unsigned FloorSize = PowerOf2Floor(DstSize);
1181                 return std::make_pair(0, LLT::scalar(FloorSize));
1182               }
1183 
1184               if (DstSize > 32 && (DstSize % 32 != 0)) {
1185                 // FIXME: Need a way to specify non-extload of larger size if
1186                 // suitably aligned.
1187                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1188               }
1189 
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193               if (MemSize > MaxSize)
1194                 return std::make_pair(0, LLT::scalar(MaxSize));
1195 
1196               unsigned Align = Query.MMODescrs[0].AlignInBits;
1197               return std::make_pair(0, LLT::scalar(Align));
1198             })
1199         .fewerElementsIf(
1200             [=](const LegalityQuery &Query) -> bool {
1201               return Query.Types[0].isVector() &&
1202                      needToSplitMemOp(Query, Op == G_LOAD);
1203             },
1204             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205               const LLT DstTy = Query.Types[0];
1206               const LLT PtrTy = Query.Types[1];
1207 
1208               LLT EltTy = DstTy.getElementType();
1209               unsigned MaxSize = maxSizeForAddrSpace(ST,
1210                                                      PtrTy.getAddressSpace(),
1211                                                      Op == G_LOAD);
1212 
1213               // FIXME: Handle widened to power of 2 results better. This ends
1214               // up scalarizing.
1215               // FIXME: 3 element stores scalarized on SI
1216 
1217               // Split if it's too large for the address space.
1218               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219               if (MemSize > MaxSize) {
1220                 unsigned NumElts = DstTy.getNumElements();
1221                 unsigned EltSize = EltTy.getSizeInBits();
1222 
1223                 if (MaxSize % EltSize == 0) {
1224                   return std::make_pair(
1225                       0, LLT::scalarOrVector(
1226                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1227                 }
1228 
1229                 unsigned NumPieces = MemSize / MaxSize;
1230 
1231                 // FIXME: Refine when odd breakdowns handled
1232                 // The scalars will need to be re-legalized.
1233                 if (NumPieces == 1 || NumPieces >= NumElts ||
1234                     NumElts % NumPieces != 0)
1235                   return std::make_pair(0, EltTy);
1236 
1237                 return std::make_pair(
1238                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1239               }
1240 
1241               // FIXME: We could probably handle weird extending loads better.
1242               if (DstTy.getSizeInBits() > MemSize)
1243                 return std::make_pair(0, EltTy);
1244 
1245               unsigned EltSize = EltTy.getSizeInBits();
1246               unsigned DstSize = DstTy.getSizeInBits();
1247               if (!isPowerOf2_32(DstSize)) {
1248                 // We're probably decomposing an odd sized store. Try to split
1249                 // to the widest type. TODO: Account for alignment. As-is it
1250                 // should be OK, since the new parts will be further legalized.
1251                 unsigned FloorSize = PowerOf2Floor(DstSize);
1252                 return std::make_pair(
1253                     0, LLT::scalarOrVector(
1254                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1255               }
1256 
1257               // Need to split because of alignment.
1258               unsigned Align = Query.MMODescrs[0].AlignInBits;
1259               if (EltSize > Align &&
1260                   (EltSize / Align < DstTy.getNumElements())) {
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1263               }
1264 
1265               // May need relegalization for the scalars.
1266               return std::make_pair(0, EltTy);
1267             })
1268     .minScalar(0, S32)
1269     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270     .widenScalarToNextPow2(0)
1271     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272     .lower();
1273   }
1274 
1275   // FIXME: Unaligned accesses not lowered.
1276   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278                                                   {S32, GlobalPtr, S16, 2 * 8},
1279                                                   {S32, LocalPtr, S8, 8},
1280                                                   {S32, LocalPtr, S16, 16},
1281                                                   {S32, PrivatePtr, S8, 8},
1282                                                   {S32, PrivatePtr, S16, 16},
1283                                                   {S32, ConstantPtr, S8, 8},
1284                                                   {S32, ConstantPtr, S16, 2 * 8}})
1285                        .legalIf(
1286                          [=](const LegalityQuery &Query) -> bool {
1287                            return isLoadStoreLegal(ST, Query);
1288                          });
1289 
1290   if (ST.hasFlatAddressSpace()) {
1291     ExtLoads.legalForTypesWithMemDesc(
1292         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1293   }
1294 
1295   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296   // 64-bits.
1297   //
1298   // TODO: Should generalize bitcast action into coerce, which will also cover
1299   // inserting addrspacecasts.
1300   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1301 
1302   ExtLoads.clampScalar(0, S32, S32)
1303           .widenScalarToNextPow2(0)
1304           .lower();
1305 
1306   auto &Atomics = getActionDefinitionsBuilder(
1307     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310      G_ATOMICRMW_UMIN})
1311     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312                {S64, GlobalPtr}, {S64, LocalPtr},
1313                {S32, RegionPtr}, {S64, RegionPtr}});
1314   if (ST.hasFlatAddressSpace()) {
1315     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1316   }
1317 
1318   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319   if (ST.hasLDSFPAtomics()) {
1320     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321     if (ST.hasGFX90AInsts())
1322       Atomic.legalFor({{S64, LocalPtr}});
1323   }
1324   if (ST.hasAtomicFaddInsts())
1325     Atomic.legalFor({{S32, GlobalPtr}});
1326 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1329   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331                 {S32, FlatPtr}, {S64, FlatPtr}})
1332     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333                {S32, RegionPtr}, {S64, RegionPtr}});
1334   // TODO: Pointer types, any 32-bit or 64-bit vector
1335 
1336   // Condition should be s32 for scalar, s1 for vector.
1337   getActionDefinitionsBuilder(G_SELECT)
1338       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339                                  LocalPtr, FlatPtr, PrivatePtr,
1340                                  LLT::fixed_vector(2, LocalPtr),
1341                                  LLT::fixed_vector(2, PrivatePtr)},
1342                                 {S1, S32})
1343       .clampScalar(0, S16, S64)
1344       .scalarize(1)
1345       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347       .clampMaxNumElements(0, S32, 2)
1348       .clampMaxNumElements(0, LocalPtr, 2)
1349       .clampMaxNumElements(0, PrivatePtr, 2)
1350       .scalarize(0)
1351       .widenScalarToNextPow2(0)
1352       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1353 
1354   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355   // be more flexible with the shift amount type.
1356   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357     .legalFor({{S32, S32}, {S64, S32}});
1358   if (ST.has16BitInsts()) {
1359     if (ST.hasVOP3PInsts()) {
1360       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361             .clampMaxNumElements(0, S16, 2);
1362     } else
1363       Shifts.legalFor({{S16, S16}});
1364 
1365     // TODO: Support 16-bit shift amounts for all types
1366     Shifts.widenScalarIf(
1367       [=](const LegalityQuery &Query) {
1368         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369         // 32-bit amount.
1370         const LLT ValTy = Query.Types[0];
1371         const LLT AmountTy = Query.Types[1];
1372         return ValTy.getSizeInBits() <= 16 &&
1373                AmountTy.getSizeInBits() < 16;
1374       }, changeTo(1, S16));
1375     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S16, S64);
1378     Shifts.widenScalarToNextPow2(0, 16);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S16)
1382       .scalarize(0)
1383       .lower();
1384   } else {
1385     // Make sure we legalize the shift amount type first, as the general
1386     // expansion for the shifted type will produce much worse code if it hasn't
1387     // been truncated already.
1388     Shifts.clampScalar(1, S32, S32);
1389     Shifts.clampScalar(0, S32, S64);
1390     Shifts.widenScalarToNextPow2(0, 32);
1391 
1392     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393       .minScalar(0, S32)
1394       .scalarize(0)
1395       .lower();
1396   }
1397   Shifts.scalarize(0);
1398 
1399   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402     unsigned IdxTypeIdx = 2;
1403 
1404     getActionDefinitionsBuilder(Op)
1405       .customIf([=](const LegalityQuery &Query) {
1406           const LLT EltTy = Query.Types[EltTypeIdx];
1407           const LLT VecTy = Query.Types[VecTypeIdx];
1408           const LLT IdxTy = Query.Types[IdxTypeIdx];
1409           const unsigned EltSize = EltTy.getSizeInBits();
1410           return (EltSize == 32 || EltSize == 64) &&
1411                   VecTy.getSizeInBits() % 32 == 0 &&
1412                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1413                   IdxTy.getSizeInBits() == 32;
1414         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1417       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418       .bitcastIf(
1419         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420         [=](const LegalityQuery &Query) {
1421           // For > 64-bit element types, try to turn this into a 64-bit
1422           // element vector since we may be able to do better indexing
1423           // if this is scalar. If not, fall back to 32.
1424           const LLT EltTy = Query.Types[EltTypeIdx];
1425           const LLT VecTy = Query.Types[VecTypeIdx];
1426           const unsigned DstEltSize = EltTy.getSizeInBits();
1427           const unsigned VecSize = VecTy.getSizeInBits();
1428 
1429           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430           return std::make_pair(
1431               VecTypeIdx,
1432               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433         })
1434       .clampScalar(EltTypeIdx, S32, S64)
1435       .clampScalar(VecTypeIdx, S32, S64)
1436       .clampScalar(IdxTypeIdx, S32, S32)
1437       .clampMaxNumElements(VecTypeIdx, S32, 32)
1438       // TODO: Clamp elements for 64-bit vectors?
1439       // It should only be necessary with variable indexes.
1440       // As a last resort, lower to the stack
1441       .lower();
1442   }
1443 
1444   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445     .unsupportedIf([=](const LegalityQuery &Query) {
1446         const LLT &EltTy = Query.Types[1].getElementType();
1447         return Query.Types[0] != EltTy;
1448       });
1449 
1450   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453 
1454     // FIXME: Doesn't handle extract of illegal sizes.
1455     getActionDefinitionsBuilder(Op)
1456       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457       // FIXME: Multiples of 16 should not be legal.
1458       .legalIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           const LLT LitTy = Query.Types[LitTyIdx];
1461           return (BigTy.getSizeInBits() % 32 == 0) &&
1462                  (LitTy.getSizeInBits() % 16 == 0);
1463         })
1464       .widenScalarIf(
1465         [=](const LegalityQuery &Query) {
1466           const LLT BigTy = Query.Types[BigTyIdx];
1467           return (BigTy.getScalarSizeInBits() < 16);
1468         },
1469         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470       .widenScalarIf(
1471         [=](const LegalityQuery &Query) {
1472           const LLT LitTy = Query.Types[LitTyIdx];
1473           return (LitTy.getScalarSizeInBits() < 16);
1474         },
1475         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477       .widenScalarToNextPow2(BigTyIdx, 32);
1478 
1479   }
1480 
1481   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482     .legalForCartesianProduct(AllS32Vectors, {S32})
1483     .legalForCartesianProduct(AllS64Vectors, {S64})
1484     .clampNumElements(0, V16S32, V32S32)
1485     .clampNumElements(0, V2S64, V16S64)
1486     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487 
1488   if (ST.hasScalarPackInsts()) {
1489     BuildVector
1490       // FIXME: Should probably widen s1 vectors straight to s32
1491       .minScalarOrElt(0, S16)
1492       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493       .minScalar(1, S32);
1494 
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .legalFor({V2S16, S32})
1497       .lower();
1498     BuildVector.minScalarOrElt(0, S32);
1499   } else {
1500     BuildVector.customFor({V2S16, S16});
1501     BuildVector.minScalarOrElt(0, S32);
1502 
1503     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504       .customFor({V2S16, S32})
1505       .lower();
1506   }
1507 
1508   BuildVector.legalIf(isRegisterType(0));
1509 
1510   // FIXME: Clamp maximum size
1511   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513     .clampMaxNumElements(0, S32, 32)
1514     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515     .clampMaxNumElements(0, S16, 64);
1516 
1517   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1518   // pre-legalize.
1519   if (ST.hasVOP3PInsts()) {
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521       .customFor({V2S16, V2S16})
1522       .lower();
1523   } else
1524     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1525 
1526   // Merge/Unmerge
1527   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1530 
1531     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532       const LLT Ty = Query.Types[TypeIdx];
1533       if (Ty.isVector()) {
1534         const LLT &EltTy = Ty.getElementType();
1535         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536           return true;
1537         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538           return true;
1539       }
1540       return false;
1541     };
1542 
1543     auto &Builder = getActionDefinitionsBuilder(Op)
1544       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545       .lowerFor({{S16, V2S16}})
1546       .lowerIf([=](const LegalityQuery &Query) {
1547           const LLT BigTy = Query.Types[BigTyIdx];
1548           return BigTy.getSizeInBits() == 32;
1549         })
1550       // Try to widen to s16 first for small types.
1551       // TODO: Only do this on targets with legal s16 shifts
1552       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556                            elementTypeIs(1, S16)),
1557                        changeTo(1, V2S16))
1558       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1559       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1560       // valid.
1561       .clampScalar(LitTyIdx, S32, S512)
1562       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563       // Break up vectors with weird elements into scalars
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566         scalarize(0))
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569         scalarize(1))
1570       .clampScalar(BigTyIdx, S32, MaxScalar);
1571 
1572     if (Op == G_MERGE_VALUES) {
1573       Builder.widenScalarIf(
1574         // TODO: Use 16-bit shifts if legal for 8-bit values?
1575         [=](const LegalityQuery &Query) {
1576           const LLT Ty = Query.Types[LitTyIdx];
1577           return Ty.getSizeInBits() < 32;
1578         },
1579         changeTo(LitTyIdx, S32));
1580     }
1581 
1582     Builder.widenScalarIf(
1583       [=](const LegalityQuery &Query) {
1584         const LLT Ty = Query.Types[BigTyIdx];
1585         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586           Ty.getSizeInBits() % 16 != 0;
1587       },
1588       [=](const LegalityQuery &Query) {
1589         // Pick the next power of 2, or a multiple of 64 over 128,
1590         // whichever is smaller.
1591         const LLT &Ty = Query.Types[BigTyIdx];
1592         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593         if (NewSizeInBits >= 256) {
1594           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595           if (RoundedTo < NewSizeInBits)
1596             NewSizeInBits = RoundedTo;
1597         }
1598         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1599       })
1600       // Any vectors left are the wrong size. Scalarize them.
1601       .scalarize(0)
1602       .scalarize(1);
1603   }
1604 
1605   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606   // RegBankSelect.
1607   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608     .legalFor({{S32}, {S64}});
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     SextInReg.lowerFor({{V2S16}})
1612       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613       // get more vector shift opportunities, since we'll get those when
1614       // expanded.
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616   } else if (ST.has16BitInsts()) {
1617     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618   } else {
1619     // Prefer to promote to s32 before lowering if we don't have 16-bit
1620     // shifts. This avoids a lot of intermediate truncate and extend operations.
1621     SextInReg.lowerFor({{S32}, {S64}});
1622   }
1623 
1624   SextInReg
1625     .scalarize(0)
1626     .clampScalar(0, S32, S64)
1627     .lower();
1628 
1629   // TODO: Only try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634     .scalarize(0)
1635     .lower();
1636 
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648 
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651 
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654 
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659 
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666 
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670 
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676 
1677       G_SADDO, G_SSUBO,
1678 
1679        // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681 
1682   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685     .unsupported();
1686 
1687   getLegacyLegalizerInfo().computeTables();
1688   verify(*ST.getInstrInfo());
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692                                          MachineInstr &MI) const {
1693   MachineIRBuilder &B = Helper.MIRBuilder;
1694   MachineRegisterInfo &MRI = *B.getMRI();
1695 
1696   switch (MI.getOpcode()) {
1697   case TargetOpcode::G_ADDRSPACE_CAST:
1698     return legalizeAddrSpaceCast(MI, MRI, B);
1699   case TargetOpcode::G_FRINT:
1700     return legalizeFrint(MI, MRI, B);
1701   case TargetOpcode::G_FCEIL:
1702     return legalizeFceil(MI, MRI, B);
1703   case TargetOpcode::G_FREM:
1704     return legalizeFrem(MI, MRI, B);
1705   case TargetOpcode::G_INTRINSIC_TRUNC:
1706     return legalizeIntrinsicTrunc(MI, MRI, B);
1707   case TargetOpcode::G_SITOFP:
1708     return legalizeITOFP(MI, MRI, B, true);
1709   case TargetOpcode::G_UITOFP:
1710     return legalizeITOFP(MI, MRI, B, false);
1711   case TargetOpcode::G_FPTOSI:
1712     return legalizeFPTOI(MI, MRI, B, true);
1713   case TargetOpcode::G_FPTOUI:
1714     return legalizeFPTOI(MI, MRI, B, false);
1715   case TargetOpcode::G_FMINNUM:
1716   case TargetOpcode::G_FMAXNUM:
1717   case TargetOpcode::G_FMINNUM_IEEE:
1718   case TargetOpcode::G_FMAXNUM_IEEE:
1719     return legalizeMinNumMaxNum(Helper, MI);
1720   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721     return legalizeExtractVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_INSERT_VECTOR_ELT:
1723     return legalizeInsertVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_SHUFFLE_VECTOR:
1725     return legalizeShuffleVector(MI, MRI, B);
1726   case TargetOpcode::G_FSIN:
1727   case TargetOpcode::G_FCOS:
1728     return legalizeSinCos(MI, MRI, B);
1729   case TargetOpcode::G_GLOBAL_VALUE:
1730     return legalizeGlobalValue(MI, MRI, B);
1731   case TargetOpcode::G_LOAD:
1732   case TargetOpcode::G_SEXTLOAD:
1733   case TargetOpcode::G_ZEXTLOAD:
1734     return legalizeLoad(Helper, MI);
1735   case TargetOpcode::G_FMAD:
1736     return legalizeFMad(MI, MRI, B);
1737   case TargetOpcode::G_FDIV:
1738     return legalizeFDIV(MI, MRI, B);
1739   case TargetOpcode::G_UDIV:
1740   case TargetOpcode::G_UREM:
1741   case TargetOpcode::G_UDIVREM:
1742     return legalizeUnsignedDIV_REM(MI, MRI, B);
1743   case TargetOpcode::G_SDIV:
1744   case TargetOpcode::G_SREM:
1745   case TargetOpcode::G_SDIVREM:
1746     return legalizeSignedDIV_REM(MI, MRI, B);
1747   case TargetOpcode::G_ATOMIC_CMPXCHG:
1748     return legalizeAtomicCmpXChg(MI, MRI, B);
1749   case TargetOpcode::G_FLOG:
1750     return legalizeFlog(MI, B, numbers::ln2f);
1751   case TargetOpcode::G_FLOG10:
1752     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753   case TargetOpcode::G_FEXP:
1754     return legalizeFExp(MI, B);
1755   case TargetOpcode::G_FPOW:
1756     return legalizeFPow(MI, B);
1757   case TargetOpcode::G_FFLOOR:
1758     return legalizeFFloor(MI, MRI, B);
1759   case TargetOpcode::G_BUILD_VECTOR:
1760     return legalizeBuildVector(MI, MRI, B);
1761   case TargetOpcode::G_CTLZ:
1762   case TargetOpcode::G_CTTZ:
1763     return legalizeCTLZ_CTTZ(MI, MRI, B);
1764   default:
1765     return false;
1766   }
1767 
1768   llvm_unreachable("expected switch to return");
1769 }
1770 
1771 Register AMDGPULegalizerInfo::getSegmentAperture(
1772   unsigned AS,
1773   MachineRegisterInfo &MRI,
1774   MachineIRBuilder &B) const {
1775   MachineFunction &MF = B.getMF();
1776   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1777   const LLT S32 = LLT::scalar(32);
1778 
1779   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1780 
1781   if (ST.hasApertureRegs()) {
1782     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1783     // getreg.
1784     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1786         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1787     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1789         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1790     unsigned Encoding =
1791         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1792         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1793         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1794 
1795     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1796 
1797     B.buildInstr(AMDGPU::S_GETREG_B32)
1798       .addDef(GetReg)
1799       .addImm(Encoding);
1800     MRI.setType(GetReg, S32);
1801 
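    // The 16-bit getreg field holds the upper half of the 32-bit aperture base,
    // so shift it back into position; the result is used as the high 32 bits of
    // a 64-bit flat address.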
1802     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1803     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1804   }
1805 
1806   Register QueuePtr = MRI.createGenericVirtualRegister(
1807     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1808 
1809   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1810     return Register();
1811 
1812   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1813   // private_segment_aperture_base_hi.
1814   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1815 
1816   // TODO: can we be smarter about machine pointer info?
1817   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1818   MachineMemOperand *MMO = MF.getMachineMemOperand(
1819       PtrInfo,
1820       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1821           MachineMemOperand::MOInvariant,
1822       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1823 
1824   Register LoadAddr;
1825 
1826   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1827   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   MachineFunction &MF = B.getMF();
1834 
1835   const LLT S32 = LLT::scalar(32);
1836   Register Dst = MI.getOperand(0).getReg();
1837   Register Src = MI.getOperand(1).getReg();
1838 
1839   LLT DstTy = MRI.getType(Dst);
1840   LLT SrcTy = MRI.getType(Src);
1841   unsigned DestAS = DstTy.getAddressSpace();
1842   unsigned SrcAS = SrcTy.getAddressSpace();
1843 
1844   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1845   // vector element.
1846   assert(!DstTy.isVector());
1847 
1848   const AMDGPUTargetMachine &TM
1849     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1850 
1851   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1852     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1853     return true;
1854   }
1855 
1856   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1857     // Truncate.
1858     B.buildExtract(Dst, Src, 0);
1859     MI.eraseFromParent();
1860     return true;
1861   }
1862 
1863   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1865     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1866 
1867     // FIXME: This is a bit ugly since we are merging two pointer values into
1868     // another pointer type. Merge operands are required to be the same type,
1869     // but creating an extra ptrtoint would be kind of pointless.
1870     auto HighAddr = B.buildConstant(
1871       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1872     B.buildMerge(Dst, {Src, HighAddr});
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1878     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1879            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1880     unsigned NullVal = TM.getNullPointerValue(DestAS);
1881 
1882     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1883     auto FlatNull = B.buildConstant(SrcTy, 0);
1884 
1885     // Extract low 32-bits of the pointer.
1886     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1887 
1888     auto CmpRes =
1889         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1890     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1891 
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895 
1896   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1897     return false;
1898 
1899   if (!ST.hasFlatAddressSpace())
1900     return false;
1901 
1902   auto SegmentNull =
1903       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1904   auto FlatNull =
1905       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1906 
1907   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908   if (!ApertureReg.isValid())
1909     return false;
1910 
1911   auto CmpRes =
1912       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1913 
1914   // Coerce the type of the low half of the result so we can use merge_values.
1915   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1916 
1917   // TODO: Should we allow mismatched types but matching sizes in merges to
1918   // avoid the ptrtoint?
1919   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1920   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1921 
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeFrint(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register Src = MI.getOperand(1).getReg();
1930   LLT Ty = MRI.getType(Src);
1931   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1932 
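  // Rounding trick: adding a copysign'd 2^52 and subtracting it again pushes
  // the fraction bits out of the double, leaving the value rounded to the
  // nearest integer. C2 (just below 2^52) detects inputs whose magnitude is
  // already too large to have a fractional part; those are returned unchanged.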
1933   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1935 
1936   auto C1 = B.buildFConstant(Ty, C1Val);
1937   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1938 
1939   // TODO: Should this propagate fast-math-flags?
1940   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1942 
1943   auto C2 = B.buildFConstant(Ty, C2Val);
1944   auto Fabs = B.buildFAbs(Ty, Src);
1945 
1946   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948   MI.eraseFromParent();
1949   return true;
1950 }
1951 
1952 bool AMDGPULegalizerInfo::legalizeFceil(
1953   MachineInstr &MI, MachineRegisterInfo &MRI,
1954   MachineIRBuilder &B) const {
1955 
1956   const LLT S1 = LLT::scalar(1);
1957   const LLT S64 = LLT::scalar(64);
1958 
1959   Register Src = MI.getOperand(1).getReg();
1960   assert(MRI.getType(Src) == S64);
1961 
1962   // result = trunc(src)
1963   // if (src > 0.0 && src != result)
1964   //   result += 1.0
1965 
1966   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1967 
1968   const auto Zero = B.buildFConstant(S64, 0.0);
1969   const auto One = B.buildFConstant(S64, 1.0);
1970   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1973   auto Add = B.buildSelect(S64, And, One, Zero);
1974 
1975   // TODO: Should this propagate fast-math-flags?
1976   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeFrem(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   MachineIRBuilder &B) const {
1983     Register DstReg = MI.getOperand(0).getReg();
1984     Register Src0Reg = MI.getOperand(1).getReg();
1985     Register Src1Reg = MI.getOperand(2).getReg();
1986     auto Flags = MI.getFlags();
1987     LLT Ty = MRI.getType(DstReg);
1988 
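    // frem(x, y) is expanded as x - trunc(x / y) * y, computed below as
    // fma(-trunc(x / y), y, x).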
1989     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993     MI.eraseFromParent();
1994     return true;
1995 }
1996 
1997 static MachineInstrBuilder extractF64Exponent(Register Hi,
1998                                               MachineIRBuilder &B) {
1999   const unsigned FractBits = 52;
2000   const unsigned ExpBits = 11;
2001   LLT S32 = LLT::scalar(32);
2002 
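  // The exponent field of an f64 occupies bits [62:52], i.e. bits [30:20] of
  // the high 32-bit word; extract it with ubfe and subtract the bias (1023).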
2003   auto Const0 = B.buildConstant(S32, FractBits - 32);
2004   auto Const1 = B.buildConstant(S32, ExpBits);
2005 
2006   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2007     .addUse(Hi)
2008     .addUse(Const0.getReg(0))
2009     .addUse(Const1.getReg(0));
2010 
2011   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   const LLT S1 = LLT::scalar(1);
2018   const LLT S32 = LLT::scalar(32);
2019   const LLT S64 = LLT::scalar(64);
2020 
2021   Register Src = MI.getOperand(1).getReg();
2022   assert(MRI.getType(Src) == S64);
2023 
2024   // TODO: Should this use extract since the low half is unused?
2025   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2026   Register Hi = Unmerge.getReg(1);
2027 
2028   // Extract the upper half, since this is where we will find the sign and
2029   // exponent.
2030   auto Exp = extractF64Exponent(Hi, B);
2031 
2032   const unsigned FractBits = 52;
2033 
2034   // Extract the sign bit.
2035   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2036   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2037 
2038   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2039 
2040   const auto Zero32 = B.buildConstant(S32, 0);
2041 
2042   // Extend back to 64-bits.
2043   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2044 
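  // For an exponent E in [0, 51], the low (52 - E) fraction bits lie below the
  // binary point and are cleared to truncate. Exponents below 0 truncate to a
  // signed zero, and values with an exponent above 51 are already integral.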
2045   auto Shr = B.buildAShr(S64, FractMask, Exp);
2046   auto Not = B.buildNot(S64, Shr);
2047   auto Tmp0 = B.buildAnd(S64, Src, Not);
2048   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2049 
2050   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2051   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2052 
2053   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2054   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2055   MI.eraseFromParent();
2056   return true;
2057 }
2058 
2059 bool AMDGPULegalizerInfo::legalizeITOFP(
2060   MachineInstr &MI, MachineRegisterInfo &MRI,
2061   MachineIRBuilder &B, bool Signed) const {
2062 
2063   Register Dst = MI.getOperand(0).getReg();
2064   Register Src = MI.getOperand(1).getReg();
2065 
2066   const LLT S64 = LLT::scalar(64);
2067   const LLT S32 = LLT::scalar(32);
2068 
2069   assert(MRI.getType(Src) == S64);
2070 
2071   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2072   auto ThirtyTwo = B.buildConstant(S32, 32);
2073 
2074   if (MRI.getType(Dst) == S64) {
2075     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2076                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2077 
2078     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2079     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080                      .addUse(CvtHi.getReg(0))
2081                      .addUse(ThirtyTwo.getReg(0));
2082 
2083     // TODO: Should this propagate fast-math-flags?
2084     B.buildFAdd(Dst, LdExp, CvtLo);
2085     MI.eraseFromParent();
2086     return true;
2087   }
2088 
2089   assert(MRI.getType(Dst) == S32);
2090 
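  // Converting a 64-bit integer to f32: normalize the value so its significant
  // bits land in the high 32 bits, fold any nonzero low bits into a sticky bit
  // so rounding stays correct, convert the high word, then rescale the result
  // with ldexp.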
2091   auto One = B.buildConstant(S32, 1);
2092 
2093   MachineInstrBuilder ShAmt;
2094   if (Signed) {
2095     auto ThirtyOne = B.buildConstant(S32, 31);
2096     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2097     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2098     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2099     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2100                                /*HasSideEffects=*/false)
2101                   .addUse(Unmerge.getReg(1));
2102     auto LS2 = B.buildSub(S32, LS, One);
2103     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2104   } else
2105     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2106   auto Norm = B.buildShl(S64, Src, ShAmt);
2107   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2108   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2109   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2110   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2111   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2112   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2113                    /*HasSideEffects=*/false)
2114       .addUse(FVal.getReg(0))
2115       .addUse(Scale.getReg(0));
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119 
2120 // TODO: Copied from DAG implementation. Verify logic and document how this
2121 // actually works.
2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2123                                         MachineRegisterInfo &MRI,
2124                                         MachineIRBuilder &B,
2125                                         bool Signed) const {
2126 
2127   Register Dst = MI.getOperand(0).getReg();
2128   Register Src = MI.getOperand(1).getReg();
2129 
2130   const LLT S64 = LLT::scalar(64);
2131   const LLT S32 = LLT::scalar(32);
2132 
2133   const LLT SrcLT = MRI.getType(Src);
2134   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2135 
2136   unsigned Flags = MI.getFlags();
2137 
2138   // The basic idea of converting a floating point number into a pair of 32-bit
2139   // integers is illustrated as follows:
2140   //
2141   //     tf := trunc(val);
2142   //    hif := floor(tf * 2^-32);
2143   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2144   //     hi := fptoi(hif);
2145   //     lo := fptoi(lof);
2146   //
2147   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2148   MachineInstrBuilder Sign;
2149   if (Signed && SrcLT == S32) {
2150     // However, a 32-bit floating point number has only a 23-bit mantissa and
2151     // that is not enough to hold all the significant bits of `lof` if val is
2152     // negative. To avoid the loss of precision, we need to take the absolute
2153     // value after truncating and flip the result back based on the original
2154     // signedness.
2155     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2156     Trunc = B.buildFAbs(S32, Trunc, Flags);
2157   }
2158   MachineInstrBuilder K0, K1;
2159   if (SrcLT == S64) {
2160     K0 = B.buildFConstant(S64,
2161                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2162     K1 = B.buildFConstant(S64,
2163                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2164   } else {
2165     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2166     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2167   }
2168 
2169   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2170   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2171   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2172 
2173   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2174                                      : B.buildFPTOUI(S32, FloorMul);
2175   auto Lo = B.buildFPTOUI(S32, Fma);
2176 
2177   if (Signed && SrcLT == S32) {
2178     // Flip the result based on the signedness, which is either all 0s or 1s.
2179     Sign = B.buildMerge(S64, {Sign, Sign});
2180     // r := xor({lo, hi}, sign) - sign;
2181     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2182   } else
2183     B.buildMerge(Dst, {Lo, Hi});
2184   MI.eraseFromParent();
2185 
2186   return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2190                                                MachineInstr &MI) const {
2191   MachineFunction &MF = Helper.MIRBuilder.getMF();
2192   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2193 
2194   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2195                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2196 
2197   // With ieee_mode disabled, the instructions have the correct behavior
2198   // already for G_FMINNUM/G_FMAXNUM
2199   if (!MFI->getMode().IEEE)
2200     return !IsIEEEOp;
2201 
2202   if (IsIEEEOp)
2203     return true;
2204 
2205   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2206 }
2207 
2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2209   MachineInstr &MI, MachineRegisterInfo &MRI,
2210   MachineIRBuilder &B) const {
2211   // TODO: Should move some of this into LegalizerHelper.
2212 
2213   // TODO: Promote dynamic indexing of s16 to s32
2214 
2215   // FIXME: Artifact combiner probably should have replaced the truncated
2216   // constant before this, so we shouldn't need
2217   // getConstantVRegValWithLookThrough.
2218   Optional<ValueAndVReg> MaybeIdxVal =
2219       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2220   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221     return true;
2222   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2223 
2224   Register Dst = MI.getOperand(0).getReg();
2225   Register Vec = MI.getOperand(1).getReg();
2226 
2227   LLT VecTy = MRI.getType(Vec);
2228   LLT EltTy = VecTy.getElementType();
2229   assert(EltTy == MRI.getType(Dst));
2230 
2231   if (IdxVal < VecTy.getNumElements())
2232     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2233   else
2234     B.buildUndef(Dst);
2235 
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2241   MachineInstr &MI, MachineRegisterInfo &MRI,
2242   MachineIRBuilder &B) const {
2243   // TODO: Should move some of this into LegalizerHelper.
2244 
2245   // TODO: Promote dynamic indexing of s16 to s32
2246 
2247   // FIXME: Artifact combiner probably should have replaced the truncated
2248   // constant before this, so we shouldn't need
2249   // getConstantVRegValWithLookThrough.
2250   Optional<ValueAndVReg> MaybeIdxVal =
2251       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2252   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2253     return true;
2254 
2255   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256   Register Dst = MI.getOperand(0).getReg();
2257   Register Vec = MI.getOperand(1).getReg();
2258   Register Ins = MI.getOperand(2).getReg();
2259 
2260   LLT VecTy = MRI.getType(Vec);
2261   LLT EltTy = VecTy.getElementType();
2262   assert(EltTy == MRI.getType(Ins));
2263 
2264   if (IdxVal < VecTy.getNumElements())
2265     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2266   else
2267     B.buildUndef(Dst);
2268 
2269   MI.eraseFromParent();
2270   return true;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2274   MachineInstr &MI, MachineRegisterInfo &MRI,
2275   MachineIRBuilder &B) const {
2276   const LLT V2S16 = LLT::fixed_vector(2, 16);
2277 
2278   Register Dst = MI.getOperand(0).getReg();
2279   Register Src0 = MI.getOperand(1).getReg();
2280   LLT DstTy = MRI.getType(Dst);
2281   LLT SrcTy = MRI.getType(Src0);
2282 
2283   if (SrcTy == V2S16 && DstTy == V2S16 &&
2284       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2285     return true;
2286 
2287   MachineIRBuilder HelperBuilder(MI);
2288   GISelObserverWrapper DummyObserver;
2289   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2290   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeSinCos(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296 
2297   Register DstReg = MI.getOperand(0).getReg();
2298   Register SrcReg = MI.getOperand(1).getReg();
2299   LLT Ty = MRI.getType(DstReg);
2300   unsigned Flags = MI.getFlags();
2301 
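  // The hardware sin/cos intrinsics take their input as a fraction of 2*pi, so
  // scale by 1/(2*pi) first. On subtargets where the valid input range is
  // reduced, the scaled value is additionally wrapped into [0, 1) with fract.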
2302   Register TrigVal;
2303   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2304   if (ST.hasTrigReducedRange()) {
2305     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2306     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2307       .addUse(MulVal.getReg(0))
2308       .setMIFlags(Flags).getReg(0);
2309   } else
2310     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2311 
2312   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2313     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2314   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2315     .addUse(TrigVal)
2316     .setMIFlags(Flags);
2317   MI.eraseFromParent();
2318   return true;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2322                                                   MachineIRBuilder &B,
2323                                                   const GlobalValue *GV,
2324                                                   int64_t Offset,
2325                                                   unsigned GAFlags) const {
2326   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2327   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2328   // to the following code sequence:
2329   //
2330   // For constant address space:
2331   //   s_getpc_b64 s[0:1]
2332   //   s_add_u32 s0, s0, $symbol
2333   //   s_addc_u32 s1, s1, 0
2334   //
2335   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2336   //   a fixup or relocation is emitted to replace $symbol with a literal
2337   //   constant, which is a pc-relative offset from the encoding of the $symbol
2338   //   operand to the global variable.
2339   //
2340   // For global address space:
2341   //   s_getpc_b64 s[0:1]
2342   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2343   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2344   //
2345   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346   //   fixups or relocations are emitted to replace $symbol@*@lo and
2347   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2348   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2349   //   operand to the global variable.
2350   //
2351   // What we want here is an offset from the value returned by s_getpc
2352   // (which is the address of the s_add_u32 instruction) to the global
2353   // variable, but since the encoding of $symbol starts 4 bytes after the start
2354   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2355   // small. This requires us to add 4 to the global variable offset in order to
2356   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2357   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2358   // instruction.
2359 
2360   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2361 
2362   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2363     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2364 
2365   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2366     .addDef(PCReg);
2367 
2368   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2369   if (GAFlags == SIInstrInfo::MO_NONE)
2370     MIB.addImm(0);
2371   else
2372     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2373 
2374   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2375 
2376   if (PtrTy.getSizeInBits() == 32)
2377     B.buildExtract(DstReg, PCReg, 0);
2378   return true;
2379 }
2380 
2381 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2382   MachineInstr &MI, MachineRegisterInfo &MRI,
2383   MachineIRBuilder &B) const {
2384   Register DstReg = MI.getOperand(0).getReg();
2385   LLT Ty = MRI.getType(DstReg);
2386   unsigned AS = Ty.getAddressSpace();
2387 
2388   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2389   MachineFunction &MF = B.getMF();
2390   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2391 
2392   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2393     if (!MFI->isModuleEntryFunction() &&
2394         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2395       const Function &Fn = MF.getFunction();
2396       DiagnosticInfoUnsupported BadLDSDecl(
2397         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2398         DS_Warning);
2399       Fn.getContext().diagnose(BadLDSDecl);
2400 
2401       // We currently don't have a way to correctly allocate LDS objects that
2402       // aren't directly associated with a kernel. We do force inlining of
2403       // functions that use local objects. However, if these dead functions are
2404       // not eliminated, we don't want a compile time error. Just emit a warning
2405       // and a trap, since there should be no callable path here.
2406       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2407       B.buildUndef(DstReg);
2408       MI.eraseFromParent();
2409       return true;
2410     }
2411 
2412     // TODO: We could emit code to handle the initialization somewhere.
2413     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2414       const SITargetLowering *TLI = ST.getTargetLowering();
2415       if (!TLI->shouldUseLDSConstAddress(GV)) {
2416         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2417         return true; // Leave in place;
2418       }
2419 
2420       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2421         Type *Ty = GV->getValueType();
2422         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2423         // zero-sized type in other languages to declare dynamic shared
2424         // memory whose size is not known at compile time. Such arrays are
2425         // allocated by the runtime and placed directly after the statically
2426         // allocated ones, so they all share the same offset.
2427         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2428           // Adjust alignment for that dynamic shared memory array.
2429           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2430           LLT S32 = LLT::scalar(32);
2431           auto Sz =
2432               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2433           B.buildIntToPtr(DstReg, Sz);
2434           MI.eraseFromParent();
2435           return true;
2436         }
2437       }
2438 
2439       B.buildConstant(
2440           DstReg,
2441           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2442       MI.eraseFromParent();
2443       return true;
2444     }
2445 
2446     const Function &Fn = MF.getFunction();
2447     DiagnosticInfoUnsupported BadInit(
2448       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2449     Fn.getContext().diagnose(BadInit);
2450     return true;
2451   }
2452 
2453   const SITargetLowering *TLI = ST.getTargetLowering();
2454 
2455   if (TLI->shouldEmitFixup(GV)) {
2456     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2457     MI.eraseFromParent();
2458     return true;
2459   }
2460 
2461   if (TLI->shouldEmitPCReloc(GV)) {
2462     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2463     MI.eraseFromParent();
2464     return true;
2465   }
2466 
2467   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2468   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2469 
2470   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2471   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2472       MachinePointerInfo::getGOT(MF),
2473       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2474           MachineMemOperand::MOInvariant,
2475       LoadTy, Align(8));
2476 
2477   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2478 
2479   if (Ty.getSizeInBits() == 32) {
2480     // Truncate if this is a 32-bit constant address.
2481     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2482     B.buildExtract(DstReg, Load, 0);
2483   } else
2484     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2485 
2486   MI.eraseFromParent();
2487   return true;
2488 }
2489 
2490 static LLT widenToNextPowerOf2(LLT Ty) {
2491   if (Ty.isVector())
2492     return Ty.changeElementCount(
2493         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2494   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2495 }
2496 
2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2498                                        MachineInstr &MI) const {
2499   MachineIRBuilder &B = Helper.MIRBuilder;
2500   MachineRegisterInfo &MRI = *B.getMRI();
2501   GISelChangeObserver &Observer = Helper.Observer;
2502 
2503   Register PtrReg = MI.getOperand(1).getReg();
2504   LLT PtrTy = MRI.getType(PtrReg);
2505   unsigned AddrSpace = PtrTy.getAddressSpace();
2506 
2507   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2508     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2509     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2510     Observer.changingInstr(MI);
2511     MI.getOperand(1).setReg(Cast.getReg(0));
2512     Observer.changedInstr(MI);
2513     return true;
2514   }
2515 
2516   if (MI.getOpcode() != AMDGPU::G_LOAD)
2517     return false;
2518 
2519   Register ValReg = MI.getOperand(0).getReg();
2520   LLT ValTy = MRI.getType(ValReg);
2521 
2522   MachineMemOperand *MMO = *MI.memoperands_begin();
2523   const unsigned ValSize = ValTy.getSizeInBits();
2524   const LLT MemTy = MMO->getMemoryType();
2525   const Align MemAlign = MMO->getAlign();
2526   const unsigned MemSize = MemTy.getSizeInBits();
2527   const unsigned AlignInBits = 8 * MemAlign.value();
2528 
2529   // Widen non-power-of-2 loads to the alignment if needed
2530   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2531     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2532 
2533     // This was already the correct extending load result type, so just adjust
2534     // the memory type.
2535     if (WideMemSize == ValSize) {
2536       MachineFunction &MF = B.getMF();
2537 
2538       MachineMemOperand *WideMMO =
2539           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2540       Observer.changingInstr(MI);
2541       MI.setMemRefs(MF, {WideMMO});
2542       Observer.changedInstr(MI);
2543       return true;
2544     }
2545 
2546     // Don't bother handling edge case that should probably never be produced.
2547     if (ValSize > WideMemSize)
2548       return false;
2549 
2550     LLT WideTy = widenToNextPowerOf2(ValTy);
2551 
2552     Register WideLoad;
2553     if (!WideTy.isVector()) {
2554       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2555       B.buildTrunc(ValReg, WideLoad);
2556     } else {
2557       // Extract the subvector.
2558 
2559       if (isRegisterType(ValTy)) {
2560         // If this is a case where G_EXTRACT is legal, use it.
2561         // (e.g. <3 x s32> -> <4 x s32>)
2562         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2563         B.buildExtract(ValReg, WideLoad, 0);
2564       } else {
2565         // For cases where the widened type isn't a nice register value, unmerge
2566         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2567         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2568         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2569         B.setInsertPt(B.getMBB(), MI.getIterator());
2570         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2571       }
2572     }
2573 
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577 
2578   return false;
2579 }
2580 
2581 bool AMDGPULegalizerInfo::legalizeFMad(
2582   MachineInstr &MI, MachineRegisterInfo &MRI,
2583   MachineIRBuilder &B) const {
2584   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2585   assert(Ty.isScalar());
2586 
2587   MachineFunction &MF = B.getMF();
2588   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2589 
2590   // TODO: Always legal with future ftz flag.
2591   // FIXME: Do we need just output?
2592   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2593     return true;
2594   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2595     return true;
2596 
2597   MachineIRBuilder HelperBuilder(MI);
2598   GISelObserverWrapper DummyObserver;
2599   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2600   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2601 }
2602 
2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2604   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2605   Register DstReg = MI.getOperand(0).getReg();
2606   Register PtrReg = MI.getOperand(1).getReg();
2607   Register CmpVal = MI.getOperand(2).getReg();
2608   Register NewVal = MI.getOperand(3).getReg();
2609 
2610   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2611          "this should not have been custom lowered");
2612 
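  // G_AMDGPU_ATOMIC_CMPXCHG takes its data operand as a packed <2 x T> vector
  // with the new value in the low element and the compare value in the high
  // element, which is the operand order the hardware cmpswap instructions use.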
2613   LLT ValTy = MRI.getType(CmpVal);
2614   LLT VecTy = LLT::fixed_vector(2, ValTy);
2615 
2616   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2617 
2618   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2619     .addDef(DstReg)
2620     .addUse(PtrReg)
2621     .addUse(PackedVal)
2622     .setMemRefs(MI.memoperands());
2623 
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 bool AMDGPULegalizerInfo::legalizeFlog(
2629   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2630   Register Dst = MI.getOperand(0).getReg();
2631   Register Src = MI.getOperand(1).getReg();
2632   LLT Ty = B.getMRI()->getType(Dst);
2633   unsigned Flags = MI.getFlags();
2634 
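  // log_b(x) = log2(x) * (1 / log2(b)) = log2(x) * (ln(2) / ln(b)); callers
  // pass ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted.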
2635   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2636   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2637 
2638   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2639   MI.eraseFromParent();
2640   return true;
2641 }
2642 
2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2644                                        MachineIRBuilder &B) const {
2645   Register Dst = MI.getOperand(0).getReg();
2646   Register Src = MI.getOperand(1).getReg();
2647   unsigned Flags = MI.getFlags();
2648   LLT Ty = B.getMRI()->getType(Dst);
2649 
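  // exp(x) = exp2(x * log2(e)).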
2650   auto K = B.buildFConstant(Ty, numbers::log2e);
2651   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2652   B.buildFExp2(Dst, Mul, Flags);
2653   MI.eraseFromParent();
2654   return true;
2655 }
2656 
2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2658                                        MachineIRBuilder &B) const {
2659   Register Dst = MI.getOperand(0).getReg();
2660   Register Src0 = MI.getOperand(1).getReg();
2661   Register Src1 = MI.getOperand(2).getReg();
2662   unsigned Flags = MI.getFlags();
2663   LLT Ty = B.getMRI()->getType(Dst);
2664   const LLT S16 = LLT::scalar(16);
2665   const LLT S32 = LLT::scalar(32);
2666 
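  // Expand pow(x, y) as exp2(log2(x) * y). The multiply uses fmul_legacy,
  // presumably to match the DAG lowering's treatment of zero and infinity.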
2667   if (Ty == S32) {
2668     auto Log = B.buildFLog2(S32, Src0, Flags);
2669     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2670       .addUse(Log.getReg(0))
2671       .addUse(Src1)
2672       .setMIFlags(Flags);
2673     B.buildFExp2(Dst, Mul, Flags);
2674   } else if (Ty == S16) {
2675     // There's no f16 fmul_legacy, so we need to convert for it.
2676     auto Log = B.buildFLog2(S16, Src0, Flags);
2677     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2678     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2679     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680       .addUse(Ext0.getReg(0))
2681       .addUse(Ext1.getReg(0))
2682       .setMIFlags(Flags);
2683 
2684     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2685   } else
2686     return false;
2687 
2688   MI.eraseFromParent();
2689   return true;
2690 }
2691 
2692 // Find a source register, ignoring any possible source modifiers.
2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2694   Register ModSrc = OrigSrc;
2695   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2696     ModSrc = SrcFNeg->getOperand(1).getReg();
2697     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2698       ModSrc = SrcFAbs->getOperand(1).getReg();
2699   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2700     ModSrc = SrcFAbs->getOperand(1).getReg();
2701   return ModSrc;
2702 }
2703 
2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2705                                          MachineRegisterInfo &MRI,
2706                                          MachineIRBuilder &B) const {
2707 
2708   const LLT S1 = LLT::scalar(1);
2709   const LLT S64 = LLT::scalar(64);
2710   Register Dst = MI.getOperand(0).getReg();
2711   Register OrigSrc = MI.getOperand(1).getReg();
2712   unsigned Flags = MI.getFlags();
2713   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2714          "this should not have been custom lowered");
2715 
2716   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2717   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2718   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2719   // V_FRACT bug is:
2720   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2721   //
2722   // Convert floor(x) to (x - fract(x))
2723 
2724   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2725     .addUse(OrigSrc)
2726     .setMIFlags(Flags);
2727 
2728   // Give source modifier matching some assistance before obscuring a foldable
2729   // pattern.
2730 
2731   // TODO: We can avoid the neg on the fract? The input sign to fract
2732   // shouldn't matter?
2733   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2734 
2735   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2736 
2737   Register Min = MRI.createGenericVirtualRegister(S64);
2738 
2739   // We don't need to concern ourselves with the snan handling difference, so
2740   // use the one which will directly select.
2741   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2742   if (MFI->getMode().IEEE)
2743     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2744   else
2745     B.buildFMinNum(Min, Fract, Const, Flags);
2746 
2747   Register CorrectedFract = Min;
2748   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2749     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2750     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2751   }
2752 
2753   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2754   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2755 
2756   MI.eraseFromParent();
2757   return true;
2758 }
2759 
2760 // Turn an illegal packed v2s16 build vector into bit operations.
2761 // TODO: This should probably be a bitcast action in LegalizerHelper.
2762 bool AMDGPULegalizerInfo::legalizeBuildVector(
2763   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2764   Register Dst = MI.getOperand(0).getReg();
2765   const LLT S32 = LLT::scalar(32);
2766   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2767 
2768   Register Src0 = MI.getOperand(1).getReg();
2769   Register Src1 = MI.getOperand(2).getReg();
2770   assert(MRI.getType(Src0) == LLT::scalar(16));
2771 
2772   auto Merge = B.buildMerge(S32, {Src0, Src1});
2773   B.buildBitcast(Dst, Merge);
2774 
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778 
2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2781 // case with a single min instruction instead of a compare+select.
2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2783                                             MachineRegisterInfo &MRI,
2784                                             MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   Register Src = MI.getOperand(1).getReg();
2787   LLT DstTy = MRI.getType(Dst);
2788   LLT SrcTy = MRI.getType(Src);
2789 
2790   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2791                         ? AMDGPU::G_AMDGPU_FFBH_U32
2792                         : AMDGPU::G_AMDGPU_FFBL_B32;
2793   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
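  // ffbh/ffbl return -1 (0xffffffff) for a zero input, so a single unsigned
  // min against the bit width yields the expected ctlz/cttz result for zero.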
2794   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2795 
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799 
2800 // Check that this is a G_XOR x, -1
2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2802   if (MI.getOpcode() != TargetOpcode::G_XOR)
2803     return false;
2804   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2805   return ConstVal && *ConstVal == -1;
2806 }
2807 
2808 // Return the use branch instruction, otherwise null if the usage is invalid.
2809 static MachineInstr *
2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2811                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2812   Register CondDef = MI.getOperand(0).getReg();
2813   if (!MRI.hasOneNonDBGUse(CondDef))
2814     return nullptr;
2815 
2816   MachineBasicBlock *Parent = MI.getParent();
2817   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2818 
2819   if (isNot(MRI, *UseMI)) {
2820     Register NegatedCond = UseMI->getOperand(0).getReg();
2821     if (!MRI.hasOneNonDBGUse(NegatedCond))
2822       return nullptr;
2823 
2824     // We're deleting the def of this value, so we need to remove it.
2825     UseMI->eraseFromParent();
2826 
2827     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2828     Negated = true;
2829   }
2830 
2831   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2832     return nullptr;
2833 
2834   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2835   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2836   if (Next == Parent->end()) {
2837     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2838     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2839       return nullptr;
2840     UncondBrTarget = &*NextMBB;
2841   } else {
2842     if (Next->getOpcode() != AMDGPU::G_BR)
2843       return nullptr;
2844     Br = &*Next;
2845     UncondBrTarget = Br->getOperand(0).getMBB();
2846   }
2847 
2848   return UseMI;
2849 }
2850 
2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2852                                          const ArgDescriptor *Arg,
2853                                          const TargetRegisterClass *ArgRC,
2854                                          LLT ArgTy) const {
2855   MCRegister SrcReg = Arg->getRegister();
2856   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2857   assert(DstReg.isVirtual() && "Virtual register expected");
2858 
  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, ArgTy);
2861   if (Arg->isMasked()) {
2862     // TODO: Should we try to emit this once in the entry block?
2863     const LLT S32 = LLT::scalar(32);
2864     const unsigned Mask = Arg->getMask();
2865     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2866 
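    // Extract the masked bit-field from the live-in register: shift right by
    // the number of trailing zeros in the mask, then AND with the shifted mask.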
2867     Register AndMaskSrc = LiveIn;
2868 
2869     if (Shift != 0) {
2870       auto ShiftAmt = B.buildConstant(S32, Shift);
2871       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2872     }
2873 
2874     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2875   } else {
2876     B.buildCopy(DstReg, LiveIn);
2877   }
2878 
2879   return true;
2880 }
2881 
2882 bool AMDGPULegalizerInfo::loadInputValue(
2883     Register DstReg, MachineIRBuilder &B,
2884     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2885   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2886   const ArgDescriptor *Arg;
2887   const TargetRegisterClass *ArgRC;
2888   LLT ArgTy;
2889   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2890 
2891   if (!Arg) {
2892     assert(ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2893     // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2894     // case the pointer argument may be missing and we use null.
2895     B.buildConstant(DstReg, 0);
2896     return true;
2897   }
2898 
2899   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2900     return false; // TODO: Handle these
2901   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2902 }
2903 
2904 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2905     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2906     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2907   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2908     return false;
2909 
2910   MI.eraseFromParent();
2911   return true;
2912 }
2913 
2914 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2915                                        MachineRegisterInfo &MRI,
2916                                        MachineIRBuilder &B) const {
2917   Register Dst = MI.getOperand(0).getReg();
2918   LLT DstTy = MRI.getType(Dst);
2919   LLT S16 = LLT::scalar(16);
2920   LLT S32 = LLT::scalar(32);
2921   LLT S64 = LLT::scalar(64);
2922 
2923   if (DstTy == S16)
2924     return legalizeFDIV16(MI, MRI, B);
2925   if (DstTy == S32)
2926     return legalizeFDIV32(MI, MRI, B);
2927   if (DstTy == S64)
2928     return legalizeFDIV64(MI, MRI, B);
2929 
2930   return false;
2931 }
2932 
2933 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2934                                                         Register DstDivReg,
2935                                                         Register DstRemReg,
2936                                                         Register X,
2937                                                         Register Y) const {
2938   const LLT S1 = LLT::scalar(1);
2939   const LLT S32 = LLT::scalar(32);
2940 
2941   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2942   // algorithm used here.
2943 
2944   // Initial estimate of inv(y).
2945   auto FloatY = B.buildUITOFP(S32, Y);
2946   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2947   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2948   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2949   auto Z = B.buildFPTOUI(S32, ScaledY);
2950 
  // One round of UNR (unsigned Newton-Raphson) refinement of the estimate.
2952   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2953   auto NegYZ = B.buildMul(S32, NegY, Z);
2954   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2955 
2956   // Quotient/remainder estimate.
2957   auto Q = B.buildUMulH(S32, X, Z);
2958   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2959 
2960   // First quotient/remainder refinement.
2961   auto One = B.buildConstant(S32, 1);
2962   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2963   if (DstDivReg)
2964     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2965   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2966 
2967   // Second quotient/remainder refinement.
2968   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2969   if (DstDivReg)
2970     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2971 
2972   if (DstRemReg)
2973     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2974 }
2975 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2977 //
2978 // Return lo, hi of result
2979 //
2980 // %cvt.lo = G_UITOFP Val.lo
2981 // %cvt.hi = G_UITOFP Val.hi
2982 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2983 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2984 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2985 // %mul2 = G_FMUL %mul1, 2**(-32)
2986 // %trunc = G_INTRINSIC_TRUNC %mul2
2987 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2988 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2989 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2990                                                        Register Val) {
2991   const LLT S32 = LLT::scalar(32);
2992   auto Unmerge = B.buildUnmerge(S32, Val);
2993 
2994   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2995   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2996 
2997   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2998                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2999 
3000   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3001   auto Mul1 =
3002       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3003 
3004   // 2**(-32)
3005   auto Mul2 =
3006       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3007   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3008 
3009   // -(2**32)
3010   auto Mad2 = B.buildFMAD(S32, Trunc,
3011                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3012 
3013   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3014   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3015 
3016   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3017 }
3018 
3019 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3020                                                         Register DstDivReg,
3021                                                         Register DstRemReg,
3022                                                         Register Numer,
3023                                                         Register Denom) const {
3024   const LLT S32 = LLT::scalar(32);
3025   const LLT S64 = LLT::scalar(64);
3026   const LLT S1 = LLT::scalar(1);
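
  // This mirrors the 32-bit expansion above: build a reciprocal estimate of the
  // denominator with emitReciprocalU64, refine it, and correct the quotient and
  // remainder with up to two conditional adjustments. 64-bit adds/subs are
  // split into 32-bit halves with explicit carry propagation.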
3027   Register RcpLo, RcpHi;
3028 
3029   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3030 
3031   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3032 
3033   auto Zero64 = B.buildConstant(S64, 0);
3034   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3035 
3036   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3037   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3038 
3039   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3040   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3041   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3042 
3043   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3044   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3045   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3046   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3047 
3048   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3049   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3050   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3051   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3052   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3053 
3054   auto Zero32 = B.buildConstant(S32, 0);
3055   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3056   auto Add2_HiC =
3057       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3058   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3059   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3060 
3061   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3062   Register NumerLo = UnmergeNumer.getReg(0);
3063   Register NumerHi = UnmergeNumer.getReg(1);
3064 
3065   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3066   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3067   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3068   Register Mul3_Lo = UnmergeMul3.getReg(0);
3069   Register Mul3_Hi = UnmergeMul3.getReg(1);
3070   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3071   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3072   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3073   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3074 
3075   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3076   Register DenomLo = UnmergeDenom.getReg(0);
3077   Register DenomHi = UnmergeDenom.getReg(1);
3078 
3079   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3080   auto C1 = B.buildSExt(S32, CmpHi);
3081 
3082   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3083   auto C2 = B.buildSExt(S32, CmpLo);
3084 
3085   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3086   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3087 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we use four
  // selects after the would-be endif points in place of PHIs.
3091 
3092   // if C3 != 0 ...
3093   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3094   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3095   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3096   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3097 
3098   auto One64 = B.buildConstant(S64, 1);
3099   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3100 
3101   auto C4 =
3102       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3103   auto C5 =
3104       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3105   auto C6 = B.buildSelect(
3106       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3107 
3108   // if (C6 != 0)
3109   auto Add4 = B.buildAdd(S64, Add3, One64);
3110   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3111 
3112   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3113   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3114   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3115 
3116   // endif C6
3117   // endif C3
3118 
3119   if (DstDivReg) {
3120     auto Sel1 = B.buildSelect(
3121         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3122     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3123                   Sel1, MulHi3);
3124   }
3125 
3126   if (DstRemReg) {
3127     auto Sel2 = B.buildSelect(
3128         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3129     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3130                   Sel2, Sub1);
3131   }
3132 }
3133 
3134 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3135                                                   MachineRegisterInfo &MRI,
3136                                                   MachineIRBuilder &B) const {
3137   Register DstDivReg, DstRemReg;
3138   switch (MI.getOpcode()) {
3139   default:
3140     llvm_unreachable("Unexpected opcode!");
3141   case AMDGPU::G_UDIV: {
3142     DstDivReg = MI.getOperand(0).getReg();
3143     break;
3144   }
3145   case AMDGPU::G_UREM: {
3146     DstRemReg = MI.getOperand(0).getReg();
3147     break;
3148   }
3149   case AMDGPU::G_UDIVREM: {
3150     DstDivReg = MI.getOperand(0).getReg();
3151     DstRemReg = MI.getOperand(1).getReg();
3152     break;
3153   }
3154   }
3155 
3156   const LLT S64 = LLT::scalar(64);
3157   const LLT S32 = LLT::scalar(32);
3158   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3159   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3160   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3161   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3162 
3163   if (Ty == S32)
3164     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3165   else if (Ty == S64)
3166     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3167   else
3168     return false;
3169 
3170   MI.eraseFromParent();
3171   return true;
3172 }
3173 
3174 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3175                                                 MachineRegisterInfo &MRI,
3176                                                 MachineIRBuilder &B) const {
3177   const LLT S64 = LLT::scalar(64);
3178   const LLT S32 = LLT::scalar(32);
3179 
3180   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3181   if (Ty != S32 && Ty != S64)
3182     return false;
3183 
3184   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3185   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3186   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3187 
3188   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3189   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3190   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3191 
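  // Take the absolute value of each operand: abs(x) == (x + sign) ^ sign, where
  // sign is x arithmetically shifted right to all sign bits.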
3192   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3193   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3194 
3195   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3196   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3197 
3198   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3199   switch (MI.getOpcode()) {
3200   default:
3201     llvm_unreachable("Unexpected opcode!");
3202   case AMDGPU::G_SDIV: {
3203     DstDivReg = MI.getOperand(0).getReg();
3204     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3205     break;
3206   }
3207   case AMDGPU::G_SREM: {
3208     DstRemReg = MI.getOperand(0).getReg();
3209     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3210     break;
3211   }
3212   case AMDGPU::G_SDIVREM: {
3213     DstDivReg = MI.getOperand(0).getReg();
3214     DstRemReg = MI.getOperand(1).getReg();
3215     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3216     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3217     break;
3218   }
3219   }
3220 
3221   if (Ty == S32)
3222     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3223   else
3224     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3225 
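  // Reapply the signs: the quotient sign is LHSign ^ RHSign, and the remainder
  // sign matches the LHS; (x ^ sign) - sign negates x when the sign mask is set.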
3226   if (DstDivReg) {
3227     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3228     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3229     B.buildSub(DstDivReg, SignXor, Sign);
3230   }
3231 
3232   if (DstRemReg) {
3233     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3234     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3235     B.buildSub(DstRemReg, SignXor, Sign);
3236   }
3237 
3238   MI.eraseFromParent();
3239   return true;
3240 }
3241 
3242 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3243                                                  MachineRegisterInfo &MRI,
3244                                                  MachineIRBuilder &B) const {
3245   Register Res = MI.getOperand(0).getReg();
3246   Register LHS = MI.getOperand(1).getReg();
3247   Register RHS = MI.getOperand(2).getReg();
3248   uint16_t Flags = MI.getFlags();
3249   LLT ResTy = MRI.getType(Res);
3250 
3251   const MachineFunction &MF = B.getMF();
3252   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3253                             MI.getFlag(MachineInstr::FmAfn);
3254 
3255   if (!AllowInaccurateRcp)
3256     return false;
3257 
3258   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3259     // 1 / x -> RCP(x)
3260     if (CLHS->isExactlyValue(1.0)) {
3261       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3262         .addUse(RHS)
3263         .setMIFlags(Flags);
3264 
3265       MI.eraseFromParent();
3266       return true;
3267     }
3268 
3269     // -1 / x -> RCP( FNEG(x) )
3270     if (CLHS->isExactlyValue(-1.0)) {
3271       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3272       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3273         .addUse(FNeg.getReg(0))
3274         .setMIFlags(Flags);
3275 
3276       MI.eraseFromParent();
3277       return true;
3278     }
3279   }
3280 
3281   // x / y -> x * (1.0 / y)
3282   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3283     .addUse(RHS)
3284     .setMIFlags(Flags);
3285   B.buildFMul(Res, LHS, RCP, Flags);
3286 
3287   MI.eraseFromParent();
3288   return true;
3289 }
3290 
3291 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3292                                                    MachineRegisterInfo &MRI,
3293                                                    MachineIRBuilder &B) const {
3294   Register Res = MI.getOperand(0).getReg();
3295   Register X = MI.getOperand(1).getReg();
3296   Register Y = MI.getOperand(2).getReg();
3297   uint16_t Flags = MI.getFlags();
3298   LLT ResTy = MRI.getType(Res);
3299 
3300   const MachineFunction &MF = B.getMF();
3301   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3302                             MI.getFlag(MachineInstr::FmAfn);
3303 
3304   if (!AllowInaccurateRcp)
3305     return false;
3306 
3307   auto NegY = B.buildFNeg(ResTy, Y);
3308   auto One = B.buildFConstant(ResTy, 1.0);
3309 
3310   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3311     .addUse(Y)
3312     .setMIFlags(Flags);
3313 
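  // Refine the reciprocal estimate with two Newton-Raphson steps, each computed
  // as r' = r + r * (1 - y * r) using FMAs.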
3314   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3315   R = B.buildFMA(ResTy, Tmp0, R, R);
3316 
3317   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3318   R = B.buildFMA(ResTy, Tmp1, R, R);
3319 
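  // Form the quotient q = x * r, then apply one final residual correction:
  // result = q + r * (x - y * q).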
3320   auto Ret = B.buildFMul(ResTy, X, R);
3321   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3322 
3323   B.buildFMA(Res, Tmp2, R, Ret);
3324   MI.eraseFromParent();
3325   return true;
3326 }
3327 
3328 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3329                                          MachineRegisterInfo &MRI,
3330                                          MachineIRBuilder &B) const {
3331   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3332     return true;
3333 
3334   Register Res = MI.getOperand(0).getReg();
3335   Register LHS = MI.getOperand(1).getReg();
3336   Register RHS = MI.getOperand(2).getReg();
3337 
3338   uint16_t Flags = MI.getFlags();
3339 
3340   LLT S16 = LLT::scalar(16);
3341   LLT S32 = LLT::scalar(32);
3342 
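  // Legalize the f16 division by extending to f32, multiplying by the f32
  // reciprocal, truncating back to f16, and correcting the result with
  // div_fixup.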
3343   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3344   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3345 
3346   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3347     .addUse(RHSExt.getReg(0))
3348     .setMIFlags(Flags);
3349 
3350   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3351   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3352 
3353   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3354     .addUse(RDst.getReg(0))
3355     .addUse(RHS)
3356     .addUse(LHS)
3357     .setMIFlags(Flags);
3358 
3359   MI.eraseFromParent();
3360   return true;
3361 }
3362 
3363 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3364 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3365 static void toggleSPDenormMode(bool Enable,
3366                                MachineIRBuilder &B,
3367                                const GCNSubtarget &ST,
3368                                AMDGPU::SIModeRegisterDefaults Mode) {
3369   // Set SP denorm mode to this value.
3370   unsigned SPDenormMode =
3371     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3372 
3373   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3375     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3376 
3377     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3378     B.buildInstr(AMDGPU::S_DENORM_MODE)
3379       .addImm(NewDenormModeValue);
3380 
3381   } else {
3382     // Select FP32 bit field in mode register.
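    // The field is encoded as hwreg(HW_REG_MODE, 4, 2): offset 4, width 2,
    // covering the two FP32 denorm bits.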
3383     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3384                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3385                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3386 
3387     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3388       .addImm(SPDenormMode)
3389       .addImm(SPDenormModeBitField);
3390   }
3391 }
3392 
3393 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3394                                          MachineRegisterInfo &MRI,
3395                                          MachineIRBuilder &B) const {
3396   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3397     return true;
3398 
3399   Register Res = MI.getOperand(0).getReg();
3400   Register LHS = MI.getOperand(1).getReg();
3401   Register RHS = MI.getOperand(2).getReg();
3402   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3403   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3404 
3405   uint16_t Flags = MI.getFlags();
3406 
3407   LLT S32 = LLT::scalar(32);
3408   LLT S1 = LLT::scalar(1);
3409 
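  // Scale the operands with div_scale, refine an rcp-based reciprocal estimate
  // with a chain of FMAs (temporarily enabling FP32 denormals if they are off),
  // combine the pieces with div_fmas, and undo the scaling with div_fixup.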
3410   auto One = B.buildFConstant(S32, 1.0f);
3411 
3412   auto DenominatorScaled =
3413     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3414       .addUse(LHS)
3415       .addUse(RHS)
3416       .addImm(0)
3417       .setMIFlags(Flags);
3418   auto NumeratorScaled =
3419     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3420       .addUse(LHS)
3421       .addUse(RHS)
3422       .addImm(1)
3423       .setMIFlags(Flags);
3424 
3425   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3426     .addUse(DenominatorScaled.getReg(0))
3427     .setMIFlags(Flags);
3428   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3429 
3430   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3431   // aren't modeled as reading it.
3432   if (!Mode.allFP32Denormals())
3433     toggleSPDenormMode(true, B, ST, Mode);
3434 
3435   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3436   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3437   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3438   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3439   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3440   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3441 
3442   if (!Mode.allFP32Denormals())
3443     toggleSPDenormMode(false, B, ST, Mode);
3444 
3445   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3446     .addUse(Fma4.getReg(0))
3447     .addUse(Fma1.getReg(0))
3448     .addUse(Fma3.getReg(0))
3449     .addUse(NumeratorScaled.getReg(1))
3450     .setMIFlags(Flags);
3451 
3452   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3453     .addUse(Fmas.getReg(0))
3454     .addUse(RHS)
3455     .addUse(LHS)
3456     .setMIFlags(Flags);
3457 
3458   MI.eraseFromParent();
3459   return true;
3460 }
3461 
3462 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3463                                          MachineRegisterInfo &MRI,
3464                                          MachineIRBuilder &B) const {
3465   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3466     return true;
3467 
3468   Register Res = MI.getOperand(0).getReg();
3469   Register LHS = MI.getOperand(1).getReg();
3470   Register RHS = MI.getOperand(2).getReg();
3471 
3472   uint16_t Flags = MI.getFlags();
3473 
3474   LLT S64 = LLT::scalar(64);
3475   LLT S1 = LLT::scalar(1);
3476 
3477   auto One = B.buildFConstant(S64, 1.0);
3478 
3479   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3480     .addUse(LHS)
3481     .addUse(RHS)
3482     .addImm(0)
3483     .setMIFlags(Flags);
3484 
3485   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3486 
3487   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3488     .addUse(DivScale0.getReg(0))
3489     .setMIFlags(Flags);
3490 
3491   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3492   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3493   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3494 
3495   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3496     .addUse(LHS)
3497     .addUse(RHS)
3498     .addImm(1)
3499     .setMIFlags(Flags);
3500 
3501   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3502   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3503   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3504 
3505   Register Scale;
3506   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3509 
3510     LLT S32 = LLT::scalar(32);
3511 
3512     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3513     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3514     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3515     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3516 
3517     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3518                               Scale1Unmerge.getReg(1));
3519     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3520                               Scale0Unmerge.getReg(1));
3521     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3522   } else {
3523     Scale = DivScale1.getReg(1);
3524   }
3525 
3526   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3527     .addUse(Fma4.getReg(0))
3528     .addUse(Fma3.getReg(0))
3529     .addUse(Mul.getReg(0))
3530     .addUse(Scale)
3531     .setMIFlags(Flags);
3532 
3533   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3534     .addUse(Fmas.getReg(0))
3535     .addUse(RHS)
3536     .addUse(LHS)
3537     .setMIFlags(Flags);
3538 
3539   MI.eraseFromParent();
3540   return true;
3541 }
3542 
3543 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3544                                                  MachineRegisterInfo &MRI,
3545                                                  MachineIRBuilder &B) const {
3546   Register Res = MI.getOperand(0).getReg();
3547   Register LHS = MI.getOperand(2).getReg();
3548   Register RHS = MI.getOperand(3).getReg();
3549   uint16_t Flags = MI.getFlags();
3550 
3551   LLT S32 = LLT::scalar(32);
3552   LLT S1 = LLT::scalar(1);
3553 
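  // If |RHS| is very large (greater than 0x1.0p+96), scale it down by 2^-32
  // before taking the reciprocal, and multiply the final product by the same
  // factor to compensate.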
3554   auto Abs = B.buildFAbs(S32, RHS, Flags);
3555   const APFloat C0Val(1.0f);
3556 
3557   auto C0 = B.buildConstant(S32, 0x6f800000);
3558   auto C1 = B.buildConstant(S32, 0x2f800000);
3559   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3560 
3561   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3562   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3563 
3564   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3565 
3566   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3567     .addUse(Mul0.getReg(0))
3568     .setMIFlags(Flags);
3569 
3570   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3571 
3572   B.buildFMul(Res, Sel, Mul1, Flags);
3573 
3574   MI.eraseFromParent();
3575   return true;
3576 }
3577 
3578 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3579 // FIXME: Why do we handle this one but not other removed instructions?
3580 //
3581 // Reciprocal square root.  The clamp prevents infinite results, clamping
3582 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3583 // +-max_float.
3584 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3585                                                     MachineRegisterInfo &MRI,
3586                                                     MachineIRBuilder &B) const {
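  // Targets before Volcanic Islands still have the rsq.clamp instruction, so
  // there is nothing to expand.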
3587   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3588     return true;
3589 
3590   Register Dst = MI.getOperand(0).getReg();
3591   Register Src = MI.getOperand(2).getReg();
3592   auto Flags = MI.getFlags();
3593 
3594   LLT Ty = MRI.getType(Dst);
3595 
3596   const fltSemantics *FltSemantics;
3597   if (Ty == LLT::scalar(32))
3598     FltSemantics = &APFloat::IEEEsingle();
3599   else if (Ty == LLT::scalar(64))
3600     FltSemantics = &APFloat::IEEEdouble();
3601   else
3602     return false;
3603 
3604   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3605     .addUse(Src)
3606     .setMIFlags(Flags);
3607 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted the input (or not); use the variant that will
  // select directly.
3610   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3611   const bool UseIEEE = MFI->getMode().IEEE;
3612 
3613   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3614   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3615                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3616 
3617   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3618 
3619   if (UseIEEE)
3620     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3621   else
3622     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3623   MI.eraseFromParent();
3624   return true;
3625 }
3626 
3627 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3628   switch (IID) {
3629   case Intrinsic::amdgcn_ds_fadd:
3630     return AMDGPU::G_ATOMICRMW_FADD;
3631   case Intrinsic::amdgcn_ds_fmin:
3632     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3633   case Intrinsic::amdgcn_ds_fmax:
3634     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3635   default:
3636     llvm_unreachable("not a DS FP intrinsic");
3637   }
3638 }
3639 
3640 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3641                                                       MachineInstr &MI,
3642                                                       Intrinsic::ID IID) const {
3643   GISelChangeObserver &Observer = Helper.Observer;
3644   Observer.changingInstr(MI);
3645 
3646   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3647 
3648   // The remaining operands were used to set fields in the MemOperand on
3649   // construction.
3650   for (int I = 6; I > 3; --I)
3651     MI.RemoveOperand(I);
3652 
3653   MI.RemoveOperand(1); // Remove the intrinsic ID.
3654   Observer.changedInstr(MI);
3655   return true;
3656 }
3657 
3658 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3659                                             MachineRegisterInfo &MRI,
3660                                             MachineIRBuilder &B) const {
3661   uint64_t Offset =
3662     ST.getTargetLowering()->getImplicitParameterOffset(
3663       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3664   LLT DstTy = MRI.getType(DstReg);
3665   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3666 
3667   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3668   if (!loadInputValue(KernargPtrReg, B,
3669                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3670     return false;
3671 
3672   // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
3674   return true;
3675 }
3676 
3677 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3678                                                  MachineRegisterInfo &MRI,
3679                                                  MachineIRBuilder &B) const {
3680   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3681   if (!MFI->isEntryFunction()) {
3682     return legalizePreloadedArgIntrin(MI, MRI, B,
3683                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3684   }
3685 
3686   Register DstReg = MI.getOperand(0).getReg();
3687   if (!getImplicitArgPtr(DstReg, MRI, B))
3688     return false;
3689 
3690   MI.eraseFromParent();
3691   return true;
3692 }
3693 
3694 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3695                                               MachineRegisterInfo &MRI,
3696                                               MachineIRBuilder &B,
3697                                               unsigned AddrSpace) const {
3698   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3699   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3700   Register Hi32 = Unmerge.getReg(1);
3701 
3702   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3703   MI.eraseFromParent();
3704   return true;
3705 }
3706 
3707 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3708 // offset (the offset that is included in bounds checking and swizzling, to be
3709 // split between the instruction's voffset and immoffset fields) and soffset
3710 // (the offset that is excluded from bounds checking and swizzling, to go in
3711 // the instruction's soffset field).  This function takes the first kind of
3712 // offset and figures out how to split it between voffset and immoffset.
3713 std::pair<Register, unsigned>
3714 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3715                                         Register OrigOffset) const {
3716   const unsigned MaxImm = 4095;
3717   Register BaseReg;
3718   unsigned ImmOffset;
3719   const LLT S32 = LLT::scalar(32);
3720   MachineRegisterInfo &MRI = *B.getMRI();
3721 
3722   std::tie(BaseReg, ImmOffset) =
3723       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3724 
3725   // If BaseReg is a pointer, convert it to int.
3726   if (MRI.getType(BaseReg).isPointer())
3727     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3728 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (the value ANDed with 4095) in the immoffset field, so that the
  // part that is copied/added into the voffset field is a multiple of 4096 and
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store. However, do not do that rounding down to a multiple of 4096 if
  // the rounded value is negative, as it appears to be illegal to have a
  // negative offset in the vgpr, even if adding the immediate offset makes it
  // positive.
3736   unsigned Overflow = ImmOffset & ~MaxImm;
3737   ImmOffset -= Overflow;
3738   if ((int32_t)Overflow < 0) {
3739     Overflow += ImmOffset;
3740     ImmOffset = 0;
3741   }
3742 
3743   if (Overflow != 0) {
3744     if (!BaseReg) {
3745       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3746     } else {
3747       auto OverflowVal = B.buildConstant(S32, Overflow);
3748       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3749     }
3750   }
3751 
3752   if (!BaseReg)
3753     BaseReg = B.buildConstant(S32, 0).getReg(0);
3754 
3755   return std::make_pair(BaseReg, ImmOffset);
3756 }
3757 
3758 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3759 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3760                                           Register VOffset, Register SOffset,
3761                                           unsigned ImmOffset, Register VIndex,
3762                                           MachineRegisterInfo &MRI) const {
3763   Optional<ValueAndVReg> MaybeVOffsetVal =
3764       getConstantVRegValWithLookThrough(VOffset, MRI);
3765   Optional<ValueAndVReg> MaybeSOffsetVal =
3766       getConstantVRegValWithLookThrough(SOffset, MRI);
3767   Optional<ValueAndVReg> MaybeVIndexVal =
3768       getConstantVRegValWithLookThrough(VIndex, MRI);
3769   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3770   // update the MMO with that offset. The stride is unknown so we can only do
3771   // this if VIndex is constant 0.
3772   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3773       MaybeVIndexVal->Value == 0) {
3774     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3775                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3776     MMO->setOffset(TotalOffset);
3777   } else {
3778     // We don't have a constant combined offset to use in the MMO. Give up.
3779     MMO->setValue((Value *)nullptr);
3780   }
3781 }
3782 
3783 /// Handle register layout difference for f16 images for some subtargets.
3784 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3785                                              MachineRegisterInfo &MRI,
3786                                              Register Reg,
3787                                              bool ImageStore) const {
3788   const LLT S16 = LLT::scalar(16);
3789   const LLT S32 = LLT::scalar(32);
3790   LLT StoreVT = MRI.getType(Reg);
3791   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3792 
3793   if (ST.hasUnpackedD16VMem()) {
3794     auto Unmerge = B.buildUnmerge(S16, Reg);
3795 
3796     SmallVector<Register, 4> WideRegs;
3797     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3798       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3799 
3800     int NumElts = StoreVT.getNumElements();
3801 
3802     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3803         .getReg(0);
3804   }
3805 
3806   if (ImageStore && ST.hasImageStoreD16Bug()) {
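    // Work around the image store D16 bug by widening the packed data with
    // undef elements into a larger dword-sized vector.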
3807     if (StoreVT.getNumElements() == 2) {
3808       SmallVector<Register, 4> PackedRegs;
3809       Reg = B.buildBitcast(S32, Reg).getReg(0);
3810       PackedRegs.push_back(Reg);
3811       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3812       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3813           .getReg(0);
3814     }
3815 
3816     if (StoreVT.getNumElements() == 3) {
3817       SmallVector<Register, 4> PackedRegs;
3818       auto Unmerge = B.buildUnmerge(S16, Reg);
3819       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3820         PackedRegs.push_back(Unmerge.getReg(I));
3821       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3822       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3823       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3824     }
3825 
3826     if (StoreVT.getNumElements() == 4) {
3827       SmallVector<Register, 4> PackedRegs;
3828       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3829       auto Unmerge = B.buildUnmerge(S32, Reg);
3830       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3831         PackedRegs.push_back(Unmerge.getReg(I));
3832       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3833       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3834           .getReg(0);
3835     }
3836 
3837     llvm_unreachable("invalid data type");
3838   }
3839 
3840   return Reg;
3841 }
3842 
3843 Register AMDGPULegalizerInfo::fixStoreSourceType(
3844   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3845   MachineRegisterInfo *MRI = B.getMRI();
3846   LLT Ty = MRI->getType(VData);
3847 
3848   const LLT S16 = LLT::scalar(16);
3849 
3850   // Fixup illegal register types for i8 stores.
3851   if (Ty == LLT::scalar(8) || Ty == S16) {
3852     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3853     return AnyExt;
3854   }
3855 
3856   if (Ty.isVector()) {
3857     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3858       if (IsFormat)
3859         return handleD16VData(B, *MRI, VData);
3860     }
3861   }
3862 
3863   return VData;
3864 }
3865 
3866 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3867                                               MachineRegisterInfo &MRI,
3868                                               MachineIRBuilder &B,
3869                                               bool IsTyped,
3870                                               bool IsFormat) const {
3871   Register VData = MI.getOperand(1).getReg();
3872   LLT Ty = MRI.getType(VData);
3873   LLT EltTy = Ty.getScalarType();
3874   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3875   const LLT S32 = LLT::scalar(32);
3876 
3877   VData = fixStoreSourceType(B, VData, IsFormat);
3878   Register RSrc = MI.getOperand(2).getReg();
3879 
3880   MachineMemOperand *MMO = *MI.memoperands_begin();
3881   const int MemSize = MMO->getSize();
3882 
3883   unsigned ImmOffset;
3884 
3885   // The typed intrinsics add an immediate after the registers.
3886   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3887 
3888   // The struct intrinsic variants add one additional operand over raw.
3889   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3890   Register VIndex;
3891   int OpOffset = 0;
3892   if (HasVIndex) {
3893     VIndex = MI.getOperand(3).getReg();
3894     OpOffset = 1;
3895   } else {
3896     VIndex = B.buildConstant(S32, 0).getReg(0);
3897   }
3898 
3899   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3900   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3901 
3902   unsigned Format = 0;
3903   if (IsTyped) {
3904     Format = MI.getOperand(5 + OpOffset).getImm();
3905     ++OpOffset;
3906   }
3907 
3908   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3909 
3910   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3911   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3912 
3913   unsigned Opc;
3914   if (IsTyped) {
3915     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3916                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3917   } else if (IsFormat) {
3918     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3919                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3920   } else {
3921     switch (MemSize) {
3922     case 1:
3923       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3924       break;
3925     case 2:
3926       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3927       break;
3928     default:
3929       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3930       break;
3931     }
3932   }
3933 
3934   auto MIB = B.buildInstr(Opc)
3935     .addUse(VData)              // vdata
3936     .addUse(RSrc)               // rsrc
3937     .addUse(VIndex)             // vindex
3938     .addUse(VOffset)            // voffset
3939     .addUse(SOffset)            // soffset
3940     .addImm(ImmOffset);         // offset(imm)
3941 
3942   if (IsTyped)
3943     MIB.addImm(Format);
3944 
3945   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3946      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3947      .addMemOperand(MMO);
3948 
3949   MI.eraseFromParent();
3950   return true;
3951 }
3952 
3953 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3954                                              MachineRegisterInfo &MRI,
3955                                              MachineIRBuilder &B,
3956                                              bool IsFormat,
3957                                              bool IsTyped) const {
3958   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3959   MachineMemOperand *MMO = *MI.memoperands_begin();
3960   const LLT MemTy = MMO->getMemoryType();
3961   const LLT S32 = LLT::scalar(32);
3962 
3963   Register Dst = MI.getOperand(0).getReg();
3964   Register RSrc = MI.getOperand(2).getReg();
3965 
3966   // The typed intrinsics add an immediate after the registers.
3967   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3968 
3969   // The struct intrinsic variants add one additional operand over raw.
3970   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3971   Register VIndex;
3972   int OpOffset = 0;
3973   if (HasVIndex) {
3974     VIndex = MI.getOperand(3).getReg();
3975     OpOffset = 1;
3976   } else {
3977     VIndex = B.buildConstant(S32, 0).getReg(0);
3978   }
3979 
3980   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3981   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3982 
3983   unsigned Format = 0;
3984   if (IsTyped) {
3985     Format = MI.getOperand(5 + OpOffset).getImm();
3986     ++OpOffset;
3987   }
3988 
3989   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3990   unsigned ImmOffset;
3991 
3992   LLT Ty = MRI.getType(Dst);
3993   LLT EltTy = Ty.getScalarType();
3994   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3995   const bool Unpacked = ST.hasUnpackedD16VMem();
3996 
3997   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3998   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3999 
4000   unsigned Opc;
4001 
4002   if (IsTyped) {
4003     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4004                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4005   } else if (IsFormat) {
4006     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4007                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4008   } else {
4009     switch (MemTy.getSizeInBits()) {
4010     case 8:
4011       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4012       break;
4013     case 16:
4014       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4015       break;
4016     default:
4017       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4018       break;
4019     }
4020   }
4021 
4022   Register LoadDstReg;
4023 
4024   bool IsExtLoad =
4025       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4026   LLT UnpackedTy = Ty.changeElementSize(32);
4027 
4028   if (IsExtLoad)
4029     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4030   else if (Unpacked && IsD16 && Ty.isVector())
4031     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4032   else
4033     LoadDstReg = Dst;
4034 
4035   auto MIB = B.buildInstr(Opc)
4036     .addDef(LoadDstReg)         // vdata
4037     .addUse(RSrc)               // rsrc
4038     .addUse(VIndex)             // vindex
4039     .addUse(VOffset)            // voffset
4040     .addUse(SOffset)            // soffset
4041     .addImm(ImmOffset);         // offset(imm)
4042 
4043   if (IsTyped)
4044     MIB.addImm(Format);
4045 
4046   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4047      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4048      .addMemOperand(MMO);
4049 
4050   if (LoadDstReg != Dst) {
4051     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4052 
    // The result register was widened for the extending load; truncate it back
    // to the requested type.
4054     if (IsExtLoad)
4055       B.buildTrunc(Dst, LoadDstReg);
4056     else {
4057       // Repack to original 16-bit vector result
4058       // FIXME: G_TRUNC should work, but legalization currently fails
4059       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4060       SmallVector<Register, 4> Repack;
4061       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4062         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4063       B.buildMerge(Dst, Repack);
4064     }
4065   }
4066 
4067   MI.eraseFromParent();
4068   return true;
4069 }
4070 
4071 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4072                                                MachineIRBuilder &B,
4073                                                bool IsInc) const {
4074   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4075                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4076   B.buildInstr(Opc)
4077     .addDef(MI.getOperand(0).getReg())
4078     .addUse(MI.getOperand(2).getReg())
4079     .addUse(MI.getOperand(3).getReg())
4080     .cloneMemRefs(MI);
4081   MI.eraseFromParent();
4082   return true;
4083 }
4084 
4085 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4086   switch (IntrID) {
4087   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4088   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4089     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4090   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4091   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4092     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4093   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4094   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4095     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4096   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4097   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4098     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4099   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4100   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4101     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4102   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4103   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4104     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4105   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4106   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4107     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4108   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4109   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4110     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4111   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4112   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4113     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4114   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4115   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4116     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4117   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4118   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4119     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4120   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4121   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4122     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4123   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4124   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4125     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4126   case Intrinsic::amdgcn_buffer_atomic_fadd:
4127   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4128   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4130   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4131   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4132     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4133   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4134   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4135     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4136   default:
4137     llvm_unreachable("unhandled atomic opcode");
4138   }
4139 }
4140 
4141 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4142                                                MachineIRBuilder &B,
4143                                                Intrinsic::ID IID) const {
4144   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4145                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4146   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4147 
4148   Register Dst;
4149 
4150   int OpOffset = 0;
4151   if (HasReturn) {
4152     // A few FP atomics do not support return values.
4153     Dst = MI.getOperand(0).getReg();
4154   } else {
4155     OpOffset = -1;
4156   }
4157 
4158   Register VData = MI.getOperand(2 + OpOffset).getReg();
4159   Register CmpVal;
4160 
4161   if (IsCmpSwap) {
4162     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4163     ++OpOffset;
4164   }
4165 
4166   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4167   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4168 
4169   // The struct intrinsic variants add one additional operand over raw.
4170   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4171   Register VIndex;
4172   if (HasVIndex) {
4173     VIndex = MI.getOperand(4 + OpOffset).getReg();
4174     ++OpOffset;
4175   } else {
4176     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4177   }
4178 
4179   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4180   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4181   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4182 
4183   MachineMemOperand *MMO = *MI.memoperands_begin();
4184 
4185   unsigned ImmOffset;
4186   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4187   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4188 
4189   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4190 
4191   if (HasReturn)
4192     MIB.addDef(Dst);
4193 
4194   MIB.addUse(VData); // vdata
4195 
4196   if (IsCmpSwap)
4197     MIB.addReg(CmpVal);
4198 
4199   MIB.addUse(RSrc)               // rsrc
4200      .addUse(VIndex)             // vindex
4201      .addUse(VOffset)            // voffset
4202      .addUse(SOffset)            // soffset
4203      .addImm(ImmOffset)          // offset(imm)
4204      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4205      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4206      .addMemOperand(MMO);
4207 
4208   MI.eraseFromParent();
4209   return true;
4210 }
4211 
/// Turn the set of s16 typed address operands of \p MI into dword sized
/// <2 x s16> vectors, appending the results to \p PackedAddrs.
4214 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4215                                       SmallVectorImpl<Register> &PackedAddrs,
4216                                       unsigned ArgOffset,
4217                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4218                                       bool IsA16, bool IsG16) {
4219   const LLT S16 = LLT::scalar(16);
4220   const LLT V2S16 = LLT::fixed_vector(2, 16);
4221   auto EndIdx = Intr->VAddrEnd;
4222 
4223   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4224     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4225     if (!SrcOp.isReg())
4226       continue; // _L to _LZ may have eliminated this.
4227 
4228     Register AddrReg = SrcOp.getReg();
4229 
4230     if ((I < Intr->GradientStart) ||
4231         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4232         (I >= Intr->CoordStart && !IsA16)) {
4233       // Handle any gradient or coordinate operands that should not be packed
4234       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4235       PackedAddrs.push_back(AddrReg);
4236     } else {
4237       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4238       // derivatives dx/dh and dx/dv are packed with undef.
4239       if (((I + 1) >= EndIdx) ||
4240           ((Intr->NumGradients / 2) % 2 == 1 &&
4241            (I == static_cast<unsigned>(Intr->GradientStart +
4242                                        (Intr->NumGradients / 2) - 1) ||
4243             I == static_cast<unsigned>(Intr->GradientStart +
4244                                        Intr->NumGradients - 1))) ||
4245           // Check for _L to _LZ optimization
4246           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4247         PackedAddrs.push_back(
4248             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4249                 .getReg(0));
4250       } else {
4251         PackedAddrs.push_back(
4252             B.buildBuildVector(
4253                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4254                 .getReg(0));
4255         ++I;
4256       }
4257     }
4258   }
4259 }
4260 
4261 /// Convert from separate vaddr components to a single vector address register,
4262 /// and replace the remaining operands with $noreg.
4263 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4264                                      int DimIdx, int NumVAddrs) {
4265   const LLT S32 = LLT::scalar(32);
4266 
4267   SmallVector<Register, 8> AddrRegs;
4268   for (int I = 0; I != NumVAddrs; ++I) {
4269     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4270     if (SrcOp.isReg()) {
4271       AddrRegs.push_back(SrcOp.getReg());
4272       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4273     }
4274   }
4275 
4276   int NumAddrRegs = AddrRegs.size();
4277   if (NumAddrRegs != 1) {
4278     // Above 8 elements round up to next power of 2 (i.e. 16).
4279     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4280       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4281       auto Undef = B.buildUndef(S32);
4282       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4283       NumAddrRegs = RoundedNumRegs;
4284     }
4285 
4286     auto VAddr =
4287         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4288     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4289   }
4290 
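  // The address components have been folded into the leading vaddr operand;
  // replace the trailing vaddr register operands with $noreg.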
4291   for (int I = 1; I != NumVAddrs; ++I) {
4292     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4293     if (SrcOp.isReg())
4294       SrcOp.setReg(AMDGPU::NoRegister);
4295   }
4296 }
4297 
4298 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4299 ///
4300 /// Depending on the subtarget, load/store with 16-bit element data need to be
4301 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4302 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4303 /// registers.
4304 ///
4305 /// We don't want to directly select image instructions just yet, but also want
4306 /// to expose all register repacking to the legalizer/combiners. We also don't
4307 /// want a selected instruction entering RegBankSelect. In order to avoid
4308 /// defining a multitude of intermediate image instructions, directly hack on
4309 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4310 /// padding the now unnecessary arguments with $noreg.
4311 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4312     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4313     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4314 
4315   const unsigned NumDefs = MI.getNumExplicitDefs();
4316   const unsigned ArgOffset = NumDefs + 1;
4317   bool IsTFE = NumDefs == 2;
4318   // We are only processing the operands of d16 image operations on subtargets
4319   // that use the unpacked register layout, or need to repack the TFE result.
4320 
4321   // TODO: Do we need to guard against already legalized intrinsics?
4322   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4323       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4324 
4325   MachineRegisterInfo *MRI = B.getMRI();
4326   const LLT S32 = LLT::scalar(32);
4327   const LLT S16 = LLT::scalar(16);
4328   const LLT V2S16 = LLT::fixed_vector(2, 16);
4329 
4330   unsigned DMask = 0;
4331 
4332   // Check for 16-bit gradients and addresses; if present, they are packed below.
4333   LLT GradTy =
4334       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4335   LLT AddrTy =
4336       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4337   const bool IsG16 = GradTy == S16;
4338   const bool IsA16 = AddrTy == S16;
4339 
4340   int DMaskLanes = 0;
4341   if (!BaseOpcode->Atomic) {
4342     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4343     if (BaseOpcode->Gather4) {
4344       DMaskLanes = 4;
4345     } else if (DMask != 0) {
4346       DMaskLanes = countPopulation(DMask);
4347     } else if (!IsTFE && !BaseOpcode->Store) {
4348       // If dmask is 0, this is a no-op load. This can be eliminated.
4349       B.buildUndef(MI.getOperand(0));
4350       MI.eraseFromParent();
4351       return true;
4352     }
4353   }
4354 
4355   Observer.changingInstr(MI);
4356   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4357 
4358   unsigned NewOpcode = NumDefs == 0 ?
4359     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4360 
4361   // Track that we legalized this
4362   MI.setDesc(B.getTII().get(NewOpcode));
4363 
4364   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4365   // dmask to be at least 1, otherwise the instruction will fail.
4366   if (IsTFE && DMask == 0) {
4367     DMask = 0x1;
4368     DMaskLanes = 1;
4369     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4370   }
4371 
4372   if (BaseOpcode->Atomic) {
4373     Register VData0 = MI.getOperand(2).getReg();
4374     LLT Ty = MRI->getType(VData0);
4375 
4376     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4377     if (Ty.isVector())
4378       return false;
4379 
4380     if (BaseOpcode->AtomicX2) {
4381       Register VData1 = MI.getOperand(3).getReg();
4382       // The two values are packed in one register.
4383       LLT PackedTy = LLT::fixed_vector(2, Ty);
4384       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4385       MI.getOperand(2).setReg(Concat.getReg(0));
4386       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4387     }
4388   }
4389 
4390   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4391 
4392   // Optimize _L to _LZ when the LOD is zero.
4393   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4394           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4395     const ConstantFP *ConstantLod;
4396 
4397     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4398                  m_GFCst(ConstantLod))) {
4399       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4400         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4401         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4402             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4403                                                       Intr->Dim);
4404 
4405         // The starting indexes should remain in the same place.
4406         --CorrectedNumVAddrs;
4407 
4408         MI.getOperand(MI.getNumExplicitDefs())
4409             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4410         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4411         Intr = NewImageDimIntr;
4412       }
4413     }
4414   }
4415 
4416   // Optimize _mip away, when 'lod' is zero
4417   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4418     int64_t ConstantLod;
4419     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4420                  m_ICst(ConstantLod))) {
4421       if (ConstantLod == 0) {
4422         // TODO: Change the intrinsic opcode and remove the operand instead
4423         // of replacing it with 0, as is done for the _L to _LZ handling above.
4424         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4425         --CorrectedNumVAddrs;
4426       }
4427     }
4428   }
4429 
4430   // Rewrite the addressing register layout before doing anything else.
4431   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4432     // 16-bit gradients are supported, but they are tied to the A16 control,
4433     // so both the gradients and the addresses must be 16 bit.
4434     return false;
4435   }
4436 
4437   if (IsA16 && !ST.hasA16()) {
4438     // A16 not supported
4439     return false;
4440   }
4441 
4442   if (IsA16 || IsG16) {
4443     if (Intr->NumVAddrs > 1) {
4444       SmallVector<Register, 4> PackedRegs;
4445 
4446       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4447                                 IsG16);
4448 
4449       // See also below in the non-a16 branch
4450       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4451                           PackedRegs.size() <= ST.getNSAMaxSize();
4452 
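      // Without the NSA encoding all address dwords must live in one
      // contiguous register, so concatenate the packed registers into a single
      // wide vector operand.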
4453       if (!UseNSA && PackedRegs.size() > 1) {
4454         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4455         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4456         PackedRegs[0] = Concat.getReg(0);
4457         PackedRegs.resize(1);
4458       }
4459 
4460       const unsigned NumPacked = PackedRegs.size();
4461       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4462         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4463         if (!SrcOp.isReg()) {
4464           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4465           continue;
4466         }
4467 
4468         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4469 
4470         if (I - Intr->VAddrStart < NumPacked)
4471           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4472         else
4473           SrcOp.setReg(AMDGPU::NoRegister);
4474       }
4475     }
4476   } else {
4477     // If the register allocator cannot place the address registers contiguously
4478     // without introducing moves, then using the non-sequential address encoding
4479     // is always preferable, since it saves VALU instructions and is usually a
4480     // wash in terms of code size or even better.
4481     //
4482     // However, we currently have no way of hinting to the register allocator
4483     // that MIMG addresses should be placed contiguously when it is possible to
4484     // do so, so force non-NSA for the common 2-address case as a heuristic.
4485     //
4486     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4487     // allocation when possible.
4488     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4489                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4490 
4491     if (!UseNSA && Intr->NumVAddrs > 1)
4492       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4493                                Intr->NumVAddrs);
4494   }
4495 
4496   int Flags = 0;
4497   if (IsA16)
4498     Flags |= 1;
4499   if (IsG16)
4500     Flags |= 2;
4501   MI.addOperand(MachineOperand::CreateImm(Flags));
4502 
4503   if (BaseOpcode->Store) { // No TFE for stores?
4504     // TODO: Handle dmask trim
4505     Register VData = MI.getOperand(1).getReg();
4506     LLT Ty = MRI->getType(VData);
4507     if (!Ty.isVector() || Ty.getElementType() != S16)
4508       return true;
4509 
4510     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4511     if (RepackedReg != VData) {
4512       MI.getOperand(1).setReg(RepackedReg);
4513     }
4514 
4515     return true;
4516   }
4517 
4518   Register DstReg = MI.getOperand(0).getReg();
4519   LLT Ty = MRI->getType(DstReg);
4520   const LLT EltTy = Ty.getScalarType();
4521   const bool IsD16 = Ty.getScalarType() == S16;
4522   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4523 
4524   // Confirm that the return type is large enough for the dmask specified
4525   if (NumElts < DMaskLanes)
4526     return false;
4527 
4528   if (NumElts > 4 || DMaskLanes > 4)
4529     return false;
4530 
4531   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4532   const LLT AdjustedTy =
4533       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
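  // For example, a <4 x s16> result with dmask 0x3 is adjusted to <2 x s16>.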
4534 
4535   // The raw dword aligned data component of the load. The only legal cases
4536   // where this matters should be when using the packed D16 format, for
4537   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4538   LLT RoundedTy;
4539 
4540   // S32 vector to cover all data, plus the TFE result element.
4541   LLT TFETy;
4542 
4543   // Register type to use for each loaded component. Will be S32 or V2S16.
4544   LLT RegTy;
4545 
4546   if (IsD16 && ST.hasUnpackedD16VMem()) {
4547     RoundedTy =
4548         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4549     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4550     RegTy = S32;
4551   } else {
4552     unsigned EltSize = EltTy.getSizeInBits();
4553     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4554     unsigned RoundedSize = 32 * RoundedElts;
4555     RoundedTy = LLT::scalarOrVector(
4556         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4557     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4558     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4559   }
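  // For example, a packed d16 <3 x s16> load rounds up to RoundedTy = <4 x s16>;
  // with TFE the loaded value is TFETy = <3 x s32> (two data dwords plus the
  // TFE dword) and each unmerged component uses RegTy = s32.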
4560 
4561   // The return type does not need adjustment.
4562   // TODO: Should we change s16 case to s32 or <2 x s16>?
4563   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4564     return true;
4565 
4566   Register Dst1Reg;
4567 
4568   // Insert after the instruction.
4569   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4570 
4571   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4572   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4573   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4574   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4575 
4576   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4577 
4578   MI.getOperand(0).setReg(NewResultReg);
4579 
4580   // In the IR, TFE is supposed to be used with a 2 element struct return
4581   // type. The instruction really returns these two values in one contiguous
4582   // register, with one additional dword beyond the loaded data. Rewrite the
4583   // return type to use a single register result.
4584 
4585   if (IsTFE) {
4586     Dst1Reg = MI.getOperand(1).getReg();
4587     if (MRI->getType(Dst1Reg) != S32)
4588       return false;
4589 
4590     // TODO: Make sure the TFE operand bit is set.
4591     MI.RemoveOperand(1);
4592 
4593     // Handle the easy case that requires no repack instructions.
4594     if (Ty == S32) {
4595       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4596       return true;
4597     }
4598   }
4599 
4600   // Now figure out how to copy the new result register back into the old
4601   // result.
4602   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4603 
4604   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4605 
4606   if (ResultNumRegs == 1) {
4607     assert(!IsTFE);
4608     ResultRegs[0] = NewResultReg;
4609   } else {
4610     // We have to repack into a new vector of some kind.
4611     for (int I = 0; I != NumDataRegs; ++I)
4612       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4613     B.buildUnmerge(ResultRegs, NewResultReg);
4614 
4615     // Drop the final TFE element to get the data part. The TFE result is
4616     // directly written to the right place already.
4617     if (IsTFE)
4618       ResultRegs.resize(NumDataRegs);
4619   }
4620 
4621   // For an s16 scalar result, we form an s32 result with a truncate regardless
4622   // of packed vs. unpacked.
4623   if (IsD16 && !Ty.isVector()) {
4624     B.buildTrunc(DstReg, ResultRegs[0]);
4625     return true;
4626   }
4627 
4628   // Avoid a build/concat_vector of 1 entry.
4629   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4630     B.buildBitcast(DstReg, ResultRegs[0]);
4631     return true;
4632   }
4633 
4634   assert(Ty.isVector());
4635 
4636   if (IsD16) {
4637     // For packed D16 results with TFE enabled, all the data components are
4638     // S32. Cast back to the expected type.
4639     //
4640     // TODO: We don't really need to load s32 elements. We would only need
4641     // one cast for the TFE result if a multiple of v2s16 were used.
4642     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4643       for (Register &Reg : ResultRegs)
4644         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4645     } else if (ST.hasUnpackedD16VMem()) {
4646       for (Register &Reg : ResultRegs)
4647         Reg = B.buildTrunc(S16, Reg).getReg(0);
4648     }
4649   }
4650 
4651   auto padWithUndef = [&](LLT Ty, int NumElts) {
4652     if (NumElts == 0)
4653       return;
4654     Register Undef = B.buildUndef(Ty).getReg(0);
4655     for (int I = 0; I != NumElts; ++I)
4656       ResultRegs.push_back(Undef);
4657   };
4658 
4659   // Pad out any elements eliminated due to the dmask.
4660   LLT ResTy = MRI->getType(ResultRegs[0]);
4661   if (!ResTy.isVector()) {
4662     padWithUndef(ResTy, NumElts - ResultRegs.size());
4663     B.buildBuildVector(DstReg, ResultRegs);
4664     return true;
4665   }
4666 
4667   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4668   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4669 
4670   // Deal with the one annoying legal case.
4671   const LLT V3S16 = LLT::fixed_vector(3, 16);
4672   if (Ty == V3S16) {
4673     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4674     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4675     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4676     return true;
4677   }
4678 
4679   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4680   B.buildConcatVectors(DstReg, ResultRegs);
4681   return true;
4682 }
4683 
4684 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4685   LegalizerHelper &Helper, MachineInstr &MI) const {
4686   MachineIRBuilder &B = Helper.MIRBuilder;
4687   GISelChangeObserver &Observer = Helper.Observer;
4688 
4689   Register Dst = MI.getOperand(0).getReg();
4690   LLT Ty = B.getMRI()->getType(Dst);
4691   unsigned Size = Ty.getSizeInBits();
4692   MachineFunction &MF = B.getMF();
4693 
4694   Observer.changingInstr(MI);
4695 
4696   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4697     Ty = getBitcastRegisterType(Ty);
4698     Helper.bitcastDst(MI, Ty, 0);
4699     Dst = MI.getOperand(0).getReg();
4700     B.setInsertPt(B.getMBB(), MI);
4701   }
4702 
4703   // FIXME: We don't really need this intermediate instruction. The intrinsic
4704   // should be fixed to have a memory operand. Since it's readnone, we're not
4705   // allowed to add one.
4706   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4707   MI.RemoveOperand(1); // Remove intrinsic ID
4708 
4709   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4710   // TODO: Should this use datalayout alignment?
4711   const unsigned MemSize = (Size + 7) / 8;
4712   const Align MemAlign(4);
4713   MachineMemOperand *MMO = MF.getMachineMemOperand(
4714       MachinePointerInfo(),
4715       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4716           MachineMemOperand::MOInvariant,
4717       MemSize, MemAlign);
4718   MI.addMemOperand(MF, MMO);
4719 
4720   // There are no 96-bit result scalar loads, but widening to 128-bit should
4721   // always be legal. We may need to restore this to a 96-bit result if it turns
4722   // out this needs to be converted to a vector load during RegBankSelect.
4723   if (!isPowerOf2_32(Size)) {
4724     if (Ty.isVector())
4725       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4726     else
4727       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4728   }
4729 
4730   Observer.changedInstr(MI);
4731   return true;
4732 }
4733 
4734 // TODO: Move to selection
4735 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4736                                                 MachineRegisterInfo &MRI,
4737                                                 MachineIRBuilder &B) const {
4738   if (!ST.isTrapHandlerEnabled() ||
4739       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4740     return legalizeTrapEndpgm(MI, MRI, B);
4741 
4742   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4743     switch (*HsaAbiVer) {
4744     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4745     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4746       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4747     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4748       return ST.supportsGetDoorbellID() ?
4749           legalizeTrapHsa(MI, MRI, B) :
4750           legalizeTrapHsaQueuePtr(MI, MRI, B);
4751     }
4752   }
4753 
4754   llvm_unreachable("Unknown trap handler");
4755 }
4756 
4757 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4758     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4759   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4760   MI.eraseFromParent();
4761   return true;
4762 }
4763 
4764 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4765     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4766   // Pass queue pointer to trap handler as input, and insert trap instruction
4767   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4768   Register LiveIn =
4769     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4770   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4771     return false;
4772 
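  // The trap handler expects the queue pointer in SGPR0/SGPR1, passed as an
  // implicit use of S_TRAP.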
4773   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4774   B.buildCopy(SGPR01, LiveIn);
4775   B.buildInstr(AMDGPU::S_TRAP)
4776       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4777       .addReg(SGPR01, RegState::Implicit);
4778 
4779   MI.eraseFromParent();
4780   return true;
4781 }
4782 
4783 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4784     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4785   B.buildInstr(AMDGPU::S_TRAP)
4786       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4787   MI.eraseFromParent();
4788   return true;
4789 }
4790 
4791 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4792     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4793   // If this is a non-HSA path or the trap handler is disabled, report a
4794   // warning accordingly.
4795   if (!ST.isTrapHandlerEnabled() ||
4796       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4797     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4798                                      "debugtrap handler not supported",
4799                                      MI.getDebugLoc(), DS_Warning);
4800     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4801     Ctx.diagnose(NoTrap);
4802   } else {
4803     // Insert debug-trap instruction
4804     B.buildInstr(AMDGPU::S_TRAP)
4805         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4806   }
4807 
4808   MI.eraseFromParent();
4809   return true;
4810 }
4811 
4812 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4813                                                MachineIRBuilder &B) const {
4814   MachineRegisterInfo &MRI = *B.getMRI();
4815   const LLT S16 = LLT::scalar(16);
4816   const LLT S32 = LLT::scalar(32);
4817 
4818   Register DstReg = MI.getOperand(0).getReg();
4819   Register NodePtr = MI.getOperand(2).getReg();
4820   Register RayExtent = MI.getOperand(3).getReg();
4821   Register RayOrigin = MI.getOperand(4).getReg();
4822   Register RayDir = MI.getOperand(5).getReg();
4823   Register RayInvDir = MI.getOperand(6).getReg();
4824   Register TDescr = MI.getOperand(7).getReg();
4825 
4826   if (!ST.hasGFX10_AEncoding()) {
4827     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4828                                         "intrinsic not supported on subtarget",
4829                                         MI.getDebugLoc());
4830     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4831     return false;
4832   }
4833 
4834   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4835   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4836   const unsigned NumVDataDwords = 4;
4837   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
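  // vaddr dword count: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir/ray_inv_dir (3 dwords when packed as a16, 6 otherwise).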
4838   const bool UseNSA =
4839       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4840   const unsigned BaseOpcodes[2][2] = {
4841       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4842       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4843        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4844   int Opcode;
4845   if (UseNSA) {
4846     Opcode =
4847         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4848                               NumVDataDwords, NumVAddrDwords);
4849   } else {
4850     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4851                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4852                                    PowerOf2Ceil(NumVAddrDwords));
4853   }
4854   assert(Opcode != -1);
4855 
4856   SmallVector<Register, 12> Ops;
4857   if (Is64) {
4858     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4859     Ops.push_back(Unmerge.getReg(0));
4860     Ops.push_back(Unmerge.getReg(1));
4861   } else {
4862     Ops.push_back(NodePtr);
4863   }
4864   Ops.push_back(RayExtent);
4865 
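  // Unmerge a 4-element s32 vector and push only its x, y and z lanes.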
4866   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4867     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4868     Ops.push_back(Unmerge.getReg(0));
4869     Ops.push_back(Unmerge.getReg(1));
4870     Ops.push_back(Unmerge.getReg(2));
4871   };
4872 
4873   packLanes(RayOrigin);
4874   if (IsA16) {
4875     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4876     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
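    // Repack the six s16 direction components into three dwords:
    // (dir.x, dir.y), (dir.z, inv.x), (inv.y, inv.z).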
4877     Register R1 = MRI.createGenericVirtualRegister(S32);
4878     Register R2 = MRI.createGenericVirtualRegister(S32);
4879     Register R3 = MRI.createGenericVirtualRegister(S32);
4880     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4881     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4882     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4883     Ops.push_back(R1);
4884     Ops.push_back(R2);
4885     Ops.push_back(R3);
4886   } else {
4887     packLanes(RayDir);
4888     packLanes(RayInvDir);
4889   }
4890 
4891   if (!UseNSA) {
4892     // Build a single vector containing all the operands prepared so far.
4893     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4894     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4895     Ops.clear();
4896     Ops.push_back(MergedOps);
4897   }
4898 
4899   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4900     .addDef(DstReg)
4901     .addImm(Opcode);
4902 
4903   for (Register R : Ops) {
4904     MIB.addUse(R);
4905   }
4906 
4907   MIB.addUse(TDescr)
4908      .addImm(IsA16 ? 1 : 0)
4909      .cloneMemRefs(MI);
4910 
4911   MI.eraseFromParent();
4912   return true;
4913 }
4914 
4915 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4916                                             MachineInstr &MI) const {
4917   MachineIRBuilder &B = Helper.MIRBuilder;
4918   MachineRegisterInfo &MRI = *B.getMRI();
4919 
4920   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4921   auto IntrID = MI.getIntrinsicID();
4922   switch (IntrID) {
4923   case Intrinsic::amdgcn_if:
4924   case Intrinsic::amdgcn_else: {
4925     MachineInstr *Br = nullptr;
4926     MachineBasicBlock *UncondBrTarget = nullptr;
4927     bool Negated = false;
4928     if (MachineInstr *BrCond =
4929             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4930       const SIRegisterInfo *TRI
4931         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4932 
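      // Operand 1 is the mask result of the intrinsic and operand 3 is its
      // boolean source; both are constrained to the wave mask register class
      // below.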
4933       Register Def = MI.getOperand(1).getReg();
4934       Register Use = MI.getOperand(3).getReg();
4935 
4936       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4937 
4938       if (Negated)
4939         std::swap(CondBrTarget, UncondBrTarget);
4940 
4941       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4942       if (IntrID == Intrinsic::amdgcn_if) {
4943         B.buildInstr(AMDGPU::SI_IF)
4944           .addDef(Def)
4945           .addUse(Use)
4946           .addMBB(UncondBrTarget);
4947       } else {
4948         B.buildInstr(AMDGPU::SI_ELSE)
4949             .addDef(Def)
4950             .addUse(Use)
4951             .addMBB(UncondBrTarget);
4952       }
4953 
4954       if (Br) {
4955         Br->getOperand(0).setMBB(CondBrTarget);
4956       } else {
4957         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4958         // since we're swapping branch targets it needs to be reinserted.
4959         // FIXME: IRTranslator should probably not do this
4960         B.buildBr(*CondBrTarget);
4961       }
4962 
4963       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4964       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4965       MI.eraseFromParent();
4966       BrCond->eraseFromParent();
4967       return true;
4968     }
4969 
4970     return false;
4971   }
4972   case Intrinsic::amdgcn_loop: {
4973     MachineInstr *Br = nullptr;
4974     MachineBasicBlock *UncondBrTarget = nullptr;
4975     bool Negated = false;
4976     if (MachineInstr *BrCond =
4977             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4978       const SIRegisterInfo *TRI
4979         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4980 
4981       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4982       Register Reg = MI.getOperand(2).getReg();
4983 
4984       if (Negated)
4985         std::swap(CondBrTarget, UncondBrTarget);
4986 
4987       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4988       B.buildInstr(AMDGPU::SI_LOOP)
4989         .addUse(Reg)
4990         .addMBB(UncondBrTarget);
4991 
4992       if (Br)
4993         Br->getOperand(0).setMBB(CondBrTarget);
4994       else
4995         B.buildBr(*CondBrTarget);
4996 
4997       MI.eraseFromParent();
4998       BrCond->eraseFromParent();
4999       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5000       return true;
5001     }
5002 
5003     return false;
5004   }
5005   case Intrinsic::amdgcn_kernarg_segment_ptr:
5006     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5007       // This only makes sense to call in a kernel, so just lower to null.
5008       B.buildConstant(MI.getOperand(0).getReg(), 0);
5009       MI.eraseFromParent();
5010       return true;
5011     }
5012 
5013     return legalizePreloadedArgIntrin(
5014       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5015   case Intrinsic::amdgcn_implicitarg_ptr:
5016     return legalizeImplicitArgPtr(MI, MRI, B);
5017   case Intrinsic::amdgcn_workitem_id_x:
5018     return legalizePreloadedArgIntrin(MI, MRI, B,
5019                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5020   case Intrinsic::amdgcn_workitem_id_y:
5021     return legalizePreloadedArgIntrin(MI, MRI, B,
5022                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5023   case Intrinsic::amdgcn_workitem_id_z:
5024     return legalizePreloadedArgIntrin(MI, MRI, B,
5025                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5026   case Intrinsic::amdgcn_workgroup_id_x:
5027     return legalizePreloadedArgIntrin(MI, MRI, B,
5028                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5029   case Intrinsic::amdgcn_workgroup_id_y:
5030     return legalizePreloadedArgIntrin(MI, MRI, B,
5031                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5032   case Intrinsic::amdgcn_workgroup_id_z:
5033     return legalizePreloadedArgIntrin(MI, MRI, B,
5034                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5035   case Intrinsic::amdgcn_dispatch_ptr:
5036     return legalizePreloadedArgIntrin(MI, MRI, B,
5037                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5038   case Intrinsic::amdgcn_queue_ptr:
5039     return legalizePreloadedArgIntrin(MI, MRI, B,
5040                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5041   case Intrinsic::amdgcn_implicit_buffer_ptr:
5042     return legalizePreloadedArgIntrin(
5043       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5044   case Intrinsic::amdgcn_dispatch_id:
5045     return legalizePreloadedArgIntrin(MI, MRI, B,
5046                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5047   case Intrinsic::amdgcn_fdiv_fast:
5048     return legalizeFDIVFastIntrin(MI, MRI, B);
5049   case Intrinsic::amdgcn_is_shared:
5050     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5051   case Intrinsic::amdgcn_is_private:
5052     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5053   case Intrinsic::amdgcn_wavefrontsize: {
5054     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5055     MI.eraseFromParent();
5056     return true;
5057   }
5058   case Intrinsic::amdgcn_s_buffer_load:
5059     return legalizeSBufferLoad(Helper, MI);
5060   case Intrinsic::amdgcn_raw_buffer_store:
5061   case Intrinsic::amdgcn_struct_buffer_store:
5062     return legalizeBufferStore(MI, MRI, B, false, false);
5063   case Intrinsic::amdgcn_raw_buffer_store_format:
5064   case Intrinsic::amdgcn_struct_buffer_store_format:
5065     return legalizeBufferStore(MI, MRI, B, false, true);
5066   case Intrinsic::amdgcn_raw_tbuffer_store:
5067   case Intrinsic::amdgcn_struct_tbuffer_store:
5068     return legalizeBufferStore(MI, MRI, B, true, true);
5069   case Intrinsic::amdgcn_raw_buffer_load:
5070   case Intrinsic::amdgcn_struct_buffer_load:
5071     return legalizeBufferLoad(MI, MRI, B, false, false);
5072   case Intrinsic::amdgcn_raw_buffer_load_format:
5073   case Intrinsic::amdgcn_struct_buffer_load_format:
5074     return legalizeBufferLoad(MI, MRI, B, true, false);
5075   case Intrinsic::amdgcn_raw_tbuffer_load:
5076   case Intrinsic::amdgcn_struct_tbuffer_load:
5077     return legalizeBufferLoad(MI, MRI, B, true, true);
5078   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5079   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5080   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5081   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5082   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5083   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5084   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5085   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5086   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5087   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5088   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5089   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5090   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5091   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5092   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5093   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5094   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5098   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5099   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5100   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5101   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5102   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5103   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5104   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5105   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5106   case Intrinsic::amdgcn_buffer_atomic_fadd:
5107   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5108   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5109   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5110   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5111     return legalizeBufferAtomic(MI, B, IntrID);
5112   case Intrinsic::amdgcn_atomic_inc:
5113     return legalizeAtomicIncDec(MI, B, true);
5114   case Intrinsic::amdgcn_atomic_dec:
5115     return legalizeAtomicIncDec(MI, B, false);
5116   case Intrinsic::trap:
5117     return legalizeTrapIntrinsic(MI, MRI, B);
5118   case Intrinsic::debugtrap:
5119     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5120   case Intrinsic::amdgcn_rsq_clamp:
5121     return legalizeRsqClampIntrinsic(MI, MRI, B);
5122   case Intrinsic::amdgcn_ds_fadd:
5123   case Intrinsic::amdgcn_ds_fmin:
5124   case Intrinsic::amdgcn_ds_fmax:
5125     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5126   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5127     return legalizeBVHIntrinsic(MI, B);
5128   default: {
5129     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5130             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5131       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5132     return true;
5133   }
5134   }
5135 
5136   return true;
5137 }
5138