1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
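// Matches types whose total size in bits is a multiple of 32.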
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
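// Matches vectors of 16-bit elements with more than two elements.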
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
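// Mutation that adds one more element to the vector type at TypeIdx.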
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
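// Mutation that reduces the element count of a wide vector so each resulting
// piece is at most 64 bits.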
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
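// Pick a register-friendly type to bitcast to: a single scalar for sizes up
// to 32 bits, otherwise s32 or a vector of s32 elements.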
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
  if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
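// Mutation wrapping getBitcastRegisterType for the type at TypeIdx.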
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
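// Mutation that bitcasts the type at TypeIdx, which must be a multiple of 32
// bits in size, to s32 or a vector of s32 elements.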
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
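// True if the total size is a multiple of 32 bits and no larger than
// MaxRegisterSize.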
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
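// True if this element type can appear in a register-sized vector: 16-bit or
// a multiple of 32 bits.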
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
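// True if this vector's element size and count map directly onto registers:
// 32, 64, 128 or 256-bit elements, or an even number of 16-bit elements.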
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
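// Match vectors whose element type is legal to operate on directly: s16 or
// any element of at least 32 bits.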
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
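// A load/store is legal if the value type is a register type, the size and
// alignment are supported for the address space, and no bitcast workaround is
// required by the current selector.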
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
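  // Build an LLT pointer type for the given address space, using the pointer
  // width the target reports for that address space.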
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElements(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElements(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalarOrElt(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElements(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElements(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
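  // Return true if this load/store must be split into multiple memory
  // operations, e.g. because it is wider than the address space allows, the
  // register count is not a supported power of two, or it is under-aligned.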
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1056 
1057     if (MemSize < DstTy.getSizeInBits())
1058       MemSize = std::max(MemSize, AlignBits);
1059 
1060     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1061       return true;
1062 
1063     const LLT PtrTy = Query.Types[1];
1064     unsigned AS = PtrTy.getAddressSpace();
1065     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1066       return true;
1067 
1068     // Catch weird sized loads that don't evenly divide into the access sizes
1069     // TODO: May be able to widen depending on alignment etc.
1070     unsigned NumRegs = (MemSize + 31) / 32;
1071     if (NumRegs == 3) {
1072       if (!ST.hasDwordx3LoadStores())
1073         return true;
1074     } else {
1075       // If the alignment allows, these should have been widened.
1076       if (!isPowerOf2_32(NumRegs))
1077         return true;
1078     }
1079 
1080     if (AlignBits < MemSize) {
1081       const SITargetLowering *TLI = ST.getTargetLowering();
1082       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1083                                                       Align(AlignBits / 8));
1084     }
1085 
1086     return false;
1087   };
1088 
1089   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1090   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1091   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1092 
1093   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1094   // LDS
1095   // TODO: Unsupported flat for SI.
1096 
1097   for (unsigned Op : {G_LOAD, G_STORE}) {
1098     const bool IsStore = Op == G_STORE;
1099 
1100     auto &Actions = getActionDefinitionsBuilder(Op);
1101     // Explicitly list some common cases.
1102     // TODO: Does this help compile time at all?
1103     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1104                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1105                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1106                                       {S64, GlobalPtr, S64, GlobalAlign32},
1107                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1108                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1109                                       {S32, GlobalPtr, S8, GlobalAlign8},
1110                                       {S32, GlobalPtr, S16, GlobalAlign16},
1111 
1112                                       {S32, LocalPtr, S32, 32},
1113                                       {S64, LocalPtr, S64, 32},
1114                                       {V2S32, LocalPtr, V2S32, 32},
1115                                       {S32, LocalPtr, S8, 8},
1116                                       {S32, LocalPtr, S16, 16},
1117                                       {V2S16, LocalPtr, S32, 32},
1118 
1119                                       {S32, PrivatePtr, S32, 32},
1120                                       {S32, PrivatePtr, S8, 8},
1121                                       {S32, PrivatePtr, S16, 16},
1122                                       {V2S16, PrivatePtr, S32, 32},
1123 
1124                                       {S32, ConstantPtr, S32, GlobalAlign32},
1125                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1126                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1127                                       {S64, ConstantPtr, S64, GlobalAlign32},
1128                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1129     Actions.legalIf(
1130       [=](const LegalityQuery &Query) -> bool {
1131         return isLoadStoreLegal(ST, Query);
1132       });
1133 
1134     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1135     // 64-bits.
1136     //
1137     // TODO: Should generalize bitcast action into coerce, which will also cover
1138     // inserting addrspacecasts.
1139     Actions.customIf(typeIs(1, Constant32Ptr));
1140 
1141     // Turn any illegal element vectors into something easier to deal
1142     // with. These will ultimately produce 32-bit scalar shifts to extract the
1143     // parts anyway.
1144     //
1145     // For odd 16-bit element vectors, prefer to split those into pieces with
1146     // 16-bit vector parts.
1147     Actions.bitcastIf(
1148       [=](const LegalityQuery &Query) -> bool {
1149         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1150                                           Query.MMODescrs[0].MemoryTy);
1151       }, bitcastToRegisterType(0));
1152 
1153     if (!IsStore) {
1154       // Widen suitably aligned loads by loading extra bytes. The standard
1155       // legalization actions can't properly express widening memory operands.
1156       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1157         return shouldWidenLoad(ST, Query, G_LOAD);
1158       });
1159     }
1160 
1161     // FIXME: load/store narrowing should be moved to lower action
1162     Actions
1163         .narrowScalarIf(
1164             [=](const LegalityQuery &Query) -> bool {
1165               return !Query.Types[0].isVector() &&
1166                      needToSplitMemOp(Query, Op == G_LOAD);
1167             },
1168             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1169               const LLT DstTy = Query.Types[0];
1170               const LLT PtrTy = Query.Types[1];
1171 
1172               const unsigned DstSize = DstTy.getSizeInBits();
1173               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1174 
1175               // Split extloads.
1176               if (DstSize > MemSize)
1177                 return std::make_pair(0, LLT::scalar(MemSize));
1178 
1179               if (!isPowerOf2_32(DstSize)) {
1180                 // We're probably decomposing an odd sized store. Try to split
1181                 // to the widest type. TODO: Account for alignment. As-is it
1182                 // should be OK, since the new parts will be further legalized.
1183                 unsigned FloorSize = PowerOf2Floor(DstSize);
1184                 return std::make_pair(0, LLT::scalar(FloorSize));
1185               }
1186 
1187               if (DstSize > 32 && (DstSize % 32 != 0)) {
1188                 // FIXME: Need a way to specify non-extload of larger size if
1189                 // suitably aligned.
1190                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1191               }
1192 
1193               unsigned MaxSize = maxSizeForAddrSpace(ST,
1194                                                      PtrTy.getAddressSpace(),
1195                                                      Op == G_LOAD);
1196               if (MemSize > MaxSize)
1197                 return std::make_pair(0, LLT::scalar(MaxSize));
1198 
1199               unsigned Align = Query.MMODescrs[0].AlignInBits;
1200               return std::make_pair(0, LLT::scalar(Align));
1201             })
1202         .fewerElementsIf(
1203             [=](const LegalityQuery &Query) -> bool {
1204               return Query.Types[0].isVector() &&
1205                      needToSplitMemOp(Query, Op == G_LOAD);
1206             },
1207             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1208               const LLT DstTy = Query.Types[0];
1209               const LLT PtrTy = Query.Types[1];
1210 
1211               LLT EltTy = DstTy.getElementType();
1212               unsigned MaxSize = maxSizeForAddrSpace(ST,
1213                                                      PtrTy.getAddressSpace(),
1214                                                      Op == G_LOAD);
1215 
1216               // FIXME: Handle widened to power of 2 results better. This ends
1217               // up scalarizing.
1218               // FIXME: 3 element stores scalarized on SI
1219 
1220               // Split if it's too large for the address space.
1221               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1222               if (MemSize > MaxSize) {
1223                 unsigned NumElts = DstTy.getNumElements();
1224                 unsigned EltSize = EltTy.getSizeInBits();
1225 
1226                 if (MaxSize % EltSize == 0) {
1227                   return std::make_pair(
1228                       0, LLT::scalarOrVector(
1229                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1230                 }
1231 
1232                 unsigned NumPieces = MemSize / MaxSize;
1233 
1234                 // FIXME: Refine when odd breakdowns handled
1235                 // The scalars will need to be re-legalized.
1236                 if (NumPieces == 1 || NumPieces >= NumElts ||
1237                     NumElts % NumPieces != 0)
1238                   return std::make_pair(0, EltTy);
1239 
1240                 return std::make_pair(
1241                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1242               }
1243 
1244               // FIXME: We could probably handle weird extending loads better.
1245               if (DstTy.getSizeInBits() > MemSize)
1246                 return std::make_pair(0, EltTy);
1247 
1248               unsigned EltSize = EltTy.getSizeInBits();
1249               unsigned DstSize = DstTy.getSizeInBits();
1250               if (!isPowerOf2_32(DstSize)) {
1251                 // We're probably decomposing an odd sized store. Try to split
1252                 // to the widest type. TODO: Account for alignment. As-is it
1253                 // should be OK, since the new parts will be further legalized.
1254                 unsigned FloorSize = PowerOf2Floor(DstSize);
1255                 return std::make_pair(
1256                     0, LLT::scalarOrVector(
1257                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1258               }
1259 
1260               // Need to split because of alignment.
1261               unsigned Align = Query.MMODescrs[0].AlignInBits;
1262               if (EltSize > Align &&
1263                   (EltSize / Align < DstTy.getNumElements())) {
1264                 return std::make_pair(
1265                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1266               }
1267 
1268               // May need relegalization for the scalars.
1269               return std::make_pair(0, EltTy);
1270             })
1271     .minScalar(0, S32)
1272     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1273     .widenScalarToNextPow2(0)
1274     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1275     .lower();
1276   }
1277 
1278   // FIXME: Unaligned accesses not lowered.
1279   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1280                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1281                                                   {S32, GlobalPtr, S16, 2 * 8},
1282                                                   {S32, LocalPtr, S8, 8},
1283                                                   {S32, LocalPtr, S16, 16},
1284                                                   {S32, PrivatePtr, S8, 8},
1285                                                   {S32, PrivatePtr, S16, 16},
1286                                                   {S32, ConstantPtr, S8, 8},
1287                                                   {S32, ConstantPtr, S16, 2 * 8}})
1288                        .legalIf(
1289                          [=](const LegalityQuery &Query) -> bool {
1290                            return isLoadStoreLegal(ST, Query);
1291                          });
1292 
1293   if (ST.hasFlatAddressSpace()) {
1294     ExtLoads.legalForTypesWithMemDesc(
1295         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1296   }
1297 
1298   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1299   // 64-bits.
1300   //
1301   // TODO: Should generalize bitcast action into coerce, which will also cover
1302   // inserting addrspacecasts.
1303   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1304 
1305   ExtLoads.clampScalar(0, S32, S32)
1306           .widenScalarToNextPow2(0)
1307           .lower();
1308 
1309   auto &Atomics = getActionDefinitionsBuilder(
1310     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1311      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1312      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1313      G_ATOMICRMW_UMIN})
1314     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1315                {S64, GlobalPtr}, {S64, LocalPtr},
1316                {S32, RegionPtr}, {S64, RegionPtr}});
1317   if (ST.hasFlatAddressSpace()) {
1318     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1319   }
1320 
1321   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1322   if (ST.hasLDSFPAtomicAdd()) {
1323     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1324     if (ST.hasGFX90AInsts())
1325       Atomic.legalFor({{S64, LocalPtr}});
1326   }
1327   if (ST.hasAtomicFaddInsts())
1328     Atomic.legalFor({{S32, GlobalPtr}});
1329 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1332   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1333     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1334                 {S32, FlatPtr}, {S64, FlatPtr}})
1335     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1336                {S32, RegionPtr}, {S64, RegionPtr}});
1337   // TODO: Pointer types, any 32-bit or 64-bit vector
1338 
1339   // Condition should be s32 for scalar, s1 for vector.
1340   getActionDefinitionsBuilder(G_SELECT)
1341       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1342                                  LocalPtr, FlatPtr, PrivatePtr,
1343                                  LLT::fixed_vector(2, LocalPtr),
1344                                  LLT::fixed_vector(2, PrivatePtr)},
1345                                 {S1, S32})
1346       .clampScalar(0, S16, S64)
1347       .scalarize(1)
1348       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1349       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1350       .clampMaxNumElements(0, S32, 2)
1351       .clampMaxNumElements(0, LocalPtr, 2)
1352       .clampMaxNumElements(0, PrivatePtr, 2)
1353       .scalarize(0)
1354       .widenScalarToNextPow2(0)
1355       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1356 
1357   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1358   // be more flexible with the shift amount type.
1359   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1360     .legalFor({{S32, S32}, {S64, S32}});
1361   if (ST.has16BitInsts()) {
1362     if (ST.hasVOP3PInsts()) {
1363       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1364             .clampMaxNumElements(0, S16, 2);
1365     } else
1366       Shifts.legalFor({{S16, S16}});
1367 
1368     // TODO: Support 16-bit shift amounts for all types
1369     Shifts.widenScalarIf(
1370       [=](const LegalityQuery &Query) {
1371         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1372         // 32-bit amount.
1373         const LLT ValTy = Query.Types[0];
1374         const LLT AmountTy = Query.Types[1];
1375         return ValTy.getSizeInBits() <= 16 &&
1376                AmountTy.getSizeInBits() < 16;
1377       }, changeTo(1, S16));
1378     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1379     Shifts.clampScalar(1, S32, S32);
1380     Shifts.clampScalar(0, S16, S64);
1381     Shifts.widenScalarToNextPow2(0, 16);
1382 
1383     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384       .minScalar(0, S16)
1385       .scalarize(0)
1386       .lower();
1387   } else {
1388     // Make sure we legalize the shift amount type first, as the general
1389     // expansion for the shifted type will produce much worse code if it hasn't
1390     // been truncated already.
1391     Shifts.clampScalar(1, S32, S32);
1392     Shifts.clampScalar(0, S32, S64);
1393     Shifts.widenScalarToNextPow2(0, 32);
1394 
1395     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1396       .minScalar(0, S32)
1397       .scalarize(0)
1398       .lower();
1399   }
1400   Shifts.scalarize(0);
1401 
1402   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1403     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1404     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1405     unsigned IdxTypeIdx = 2;
1406 
1407     getActionDefinitionsBuilder(Op)
1408       .customIf([=](const LegalityQuery &Query) {
1409           const LLT EltTy = Query.Types[EltTypeIdx];
1410           const LLT VecTy = Query.Types[VecTypeIdx];
1411           const LLT IdxTy = Query.Types[IdxTypeIdx];
1412           const unsigned EltSize = EltTy.getSizeInBits();
1413           return (EltSize == 32 || EltSize == 64) &&
1414                   VecTy.getSizeInBits() % 32 == 0 &&
1415                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1416                   IdxTy.getSizeInBits() == 32;
1417         })
1418       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1419                  bitcastToVectorElement32(VecTypeIdx))
1420       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1421       .bitcastIf(
1422         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1423         [=](const LegalityQuery &Query) {
1424           // For > 64-bit element types, try to turn this into a 64-bit
1425           // element vector since we may be able to do better indexing
1426           // if this is scalar. If not, fall back to 32.
1427           const LLT EltTy = Query.Types[EltTypeIdx];
1428           const LLT VecTy = Query.Types[VecTypeIdx];
1429           const unsigned DstEltSize = EltTy.getSizeInBits();
1430           const unsigned VecSize = VecTy.getSizeInBits();
1431 
1432           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1433           return std::make_pair(
1434               VecTypeIdx,
1435               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1436         })
1437       .clampScalar(EltTypeIdx, S32, S64)
1438       .clampScalar(VecTypeIdx, S32, S64)
1439       .clampScalar(IdxTypeIdx, S32, S32)
1440       .clampMaxNumElements(VecTypeIdx, S32, 32)
1441       // TODO: Clamp elements for 64-bit vectors?
1442       // It should only be necessary with variable indexes.
1443       // As a last resort, lower to the stack
1444       .lower();
1445   }
1446 
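  // Reject extracts whose result type does not match the vector's element
  // type.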
1447   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1448     .unsupportedIf([=](const LegalityQuery &Query) {
1449         const LLT &EltTy = Query.Types[1].getElementType();
1450         return Query.Types[0] != EltTy;
1451       });
1452 
1453   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1454     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1455     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1456 
1457     // FIXME: Doesn't handle extract of illegal sizes.
1458     getActionDefinitionsBuilder(Op)
1459       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1460       // FIXME: Multiples of 16 should not be legal.
1461       .legalIf([=](const LegalityQuery &Query) {
1462           const LLT BigTy = Query.Types[BigTyIdx];
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (BigTy.getSizeInBits() % 32 == 0) &&
1465                  (LitTy.getSizeInBits() % 16 == 0);
1466         })
1467       .widenScalarIf(
1468         [=](const LegalityQuery &Query) {
1469           const LLT BigTy = Query.Types[BigTyIdx];
1470           return (BigTy.getScalarSizeInBits() < 16);
1471         },
1472         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1473       .widenScalarIf(
1474         [=](const LegalityQuery &Query) {
1475           const LLT LitTy = Query.Types[LitTyIdx];
1476           return (LitTy.getScalarSizeInBits() < 16);
1477         },
1478         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1479       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1480       .widenScalarToNextPow2(BigTyIdx, 32);
1481 
1482   }
1483 
1484   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1485     .legalForCartesianProduct(AllS32Vectors, {S32})
1486     .legalForCartesianProduct(AllS64Vectors, {S64})
1487     .clampNumElements(0, V16S32, V32S32)
1488     .clampNumElements(0, V2S64, V16S64)
1489     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1490 
1491   if (ST.hasScalarPackInsts()) {
1492     BuildVector
1493       // FIXME: Should probably widen s1 vectors straight to s32
1494       .minScalarOrElt(0, S16)
1495       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1496       .minScalar(1, S32);
1497 
1498     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1499       .legalFor({V2S16, S32})
1500       .lower();
1502   } else {
1503     BuildVector.customFor({V2S16, S16});
1504     BuildVector.minScalarOrElt(0, S32);
1505 
1506     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1507       .customFor({V2S16, S32})
1508       .lower();
1509   }
1510 
1511   BuildVector.legalIf(isRegisterType(0));
1512 
1513   // FIXME: Clamp maximum size
1514   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1515     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516     .clampMaxNumElements(0, S32, 32)
1517     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1518     .clampMaxNumElements(0, S16, 64);
1519 
1520   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1521   // pre-legalize.
1522   if (ST.hasVOP3PInsts()) {
1523     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1524       .customFor({V2S16, V2S16})
1525       .lower();
1526   } else
1527     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1528 
1529   // Merge/Unmerge
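  // For G_MERGE_VALUES the wide value is the result (type index 0) and the
  // pieces are type index 1; G_UNMERGE_VALUES is the reverse.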
1530   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1531     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1532     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1533 
1534     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1535       const LLT Ty = Query.Types[TypeIdx];
1536       if (Ty.isVector()) {
1537         const LLT &EltTy = Ty.getElementType();
1538         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1539           return true;
1540         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1541           return true;
1542       }
1543       return false;
1544     };
1545 
1546     auto &Builder = getActionDefinitionsBuilder(Op)
1547       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1548       .lowerFor({{S16, V2S16}})
1549       .lowerIf([=](const LegalityQuery &Query) {
1550           const LLT BigTy = Query.Types[BigTyIdx];
1551           return BigTy.getSizeInBits() == 32;
1552         })
1553       // Try to widen to s16 first for small types.
1554       // TODO: Only do this on targets with legal s16 shifts
1555       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1556       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1557       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1558       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1559                            elementTypeIs(1, S16)),
1560                        changeTo(1, V2S16))
1561       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1562       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1563       // valid.
1564       .clampScalar(LitTyIdx, S32, S512)
1565       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1566       // Break up vectors with weird elements into scalars
1567       .fewerElementsIf(
1568         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1569         scalarize(0))
1570       .fewerElementsIf(
1571         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1572         scalarize(1))
1573       .clampScalar(BigTyIdx, S32, MaxScalar);
1574 
1575     if (Op == G_MERGE_VALUES) {
1576       Builder.widenScalarIf(
1577         // TODO: Use 16-bit shifts if legal for 8-bit values?
1578         [=](const LegalityQuery &Query) {
1579           const LLT Ty = Query.Types[LitTyIdx];
1580           return Ty.getSizeInBits() < 32;
1581         },
1582         changeTo(LitTyIdx, S32));
1583     }
1584 
1585     Builder.widenScalarIf(
1586       [=](const LegalityQuery &Query) {
1587         const LLT Ty = Query.Types[BigTyIdx];
1588         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1589           Ty.getSizeInBits() % 16 != 0;
1590       },
1591       [=](const LegalityQuery &Query) {
1592         // Pick the next power of 2, or a multiple of 64 for sizes over 128,
1593         // whichever is smaller.
1594         const LLT &Ty = Query.Types[BigTyIdx];
1595         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1596         if (NewSizeInBits >= 256) {
1597           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1598           if (RoundedTo < NewSizeInBits)
1599             NewSizeInBits = RoundedTo;
1600         }
1601         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1602       })
1603       // Any vectors left are the wrong size. Scalarize them.
1604       .scalarize(0)
1605       .scalarize(1);
1606   }
1607 
1608   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1609   // RegBankSelect.
1610   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1611     .legalFor({{S32}, {S64}});
1612 
1613   if (ST.hasVOP3PInsts()) {
1614     SextInReg.lowerFor({{V2S16}})
1615       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1616       // get more vector shift opportunities, since we'll get those when
1617       // expanded.
1618       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1619   } else if (ST.has16BitInsts()) {
1620     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1621   } else {
1622     // Prefer to promote to s32 before lowering if we don't have 16-bit
1623     // shifts. This avoids a lot of intermediate truncate and extend operations.
1624     SextInReg.lowerFor({{S32}, {S64}});
1625   }
1626 
1627   SextInReg
1628     .scalarize(0)
1629     .clampScalar(0, S32, S64)
1630     .lower();
1631 
1632   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633     .scalarize(0)
1634     .lower();
1635 
1636   // TODO: Only try to form v2s16 with legal packed instructions.
1637   getActionDefinitionsBuilder(G_FSHR)
1638     .legalFor({{S32, S32}})
1639     .lowerFor({{V2S16, V2S16}})
1640     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641     .scalarize(0)
1642     .lower();
1643 
1644   if (ST.hasVOP3PInsts()) {
1645     getActionDefinitionsBuilder(G_FSHL)
1646       .lowerFor({{V2S16, V2S16}})
1647       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648       .scalarize(0)
1649       .lower();
1650   } else {
1651     getActionDefinitionsBuilder(G_FSHL)
1652       .scalarize(0)
1653       .lower();
1654   }
1655 
1656   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657     .legalFor({S64});
1658 
1659   getActionDefinitionsBuilder(G_FENCE)
1660     .alwaysLegal();
1661 
1662   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663       .scalarize(0)
1664       .minScalar(0, S32)
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668       .legalFor({{S32, S32}, {S64, S32}})
1669       .clampScalar(1, S32, S32)
1670       .clampScalar(0, S32, S64)
1671       .widenScalarToNextPow2(0)
1672       .scalarize(0);
1673 
1674   getActionDefinitionsBuilder({
1675       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1676       G_FCOPYSIGN,
1677 
1678       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679       G_ATOMICRMW_NAND,
1680       G_ATOMICRMW_FSUB,
1681       G_READ_REGISTER,
1682       G_WRITE_REGISTER,
1683 
1684       G_SADDO, G_SSUBO,
1685 
1686        // TODO: Implement
1687       G_FMINIMUM, G_FMAXIMUM}).lower();
1688 
1689   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690       .lower();
1691 
1692   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1693         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695     .unsupported();
1696 
1697   getLegacyLegalizerInfo().computeTables();
1698   verify(*ST.getInstrInfo());
1699 }
1700 
1701 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1702                                          MachineInstr &MI) const {
1703   MachineIRBuilder &B = Helper.MIRBuilder;
1704   MachineRegisterInfo &MRI = *B.getMRI();
1705 
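  // Returning true means the instruction was legalized (or needed no change);
  // returning false makes the legalizer report a failure.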
1706   switch (MI.getOpcode()) {
1707   case TargetOpcode::G_ADDRSPACE_CAST:
1708     return legalizeAddrSpaceCast(MI, MRI, B);
1709   case TargetOpcode::G_FRINT:
1710     return legalizeFrint(MI, MRI, B);
1711   case TargetOpcode::G_FCEIL:
1712     return legalizeFceil(MI, MRI, B);
1713   case TargetOpcode::G_FREM:
1714     return legalizeFrem(MI, MRI, B);
1715   case TargetOpcode::G_INTRINSIC_TRUNC:
1716     return legalizeIntrinsicTrunc(MI, MRI, B);
1717   case TargetOpcode::G_SITOFP:
1718     return legalizeITOFP(MI, MRI, B, true);
1719   case TargetOpcode::G_UITOFP:
1720     return legalizeITOFP(MI, MRI, B, false);
1721   case TargetOpcode::G_FPTOSI:
1722     return legalizeFPTOI(MI, MRI, B, true);
1723   case TargetOpcode::G_FPTOUI:
1724     return legalizeFPTOI(MI, MRI, B, false);
1725   case TargetOpcode::G_FMINNUM:
1726   case TargetOpcode::G_FMAXNUM:
1727   case TargetOpcode::G_FMINNUM_IEEE:
1728   case TargetOpcode::G_FMAXNUM_IEEE:
1729     return legalizeMinNumMaxNum(Helper, MI);
1730   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1731     return legalizeExtractVectorElt(MI, MRI, B);
1732   case TargetOpcode::G_INSERT_VECTOR_ELT:
1733     return legalizeInsertVectorElt(MI, MRI, B);
1734   case TargetOpcode::G_SHUFFLE_VECTOR:
1735     return legalizeShuffleVector(MI, MRI, B);
1736   case TargetOpcode::G_FSIN:
1737   case TargetOpcode::G_FCOS:
1738     return legalizeSinCos(MI, MRI, B);
1739   case TargetOpcode::G_GLOBAL_VALUE:
1740     return legalizeGlobalValue(MI, MRI, B);
1741   case TargetOpcode::G_LOAD:
1742   case TargetOpcode::G_SEXTLOAD:
1743   case TargetOpcode::G_ZEXTLOAD:
1744     return legalizeLoad(Helper, MI);
1745   case TargetOpcode::G_FMAD:
1746     return legalizeFMad(MI, MRI, B);
1747   case TargetOpcode::G_FDIV:
1748     return legalizeFDIV(MI, MRI, B);
1749   case TargetOpcode::G_UDIV:
1750   case TargetOpcode::G_UREM:
1751   case TargetOpcode::G_UDIVREM:
1752     return legalizeUnsignedDIV_REM(MI, MRI, B);
1753   case TargetOpcode::G_SDIV:
1754   case TargetOpcode::G_SREM:
1755   case TargetOpcode::G_SDIVREM:
1756     return legalizeSignedDIV_REM(MI, MRI, B);
1757   case TargetOpcode::G_ATOMIC_CMPXCHG:
1758     return legalizeAtomicCmpXChg(MI, MRI, B);
1759   case TargetOpcode::G_FLOG:
1760     return legalizeFlog(MI, B, numbers::ln2f);
1761   case TargetOpcode::G_FLOG10:
1762     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1763   case TargetOpcode::G_FEXP:
1764     return legalizeFExp(MI, B);
1765   case TargetOpcode::G_FPOW:
1766     return legalizeFPow(MI, B);
1767   case TargetOpcode::G_FFLOOR:
1768     return legalizeFFloor(MI, MRI, B);
1769   case TargetOpcode::G_BUILD_VECTOR:
1770     return legalizeBuildVector(MI, MRI, B);
1771   case TargetOpcode::G_CTLZ:
1772   case TargetOpcode::G_CTTZ:
1773     return legalizeCTLZ_CTTZ(MI, MRI, B);
1774   default:
1775     return false;
1776   }
1777 
1778   llvm_unreachable("expected switch to return");
1779 }
1780 
1781 Register AMDGPULegalizerInfo::getSegmentAperture(
1782   unsigned AS,
1783   MachineRegisterInfo &MRI,
1784   MachineIRBuilder &B) const {
1785   MachineFunction &MF = B.getMF();
1786   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1787   const LLT S32 = LLT::scalar(32);
1788 
1789   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1790 
1791   if (ST.hasApertureRegs()) {
1792     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1793     // getreg.
1794     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1795         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1796         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1797     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1798         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1799         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
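    // Pack the hardware register ID, field offset, and width - 1 into the
    // immediate encoding expected by S_GETREG_B32.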
1800     unsigned Encoding =
1801         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1802         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1803         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1804 
1805     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1806 
1807     B.buildInstr(AMDGPU::S_GETREG_B32)
1808       .addDef(GetReg)
1809       .addImm(Encoding);
1810     MRI.setType(GetReg, S32);
1811 
1812     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1813     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1814   }
1815 
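  // Without aperture registers, load the high half of the aperture from the
  // amd_queue_t reached through the implicit queue pointer argument.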
1816   Register QueuePtr = MRI.createGenericVirtualRegister(
1817     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1818 
1819   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1820     return Register();
1821 
1822   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1823   // private_segment_aperture_base_hi.
1824   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1825 
1826   // TODO: can we be smarter about machine pointer info?
1827   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1828   MachineMemOperand *MMO = MF.getMachineMemOperand(
1829       PtrInfo,
1830       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1831           MachineMemOperand::MOInvariant,
1832       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1833 
1834   Register LoadAddr;
1835 
1836   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1837   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1838 }
1839 
1840 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1841   MachineInstr &MI, MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B) const {
1843   MachineFunction &MF = B.getMF();
1844 
1845   const LLT S32 = LLT::scalar(32);
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src = MI.getOperand(1).getReg();
1848 
1849   LLT DstTy = MRI.getType(Dst);
1850   LLT SrcTy = MRI.getType(Src);
1851   unsigned DestAS = DstTy.getAddressSpace();
1852   unsigned SrcAS = SrcTy.getAddressSpace();
1853 
1854   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1855   // vector element.
1856   assert(!DstTy.isVector());
1857 
1858   const AMDGPUTargetMachine &TM
1859     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1860 
1861   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1862     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1863     return true;
1864   }
1865 
1866   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1867     // Truncate.
1868     B.buildExtract(Dst, Src, 0);
1869     MI.eraseFromParent();
1870     return true;
1871   }
1872 
1873   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1874     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1875     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1876 
1877     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1878     // another pointer. Merge operands are required to be the same type, but
1879     // creating an extra ptrtoint would be kind of pointless.
1880     auto HighAddr = B.buildConstant(
1881       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1882     B.buildMerge(Dst, {Src, HighAddr});
1883     MI.eraseFromParent();
1884     return true;
1885   }
1886 
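  // Casting flat to LDS/private: the segment pointer is the low 32 bits of
  // the flat pointer, and the flat null pointer maps to the segment's null
  // value.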
1887   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1888     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1889            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1890     unsigned NullVal = TM.getNullPointerValue(DestAS);
1891 
1892     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1893     auto FlatNull = B.buildConstant(SrcTy, 0);
1894 
1895     // Extract low 32-bits of the pointer.
1896     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1897 
1898     auto CmpRes =
1899         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1900     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1901 
1902     MI.eraseFromParent();
1903     return true;
1904   }
1905 
1906   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1907     return false;
1908 
1909   if (!ST.hasFlatAddressSpace())
1910     return false;
1911 
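  // Casting LDS/private to flat: place the 32-bit segment offset in the low
  // half and the aperture in the high half of the 64-bit flat pointer, mapping
  // the segment null value to the flat null pointer.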
1912   auto SegmentNull =
1913       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1914   auto FlatNull =
1915       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1916 
1917   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1918   if (!ApertureReg.isValid())
1919     return false;
1920 
1921   auto CmpRes =
1922       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1923 
1924   // Coerce the type of the low half of the result so we can use merge_values.
1925   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1926 
1927   // TODO: Should we allow mismatched types but matching sizes in merges to
1928   // avoid the ptrtoint?
1929   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1930   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1931 
1932   MI.eraseFromParent();
1933   return true;
1934 }
1935 
1936 bool AMDGPULegalizerInfo::legalizeFrint(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B) const {
1939   Register Src = MI.getOperand(1).getReg();
1940   LLT Ty = MRI.getType(Src);
1941   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1942 
1943   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1944   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1945 
1946   auto C1 = B.buildFConstant(Ty, C1Val);
1947   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1948 
1949   // TODO: Should this propagate fast-math-flags?
1950   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1951   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1952 
1953   auto C2 = B.buildFConstant(Ty, C2Val);
1954   auto Fabs = B.buildFAbs(Ty, Src);
1955 
1956   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1957   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958   MI.eraseFromParent();
1959   return true;
1960 }
1961 
1962 bool AMDGPULegalizerInfo::legalizeFceil(
1963   MachineInstr &MI, MachineRegisterInfo &MRI,
1964   MachineIRBuilder &B) const {
1965 
1966   const LLT S1 = LLT::scalar(1);
1967   const LLT S64 = LLT::scalar(64);
1968 
1969   Register Src = MI.getOperand(1).getReg();
1970   assert(MRI.getType(Src) == S64);
1971 
1972   // result = trunc(src)
1973   // if (src > 0.0 && src != result)
1974   //   result += 1.0
1975 
1976   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1977 
1978   const auto Zero = B.buildFConstant(S64, 0.0);
1979   const auto One = B.buildFConstant(S64, 1.0);
1980   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1981   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1982   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1983   auto Add = B.buildSelect(S64, And, One, Zero);
1984 
1985   // TODO: Should this propagate fast-math-flags?
1986   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeFrem(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B) const {
1993   Register DstReg = MI.getOperand(0).getReg();
1994   Register Src0Reg = MI.getOperand(1).getReg();
1995   Register Src1Reg = MI.getOperand(2).getReg();
1996   auto Flags = MI.getFlags();
1997   LLT Ty = MRI.getType(DstReg);
1998
1999   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003   MI.eraseFromParent();
2004   return true;
2005 }
2006 
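// Extract the unbiased exponent of an f64 from its high 32 bits: pull the
// 11-bit exponent field out with ubfe and subtract the bias of 1023.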
2007 static MachineInstrBuilder extractF64Exponent(Register Hi,
2008                                               MachineIRBuilder &B) {
2009   const unsigned FractBits = 52;
2010   const unsigned ExpBits = 11;
2011   LLT S32 = LLT::scalar(32);
2012 
2013   auto Const0 = B.buildConstant(S32, FractBits - 32);
2014   auto Const1 = B.buildConstant(S32, ExpBits);
2015 
2016   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017     .addUse(Hi)
2018     .addUse(Const0.getReg(0))
2019     .addUse(Const1.getReg(0));
2020 
2021   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2022 }
2023 
2024 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B) const {
2027   const LLT S1 = LLT::scalar(1);
2028   const LLT S32 = LLT::scalar(32);
2029   const LLT S64 = LLT::scalar(64);
2030 
2031   Register Src = MI.getOperand(1).getReg();
2032   assert(MRI.getType(Src) == S64);
2033 
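  // Implement trunc(f64) with integer operations by clearing the fraction
  // bits below the binary point. A negative exponent truncates to +/-0.0
  // (keeping only the sign bit), and an exponent above 51 means there are no
  // fractional bits to clear.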
2034   // TODO: Should this use extract since the low half is unused?
2035   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2036   Register Hi = Unmerge.getReg(1);
2037 
2038   // Extract the upper half, since this is where we will find the sign and
2039   // exponent.
2040   auto Exp = extractF64Exponent(Hi, B);
2041 
2042   const unsigned FractBits = 52;
2043 
2044   // Extract the sign bit.
2045   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2046   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2047 
2048   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2049 
2050   const auto Zero32 = B.buildConstant(S32, 0);
2051 
2052   // Extend back to 64-bits.
2053   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2054 
2055   auto Shr = B.buildAShr(S64, FractMask, Exp);
2056   auto Not = B.buildNot(S64, Shr);
2057   auto Tmp0 = B.buildAnd(S64, Src, Not);
2058   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2059 
2060   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2061   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2062 
2063   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2064   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2065   MI.eraseFromParent();
2066   return true;
2067 }
2068 
2069 bool AMDGPULegalizerInfo::legalizeITOFP(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B, bool Signed) const {
2072 
2073   Register Dst = MI.getOperand(0).getReg();
2074   Register Src = MI.getOperand(1).getReg();
2075 
2076   const LLT S64 = LLT::scalar(64);
2077   const LLT S32 = LLT::scalar(32);
2078 
2079   assert(MRI.getType(Src) == S64);
2080 
2081   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2082   auto ThirtyTwo = B.buildConstant(S32, 32);
2083 
2084   if (MRI.getType(Dst) == S64) {
2085     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2086                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2087 
2088     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2089     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2090                      .addUse(CvtHi.getReg(0))
2091                      .addUse(ThirtyTwo.getReg(0));
2092 
2093     // TODO: Should this propagate fast-math-flags?
2094     B.buildFAdd(Dst, LdExp, CvtLo);
2095     MI.eraseFromParent();
2096     return true;
2097   }
2098 
2099   assert(MRI.getType(Dst) == S32);
2100 
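  // Converting a 64-bit integer to f32: shift the value left so the
  // significant bits land in the high word, fold any nonzero low bits into a
  // sticky bit so rounding still sees them, convert the high word, and scale
  // the result back with ldexp.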
2101   auto One = B.buildConstant(S32, 1);
2102 
2103   MachineInstrBuilder ShAmt;
2104   if (Signed) {
2105     auto ThirtyOne = B.buildConstant(S32, 31);
2106     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2107     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2108     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2109     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2110                                /*HasSideEffects=*/false)
2111                   .addUse(Unmerge.getReg(1));
2112     auto LS2 = B.buildSub(S32, LS, One);
2113     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2114   } else
2115     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2116   auto Norm = B.buildShl(S64, Src, ShAmt);
2117   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2118   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2119   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2120   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2121   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2122   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2123                    /*HasSideEffects=*/false)
2124       .addUse(FVal.getReg(0))
2125       .addUse(Scale.getReg(0));
2126   MI.eraseFromParent();
2127   return true;
2128 }
2129 
2130 // TODO: Copied from DAG implementation. Verify logic and document how this
2131 // actually works.
2132 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2133                                         MachineRegisterInfo &MRI,
2134                                         MachineIRBuilder &B,
2135                                         bool Signed) const {
2136 
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Src = MI.getOperand(1).getReg();
2139 
2140   const LLT S64 = LLT::scalar(64);
2141   const LLT S32 = LLT::scalar(32);
2142 
2143   const LLT SrcLT = MRI.getType(Src);
2144   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2145 
2146   unsigned Flags = MI.getFlags();
2147 
2148   // The basic idea of converting a floating point number into a pair of 32-bit
2149   // integers is illustrated as follows:
2150   //
2151   //     tf := trunc(val);
2152   //    hif := floor(tf * 2^-32);
2153   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2154   //     hi := fptoi(hif);
2155   //     lo := fptoi(lof);
2156   //
2157   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2158   MachineInstrBuilder Sign;
2159   if (Signed && SrcLT == S32) {
2160     // However, a 32-bit floating point number has only 23 bits mantissa and
2161     // it's not enough to hold all the significant bits of `lof` if val is
2162     // negative. To avoid the loss of precision, we need to take the absolute
2163     // value after truncating and flip the result back based on the original
2164     // signedness.
2165     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2166     Trunc = B.buildFAbs(S32, Trunc, Flags);
2167   }
2168   MachineInstrBuilder K0, K1;
2169   if (SrcLT == S64) {
2170     K0 = B.buildFConstant(S64,
2171                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2172     K1 = B.buildFConstant(S64,
2173                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2174   } else {
2175     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2176     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2177   }
2178 
2179   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2180   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2181   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2182 
2183   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2184                                      : B.buildFPTOUI(S32, FloorMul);
2185   auto Lo = B.buildFPTOUI(S32, Fma);
2186 
2187   if (Signed && SrcLT == S32) {
2188     // Flip the result based on the signedness, which is either all 0s or 1s.
2189     Sign = B.buildMerge(S64, {Sign, Sign});
2190     // r := xor({lo, hi}, sign) - sign;
2191     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2192   } else
2193     B.buildMerge(Dst, {Lo, Hi});
2194   MI.eraseFromParent();
2195 
2196   return true;
2197 }
2198 
2199 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2200                                                MachineInstr &MI) const {
2201   MachineFunction &MF = Helper.MIRBuilder.getMF();
2202   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2203 
2204   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2205                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2206 
2207   // With ieee_mode disabled, the instructions have the correct behavior
2208   // already for G_FMINNUM/G_FMAXNUM
2209   if (!MFI->getMode().IEEE)
2210     return !IsIEEEOp;
2211 
2212   if (IsIEEEOp)
2213     return true;
2214 
2215   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2216 }
2217 
2218 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2219   MachineInstr &MI, MachineRegisterInfo &MRI,
2220   MachineIRBuilder &B) const {
2221   // TODO: Should move some of this into LegalizerHelper.
2222 
2223   // TODO: Promote dynamic indexing of s16 to s32
2224 
2225   // FIXME: Artifact combiner probably should have replaced the truncated
2226   // constant before this, so we shouldn't need
2227   // getIConstantVRegValWithLookThrough.
2228   Optional<ValueAndVReg> MaybeIdxVal =
2229       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231     return true;
2232   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2233 
2234   Register Dst = MI.getOperand(0).getReg();
2235   Register Vec = MI.getOperand(1).getReg();
2236 
2237   LLT VecTy = MRI.getType(Vec);
2238   LLT EltTy = VecTy.getElementType();
2239   assert(EltTy == MRI.getType(Dst));
2240 
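  // A constant index that is out of bounds reads an undefined value, so fold
  // the extract to an implicit def.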
2241   if (IdxVal < VecTy.getNumElements())
2242     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2243   else
2244     B.buildUndef(Dst);
2245 
2246   MI.eraseFromParent();
2247   return true;
2248 }
2249 
2250 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2251   MachineInstr &MI, MachineRegisterInfo &MRI,
2252   MachineIRBuilder &B) const {
2253   // TODO: Should move some of this into LegalizerHelper.
2254 
2255   // TODO: Promote dynamic indexing of s16 to s32
2256 
2257   // FIXME: Artifact combiner probably should have replaced the truncated
2258   // constant before this, so we shouldn't need
2259   // getIConstantVRegValWithLookThrough.
2260   Optional<ValueAndVReg> MaybeIdxVal =
2261       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2263     return true;
2264 
2265   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2266   Register Dst = MI.getOperand(0).getReg();
2267   Register Vec = MI.getOperand(1).getReg();
2268   Register Ins = MI.getOperand(2).getReg();
2269 
2270   LLT VecTy = MRI.getType(Vec);
2271   LLT EltTy = VecTy.getElementType();
2272   assert(EltTy == MRI.getType(Ins));
2273 
2274   if (IdxVal < VecTy.getNumElements())
2275     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2276   else
2277     B.buildUndef(Dst);
2278 
2279   MI.eraseFromParent();
2280   return true;
2281 }
2282 
2283 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2284   MachineInstr &MI, MachineRegisterInfo &MRI,
2285   MachineIRBuilder &B) const {
2286   const LLT V2S16 = LLT::fixed_vector(2, 16);
2287 
2288   Register Dst = MI.getOperand(0).getReg();
2289   Register Src0 = MI.getOperand(1).getReg();
2290   LLT DstTy = MRI.getType(Dst);
2291   LLT SrcTy = MRI.getType(Src0);
2292 
2293   if (SrcTy == V2S16 && DstTy == V2S16 &&
2294       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2295     return true;
2296 
2297   MachineIRBuilder HelperBuilder(MI);
2298   GISelObserverWrapper DummyObserver;
2299   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2300   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2301 }
2302 
2303 bool AMDGPULegalizerInfo::legalizeSinCos(
2304   MachineInstr &MI, MachineRegisterInfo &MRI,
2305   MachineIRBuilder &B) const {
2306 
2307   Register DstReg = MI.getOperand(0).getReg();
2308   Register SrcReg = MI.getOperand(1).getReg();
2309   LLT Ty = MRI.getType(DstReg);
2310   unsigned Flags = MI.getFlags();
2311 
2312   Register TrigVal;
2313   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2314   if (ST.hasTrigReducedRange()) {
2315     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2316     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2317       .addUse(MulVal.getReg(0))
2318       .setMIFlags(Flags).getReg(0);
2319   } else
2320     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2321 
2322   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2323     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2324   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2325     .addUse(TrigVal)
2326     .setMIFlags(Flags);
2327   MI.eraseFromParent();
2328   return true;
2329 }
2330 
2331 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2332                                                   MachineIRBuilder &B,
2333                                                   const GlobalValue *GV,
2334                                                   int64_t Offset,
2335                                                   unsigned GAFlags) const {
2336   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2337   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2338   // to the following code sequence:
2339   //
2340   // For constant address space:
2341   //   s_getpc_b64 s[0:1]
2342   //   s_add_u32 s0, s0, $symbol
2343   //   s_addc_u32 s1, s1, 0
2344   //
2345   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346   //   a fixup or relocation is emitted to replace $symbol with a literal
2347   //   constant, which is a pc-relative offset from the encoding of the $symbol
2348   //   operand to the global variable.
2349   //
2350   // For global address space:
2351   //   s_getpc_b64 s[0:1]
2352   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2353   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2354   //
2355   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2356   //   fixups or relocations are emitted to replace $symbol@*@lo and
2357   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2358   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2359   //   operand to the global variable.
2360   //
2361   // What we want here is an offset from the value returned by s_getpc
2362   // (which is the address of the s_add_u32 instruction) to the global
2363   // variable, but since the encoding of $symbol starts 4 bytes after the start
2364   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2365   // small. This requires us to add 4 to the global variable offset in order to
2366   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2367   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2368   // instruction.
2369 
2370   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2371 
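  // For a 32-bit result pointer, still form the full 64-bit PC-relative
  // address in a scratch register and extract the low half at the end.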
2372   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2373     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2374 
2375   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2376     .addDef(PCReg);
2377 
2378   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2379   if (GAFlags == SIInstrInfo::MO_NONE)
2380     MIB.addImm(0);
2381   else
2382     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2383 
2384   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2385 
2386   if (PtrTy.getSizeInBits() == 32)
2387     B.buildExtract(DstReg, PCReg, 0);
2388   return true;
2389 }
2390 
2391 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2392   MachineInstr &MI, MachineRegisterInfo &MRI,
2393   MachineIRBuilder &B) const {
2394   Register DstReg = MI.getOperand(0).getReg();
2395   LLT Ty = MRI.getType(DstReg);
2396   unsigned AS = Ty.getAddressSpace();
2397 
2398   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2399   MachineFunction &MF = B.getMF();
2400   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2401 
2402   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2403     if (!MFI->isModuleEntryFunction() &&
2404         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2405       const Function &Fn = MF.getFunction();
2406       DiagnosticInfoUnsupported BadLDSDecl(
2407         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2408         DS_Warning);
2409       Fn.getContext().diagnose(BadLDSDecl);
2410 
2411       // We currently don't have a way to correctly allocate LDS objects that
2412       // aren't directly associated with a kernel. We do force inlining of
2413       // functions that use local objects. However, if these dead functions are
2414       // not eliminated, we don't want a compile time error. Just emit a warning
2415       // and a trap, since there should be no callable path here.
2416       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2417       B.buildUndef(DstReg);
2418       MI.eraseFromParent();
2419       return true;
2420     }
2421 
2422     // TODO: We could emit code to handle the initialization somewhere.
2423     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2424       const SITargetLowering *TLI = ST.getTargetLowering();
2425       if (!TLI->shouldUseLDSConstAddress(GV)) {
2426         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2427         return true; // Leave in place;
2428       }
2429 
2430       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2431         Type *Ty = GV->getValueType();
2432         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2433         // zero-sized type in other languages to declare the dynamic shared
2434         // memory whose size is not known at compile time. They will be
2435         // allocated by the runtime and placed directly after the statically
2436         // allocated ones. They all share the same offset.
2437         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2438           // Adjust alignment for that dynamic shared memory array.
2439           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2440           LLT S32 = LLT::scalar(32);
2441           auto Sz =
2442               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2443           B.buildIntToPtr(DstReg, Sz);
2444           MI.eraseFromParent();
2445           return true;
2446         }
2447       }
2448 
2449       B.buildConstant(
2450           DstReg,
2451           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2452       MI.eraseFromParent();
2453       return true;
2454     }
2455 
2456     const Function &Fn = MF.getFunction();
2457     DiagnosticInfoUnsupported BadInit(
2458       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2459     Fn.getContext().diagnose(BadInit);
2460     return true;
2461   }
2462 
2463   const SITargetLowering *TLI = ST.getTargetLowering();
2464 
2465   if (TLI->shouldEmitFixup(GV)) {
2466     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2467     MI.eraseFromParent();
2468     return true;
2469   }
2470 
2471   if (TLI->shouldEmitPCReloc(GV)) {
2472     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2473     MI.eraseFromParent();
2474     return true;
2475   }
2476 
2477   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2478   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2479 
2480   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2481   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2482       MachinePointerInfo::getGOT(MF),
2483       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2484           MachineMemOperand::MOInvariant,
2485       LoadTy, Align(8));
2486 
2487   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2488 
2489   if (Ty.getSizeInBits() == 32) {
2490     // Truncate if this is a 32-bit constant address.
2491     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2492     B.buildExtract(DstReg, Load, 0);
2493   } else
2494     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2495 
2496   MI.eraseFromParent();
2497   return true;
2498 }
2499 
2500 static LLT widenToNextPowerOf2(LLT Ty) {
2501   if (Ty.isVector())
2502     return Ty.changeElementCount(
2503         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2504   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2505 }
2506 
2507 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2508                                        MachineInstr &MI) const {
2509   MachineIRBuilder &B = Helper.MIRBuilder;
2510   MachineRegisterInfo &MRI = *B.getMRI();
2511   GISelChangeObserver &Observer = Helper.Observer;
2512 
2513   Register PtrReg = MI.getOperand(1).getReg();
2514   LLT PtrTy = MRI.getType(PtrReg);
2515   unsigned AddrSpace = PtrTy.getAddressSpace();
2516 
2517   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2518     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2519     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2520     Observer.changingInstr(MI);
2521     MI.getOperand(1).setReg(Cast.getReg(0));
2522     Observer.changedInstr(MI);
2523     return true;
2524   }
2525 
2526   if (MI.getOpcode() != AMDGPU::G_LOAD)
2527     return false;
2528 
2529   Register ValReg = MI.getOperand(0).getReg();
2530   LLT ValTy = MRI.getType(ValReg);
2531 
2532   MachineMemOperand *MMO = *MI.memoperands_begin();
2533   const unsigned ValSize = ValTy.getSizeInBits();
2534   const LLT MemTy = MMO->getMemoryType();
2535   const Align MemAlign = MMO->getAlign();
2536   const unsigned MemSize = MemTy.getSizeInBits();
2537   const unsigned AlignInBits = 8 * MemAlign.value();
2538 
2539   // Widen non-power-of-2 loads to the alignment if needed
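  // For example, a sufficiently aligned 96-bit load may be widened to a
  // 128-bit load, with the extra bits discarded below via G_TRUNC for scalars
  // or G_EXTRACT / unmerge for vectors.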
2540   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2541     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2542 
2543     // This was already the correct extending load result type, so just adjust
2544     // the memory type.
2545     if (WideMemSize == ValSize) {
2546       MachineFunction &MF = B.getMF();
2547 
2548       MachineMemOperand *WideMMO =
2549           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2550       Observer.changingInstr(MI);
2551       MI.setMemRefs(MF, {WideMMO});
2552       Observer.changedInstr(MI);
2553       return true;
2554     }
2555 
2556     // Don't bother handling an edge case that should probably never be produced.
2557     if (ValSize > WideMemSize)
2558       return false;
2559 
2560     LLT WideTy = widenToNextPowerOf2(ValTy);
2561 
2562     Register WideLoad;
2563     if (!WideTy.isVector()) {
2564       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2565       B.buildTrunc(ValReg, WideLoad).getReg(0);
2566     } else {
2567       // Extract the subvector.
2568 
2569       if (isRegisterType(ValTy)) {
2570         // If this is a case where G_EXTRACT is legal, use it.
2571         // (e.g. <3 x s32> -> <4 x s32>)
2572         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2573         B.buildExtract(ValReg, WideLoad, 0);
2574       } else {
2575         // For cases where the widened type isn't a nice register value, unmerge
2576         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2577         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2578         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2579         B.setInsertPt(B.getMBB(), MI.getIterator());
2580         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2581       }
2582     }
2583 
2584     MI.eraseFromParent();
2585     return true;
2586   }
2587 
2588   return false;
2589 }
2590 
2591 bool AMDGPULegalizerInfo::legalizeFMad(
2592   MachineInstr &MI, MachineRegisterInfo &MRI,
2593   MachineIRBuilder &B) const {
2594   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2595   assert(Ty.isScalar());
2596 
2597   MachineFunction &MF = B.getMF();
2598   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2599 
2600   // TODO: Always legal with future ftz flag.
2601   // FIXME: Do we need just output?
2602   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2603     return true;
2604   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2605     return true;
2606 
2607   MachineIRBuilder HelperBuilder(MI);
2608   GISelObserverWrapper DummyObserver;
2609   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2610   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2611 }
2612 
2613 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2614   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2615   Register DstReg = MI.getOperand(0).getReg();
2616   Register PtrReg = MI.getOperand(1).getReg();
2617   Register CmpVal = MI.getOperand(2).getReg();
2618   Register NewVal = MI.getOperand(3).getReg();
2619 
2620   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2621          "this should not have been custom lowered");
2622 
2623   LLT ValTy = MRI.getType(CmpVal);
2624   LLT VecTy = LLT::fixed_vector(2, ValTy);
2625 
2626   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2627 
2628   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2629     .addDef(DstReg)
2630     .addUse(PtrReg)
2631     .addUse(PackedVal)
2632     .setMemRefs(MI.memoperands());
2633 
2634   MI.eraseFromParent();
2635   return true;
2636 }
2637 
2638 bool AMDGPULegalizerInfo::legalizeFlog(
2639   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2640   Register Dst = MI.getOperand(0).getReg();
2641   Register Src = MI.getOperand(1).getReg();
2642   LLT Ty = B.getMRI()->getType(Dst);
2643   unsigned Flags = MI.getFlags();
2644 
2645   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2646   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2647 
2648   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2649   MI.eraseFromParent();
2650   return true;
2651 }
2652 
2653 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2654                                        MachineIRBuilder &B) const {
2655   Register Dst = MI.getOperand(0).getReg();
2656   Register Src = MI.getOperand(1).getReg();
2657   unsigned Flags = MI.getFlags();
2658   LLT Ty = B.getMRI()->getType(Dst);
2659 
2660   auto K = B.buildFConstant(Ty, numbers::log2e);
2661   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2662   B.buildFExp2(Dst, Mul, Flags);
2663   MI.eraseFromParent();
2664   return true;
2665 }
2666 
2667 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2668                                        MachineIRBuilder &B) const {
2669   Register Dst = MI.getOperand(0).getReg();
2670   Register Src0 = MI.getOperand(1).getReg();
2671   Register Src1 = MI.getOperand(2).getReg();
2672   unsigned Flags = MI.getFlags();
2673   LLT Ty = B.getMRI()->getType(Dst);
2674   const LLT S16 = LLT::scalar(16);
2675   const LLT S32 = LLT::scalar(32);
2676 
2677   if (Ty == S32) {
2678     auto Log = B.buildFLog2(S32, Src0, Flags);
2679     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680       .addUse(Log.getReg(0))
2681       .addUse(Src1)
2682       .setMIFlags(Flags);
2683     B.buildFExp2(Dst, Mul, Flags);
2684   } else if (Ty == S16) {
2685     // There's no f16 fmul_legacy, so we need to convert for it.
2686     auto Log = B.buildFLog2(S16, Src0, Flags);
2687     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2688     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2689     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2690       .addUse(Ext0.getReg(0))
2691       .addUse(Ext1.getReg(0))
2692       .setMIFlags(Flags);
2693 
2694     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2695   } else
2696     return false;
2697 
2698   MI.eraseFromParent();
2699   return true;
2700 }
2701 
2702 // Find a source register, ignoring any possible source modifiers.
2703 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2704   Register ModSrc = OrigSrc;
2705   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2706     ModSrc = SrcFNeg->getOperand(1).getReg();
2707     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2708       ModSrc = SrcFAbs->getOperand(1).getReg();
2709   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2710     ModSrc = SrcFAbs->getOperand(1).getReg();
2711   return ModSrc;
2712 }
2713 
2714 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2715                                          MachineRegisterInfo &MRI,
2716                                          MachineIRBuilder &B) const {
2717 
2718   const LLT S1 = LLT::scalar(1);
2719   const LLT S64 = LLT::scalar(64);
2720   Register Dst = MI.getOperand(0).getReg();
2721   Register OrigSrc = MI.getOperand(1).getReg();
2722   unsigned Flags = MI.getFlags();
2723   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2724          "this should not have been custom lowered");
2725 
2726   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2727   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2728   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2729   // V_FRACT bug is:
2730   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2731   //
2732   // Convert floor(x) to (x - fract(x))
2733 
2734   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2735     .addUse(OrigSrc)
2736     .setMIFlags(Flags);
2737 
2738   // Give source modifier matching some assistance before obscuring a foldable
2739   // pattern.
2740 
2741   // TODO: We can avoid the neg on the fract? The input sign to fract
2742   // shouldn't matter?
2743   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2744 
2745   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2746 
2747   Register Min = MRI.createGenericVirtualRegister(S64);
2748 
2749   // We don't need to concern ourselves with the snan handling difference, so
2750   // use the one which will directly select.
2751   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2752   if (MFI->getMode().IEEE)
2753     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2754   else
2755     B.buildFMinNum(Min, Fract, Const, Flags);
2756 
2757   Register CorrectedFract = Min;
2758   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2759     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2760     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2761   }
2762 
2763   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2764   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2765 
2766   MI.eraseFromParent();
2767   return true;
2768 }
2769 
2770 // Turn an illegal packed v2s16 build vector into bit operations.
2771 // TODO: This should probably be a bitcast action in LegalizerHelper.
2772 bool AMDGPULegalizerInfo::legalizeBuildVector(
2773   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2774   Register Dst = MI.getOperand(0).getReg();
2775   const LLT S32 = LLT::scalar(32);
2776   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2777 
2778   Register Src0 = MI.getOperand(1).getReg();
2779   Register Src1 = MI.getOperand(2).getReg();
2780   assert(MRI.getType(Src0) == LLT::scalar(16));
2781 
2782   auto Merge = B.buildMerge(S32, {Src0, Src1});
2783   B.buildBitcast(Dst, Merge);
2784 
2785   MI.eraseFromParent();
2786   return true;
2787 }
2788 
2789 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2790 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2791 // case with a single min instruction instead of a compare+select.
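// For example, ffbh/ffbl return -1 for a zero input, and the unsigned min
// against the source bit width folds that back to the expected result (32 for
// an s32 source).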
2792 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2793                                             MachineRegisterInfo &MRI,
2794                                             MachineIRBuilder &B) const {
2795   Register Dst = MI.getOperand(0).getReg();
2796   Register Src = MI.getOperand(1).getReg();
2797   LLT DstTy = MRI.getType(Dst);
2798   LLT SrcTy = MRI.getType(Src);
2799 
2800   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2801                         ? AMDGPU::G_AMDGPU_FFBH_U32
2802                         : AMDGPU::G_AMDGPU_FFBL_B32;
2803   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2804   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2805 
2806   MI.eraseFromParent();
2807   return true;
2808 }
2809 
2810 // Check that this is a G_XOR x, -1
2811 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2812   if (MI.getOpcode() != TargetOpcode::G_XOR)
2813     return false;
2814   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2815   return ConstVal && *ConstVal == -1;
2816 }
2817 
2818 // Return the use branch instruction, otherwise null if the usage is invalid.
2819 static MachineInstr *
2820 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2821                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2822   Register CondDef = MI.getOperand(0).getReg();
2823   if (!MRI.hasOneNonDBGUse(CondDef))
2824     return nullptr;
2825 
2826   MachineBasicBlock *Parent = MI.getParent();
2827   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2828 
2829   if (isNot(MRI, *UseMI)) {
2830     Register NegatedCond = UseMI->getOperand(0).getReg();
2831     if (!MRI.hasOneNonDBGUse(NegatedCond))
2832       return nullptr;
2833 
    // We're folding out the negation, so delete the G_XOR that defines it.
2835     eraseInstr(*UseMI, MRI);
2836 
2837     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2838     Negated = true;
2839   }
2840 
2841   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2842     return nullptr;
2843 
2844   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2845   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2846   if (Next == Parent->end()) {
2847     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2848     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2849       return nullptr;
2850     UncondBrTarget = &*NextMBB;
2851   } else {
2852     if (Next->getOpcode() != AMDGPU::G_BR)
2853       return nullptr;
2854     Br = &*Next;
2855     UncondBrTarget = Br->getOperand(0).getMBB();
2856   }
2857 
2858   return UseMI;
2859 }
2860 
2861 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2862                                          const ArgDescriptor *Arg,
2863                                          const TargetRegisterClass *ArgRC,
2864                                          LLT ArgTy) const {
2865   MCRegister SrcReg = Arg->getRegister();
2866   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2867   assert(DstReg.isVirtual() && "Virtual register expected");
2868 
2869   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2870                                              ArgTy);
2871   if (Arg->isMasked()) {
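    // A masked argument shares its register with other packed values (e.g. the
    // packed workitem IDs), so extract the requested field with a shift and
    // mask.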
2872     // TODO: Should we try to emit this once in the entry block?
2873     const LLT S32 = LLT::scalar(32);
2874     const unsigned Mask = Arg->getMask();
2875     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2876 
2877     Register AndMaskSrc = LiveIn;
2878 
2879     if (Shift != 0) {
2880       auto ShiftAmt = B.buildConstant(S32, Shift);
2881       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2882     }
2883 
2884     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2885   } else {
2886     B.buildCopy(DstReg, LiveIn);
2887   }
2888 
2889   return true;
2890 }
2891 
2892 bool AMDGPULegalizerInfo::loadInputValue(
2893     Register DstReg, MachineIRBuilder &B,
2894     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2895   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2896   const ArgDescriptor *Arg;
2897   const TargetRegisterClass *ArgRC;
2898   LLT ArgTy;
2899   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2900 
2901   if (!Arg) {
2902     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2903       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2904       // case the pointer argument may be missing and we use null.
2905       B.buildConstant(DstReg, 0);
2906       return true;
2907     }
2908 
2909     // It's undefined behavior if a function marked with the amdgpu-no-*
2910     // attributes uses the corresponding intrinsic.
2911     B.buildUndef(DstReg);
2912     return true;
2913   }
2914 
2915   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2916     return false; // TODO: Handle these
2917   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2918 }
2919 
2920 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2921     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2922     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2923   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2924     return false;
2925 
2926   MI.eraseFromParent();
2927   return true;
2928 }
2929 
2930 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2931                                        MachineRegisterInfo &MRI,
2932                                        MachineIRBuilder &B) const {
2933   Register Dst = MI.getOperand(0).getReg();
2934   LLT DstTy = MRI.getType(Dst);
2935   LLT S16 = LLT::scalar(16);
2936   LLT S32 = LLT::scalar(32);
2937   LLT S64 = LLT::scalar(64);
2938 
2939   if (DstTy == S16)
2940     return legalizeFDIV16(MI, MRI, B);
2941   if (DstTy == S32)
2942     return legalizeFDIV32(MI, MRI, B);
2943   if (DstTy == S64)
2944     return legalizeFDIV64(MI, MRI, B);
2945 
2946   return false;
2947 }
2948 
2949 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2950                                                         Register DstDivReg,
2951                                                         Register DstRemReg,
2952                                                         Register X,
2953                                                         Register Y) const {
2954   const LLT S1 = LLT::scalar(1);
2955   const LLT S32 = LLT::scalar(32);
2956 
2957   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2958   // algorithm used here.
2959 
2960   // Initial estimate of inv(y).
2961   auto FloatY = B.buildUITOFP(S32, Y);
2962   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2963   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2964   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2965   auto Z = B.buildFPTOUI(S32, ScaledY);
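  // 0x4f7ffffe is 2^32 - 512 as an f32 (just under 2^32), so Z is roughly
  // floor(2^32 / Y), i.e. a 0.32 fixed-point estimate of the reciprocal.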
2966 
  // One round of UNR (unsigned Newton-Raphson) to refine the estimate.
2968   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2969   auto NegYZ = B.buildMul(S32, NegY, Z);
2970   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2971 
2972   // Quotient/remainder estimate.
2973   auto Q = B.buildUMulH(S32, X, Z);
2974   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2975 
2976   // First quotient/remainder refinement.
2977   auto One = B.buildConstant(S32, 1);
2978   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2979   if (DstDivReg)
2980     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2981   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2982 
2983   // Second quotient/remainder refinement.
2984   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2985   if (DstDivReg)
2986     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2987 
2988   if (DstRemReg)
2989     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2990 }
2991 
2992 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2993 //
2994 // Return lo, hi of result
2995 //
2996 // %cvt.lo = G_UITOFP Val.lo
2997 // %cvt.hi = G_UITOFP Val.hi
2998 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2999 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3000 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3001 // %mul2 = G_FMUL %mul1, 2**(-32)
3002 // %trunc = G_INTRINSIC_TRUNC %mul2
3003 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3004 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3005 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3006                                                        Register Val) {
3007   const LLT S32 = LLT::scalar(32);
3008   auto Unmerge = B.buildUnmerge(S32, Val);
3009 
3010   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3011   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3012 
3013   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3014                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3015 
3016   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3017   auto Mul1 =
3018       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3019 
3020   // 2**(-32)
3021   auto Mul2 =
3022       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3023   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3024 
3025   // -(2**32)
3026   auto Mad2 = B.buildFMAD(S32, Trunc,
3027                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3028 
3029   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3030   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3031 
3032   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3033 }
3034 
3035 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3036                                                         Register DstDivReg,
3037                                                         Register DstRemReg,
3038                                                         Register Numer,
3039                                                         Register Denom) const {
3040   const LLT S32 = LLT::scalar(32);
3041   const LLT S64 = LLT::scalar(64);
3042   const LLT S1 = LLT::scalar(1);
3043   Register RcpLo, RcpHi;
3044 
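  // Start from an f32-based estimate of 2^64 / Denom, refine it with two
  // Newton-Raphson style correction steps done in 32-bit pieces, form a
  // quotient estimate with a high 64-bit multiply, and then apply up to two
  // conditional corrections to the quotient/remainder, emitted as selects.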
3045   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3046 
3047   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3048 
3049   auto Zero64 = B.buildConstant(S64, 0);
3050   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3051 
3052   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3053   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3054 
3055   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3056   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3057   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3058 
3059   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3060   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3061   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3062   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3063 
3064   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3065   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3066   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3067   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3068   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3069 
3070   auto Zero32 = B.buildConstant(S32, 0);
3071   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3072   auto Add2_HiC =
3073       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3074   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3075   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3076 
3077   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3078   Register NumerLo = UnmergeNumer.getReg(0);
3079   Register NumerHi = UnmergeNumer.getReg(1);
3080 
3081   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3082   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3083   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3084   Register Mul3_Lo = UnmergeMul3.getReg(0);
3085   Register Mul3_Hi = UnmergeMul3.getReg(1);
3086   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3087   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3088   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3089   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3090 
3091   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3092   Register DenomLo = UnmergeDenom.getReg(0);
3093   Register DenomHi = UnmergeDenom.getReg(1);
3094 
3095   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3096   auto C1 = B.buildSExt(S32, CmpHi);
3097 
3098   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3099   auto C2 = B.buildSExt(S32, CmpLo);
3100 
3101   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3102   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3103 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we emit 4 selects
  // after the potential endif points in place of PHIs.
3107 
3108   // if C3 != 0 ...
3109   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3110   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3111   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3112   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3113 
3114   auto One64 = B.buildConstant(S64, 1);
3115   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3116 
3117   auto C4 =
3118       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3119   auto C5 =
3120       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3121   auto C6 = B.buildSelect(
3122       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3123 
3124   // if (C6 != 0)
3125   auto Add4 = B.buildAdd(S64, Add3, One64);
3126   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3127 
3128   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3129   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3130   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3131 
3132   // endif C6
3133   // endif C3
3134 
3135   if (DstDivReg) {
3136     auto Sel1 = B.buildSelect(
3137         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3138     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3139                   Sel1, MulHi3);
3140   }
3141 
3142   if (DstRemReg) {
3143     auto Sel2 = B.buildSelect(
3144         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3145     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3146                   Sel2, Sub1);
3147   }
3148 }
3149 
3150 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3151                                                   MachineRegisterInfo &MRI,
3152                                                   MachineIRBuilder &B) const {
3153   Register DstDivReg, DstRemReg;
3154   switch (MI.getOpcode()) {
3155   default:
3156     llvm_unreachable("Unexpected opcode!");
3157   case AMDGPU::G_UDIV: {
3158     DstDivReg = MI.getOperand(0).getReg();
3159     break;
3160   }
3161   case AMDGPU::G_UREM: {
3162     DstRemReg = MI.getOperand(0).getReg();
3163     break;
3164   }
3165   case AMDGPU::G_UDIVREM: {
3166     DstDivReg = MI.getOperand(0).getReg();
3167     DstRemReg = MI.getOperand(1).getReg();
3168     break;
3169   }
3170   }
3171 
3172   const LLT S64 = LLT::scalar(64);
3173   const LLT S32 = LLT::scalar(32);
3174   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3175   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3176   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3177   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3178 
3179   if (Ty == S32)
3180     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3181   else if (Ty == S64)
3182     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3183   else
3184     return false;
3185 
3186   MI.eraseFromParent();
3187   return true;
3188 }
3189 
3190 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3191                                                 MachineRegisterInfo &MRI,
3192                                                 MachineIRBuilder &B) const {
3193   const LLT S64 = LLT::scalar(64);
3194   const LLT S32 = LLT::scalar(32);
3195 
3196   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3197   if (Ty != S32 && Ty != S64)
3198     return false;
3199 
3200   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3201   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3202   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3203 
3204   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3205   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3206   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3207 
3208   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3209   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3210 
3211   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3212   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
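  // With Sign = x >> (bits - 1) being 0 or -1, (x + Sign) ^ Sign is |x|, so
  // LHS and RHS now hold the operand magnitudes for the unsigned expansion.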
3213 
3214   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3215   switch (MI.getOpcode()) {
3216   default:
3217     llvm_unreachable("Unexpected opcode!");
3218   case AMDGPU::G_SDIV: {
3219     DstDivReg = MI.getOperand(0).getReg();
3220     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3221     break;
3222   }
3223   case AMDGPU::G_SREM: {
3224     DstRemReg = MI.getOperand(0).getReg();
3225     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3226     break;
3227   }
3228   case AMDGPU::G_SDIVREM: {
3229     DstDivReg = MI.getOperand(0).getReg();
3230     DstRemReg = MI.getOperand(1).getReg();
3231     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3232     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3233     break;
3234   }
3235   }
3236 
3237   if (Ty == S32)
3238     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3239   else
3240     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3241 
3242   if (DstDivReg) {
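    // The quotient is negative iff the operand signs differ; (q ^ s) - s
    // negates q when s is -1 and is a no-op when s is 0.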
3243     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3244     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3245     B.buildSub(DstDivReg, SignXor, Sign);
3246   }
3247 
3248   if (DstRemReg) {
3249     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3250     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3251     B.buildSub(DstRemReg, SignXor, Sign);
3252   }
3253 
3254   MI.eraseFromParent();
3255   return true;
3256 }
3257 
3258 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3259                                                  MachineRegisterInfo &MRI,
3260                                                  MachineIRBuilder &B) const {
3261   Register Res = MI.getOperand(0).getReg();
3262   Register LHS = MI.getOperand(1).getReg();
3263   Register RHS = MI.getOperand(2).getReg();
3264   uint16_t Flags = MI.getFlags();
3265   LLT ResTy = MRI.getType(Res);
3266 
3267   const MachineFunction &MF = B.getMF();
3268   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3269                             MI.getFlag(MachineInstr::FmAfn);
3270 
3271   if (!AllowInaccurateRcp)
3272     return false;
3273 
3274   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3275     // 1 / x -> RCP(x)
3276     if (CLHS->isExactlyValue(1.0)) {
3277       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3278         .addUse(RHS)
3279         .setMIFlags(Flags);
3280 
3281       MI.eraseFromParent();
3282       return true;
3283     }
3284 
3285     // -1 / x -> RCP( FNEG(x) )
3286     if (CLHS->isExactlyValue(-1.0)) {
3287       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3288       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3289         .addUse(FNeg.getReg(0))
3290         .setMIFlags(Flags);
3291 
3292       MI.eraseFromParent();
3293       return true;
3294     }
3295   }
3296 
3297   // x / y -> x * (1.0 / y)
3298   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3299     .addUse(RHS)
3300     .setMIFlags(Flags);
3301   B.buildFMul(Res, LHS, RCP, Flags);
3302 
3303   MI.eraseFromParent();
3304   return true;
3305 }
3306 
3307 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3308                                                    MachineRegisterInfo &MRI,
3309                                                    MachineIRBuilder &B) const {
3310   Register Res = MI.getOperand(0).getReg();
3311   Register X = MI.getOperand(1).getReg();
3312   Register Y = MI.getOperand(2).getReg();
3313   uint16_t Flags = MI.getFlags();
3314   LLT ResTy = MRI.getType(Res);
3315 
3316   const MachineFunction &MF = B.getMF();
3317   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3318                             MI.getFlag(MachineInstr::FmAfn);
3319 
3320   if (!AllowInaccurateRcp)
3321     return false;
3322 
3323   auto NegY = B.buildFNeg(ResTy, Y);
3324   auto One = B.buildFConstant(ResTy, 1.0);
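  // Refine the hardware reciprocal with two Newton-Raphson steps,
  // r = r + r * (1 - y * r), then form x * r and apply one final fma on the
  // residual x - y * (x * r) to tighten the result.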
3325 
3326   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3327     .addUse(Y)
3328     .setMIFlags(Flags);
3329 
3330   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3331   R = B.buildFMA(ResTy, Tmp0, R, R);
3332 
3333   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3334   R = B.buildFMA(ResTy, Tmp1, R, R);
3335 
3336   auto Ret = B.buildFMul(ResTy, X, R);
3337   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3338 
3339   B.buildFMA(Res, Tmp2, R, Ret);
3340   MI.eraseFromParent();
3341   return true;
3342 }
3343 
3344 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3345                                          MachineRegisterInfo &MRI,
3346                                          MachineIRBuilder &B) const {
3347   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3348     return true;
3349 
3350   Register Res = MI.getOperand(0).getReg();
3351   Register LHS = MI.getOperand(1).getReg();
3352   Register RHS = MI.getOperand(2).getReg();
3353 
3354   uint16_t Flags = MI.getFlags();
3355 
3356   LLT S16 = LLT::scalar(16);
3357   LLT S32 = LLT::scalar(32);
3358 
3359   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3360   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3361 
3362   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3363     .addUse(RHSExt.getReg(0))
3364     .setMIFlags(Flags);
3365 
3366   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3367   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3368 
3369   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3370     .addUse(RDst.getReg(0))
3371     .addUse(RHS)
3372     .addUse(LHS)
3373     .setMIFlags(Flags);
3374 
3375   MI.eraseFromParent();
3376   return true;
3377 }
3378 
3379 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3380 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3381 static void toggleSPDenormMode(bool Enable,
3382                                MachineIRBuilder &B,
3383                                const GCNSubtarget &ST,
3384                                AMDGPU::SIModeRegisterDefaults Mode) {
3385   // Set SP denorm mode to this value.
3386   unsigned SPDenormMode =
3387     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3388 
3389   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3391     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3392 
3393     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3394     B.buildInstr(AMDGPU::S_DENORM_MODE)
3395       .addImm(NewDenormModeValue);
3396 
3397   } else {
3398     // Select FP32 bit field in mode register.
3399     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3400                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3401                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
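    // This encodes hwreg(HW_REG_MODE, 4, 2): offset 4, width 2, i.e. the FP32
    // denorm bits of the MODE register.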
3402 
3403     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3404       .addImm(SPDenormMode)
3405       .addImm(SPDenormModeBitField);
3406   }
3407 }
3408 
3409 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3410                                          MachineRegisterInfo &MRI,
3411                                          MachineIRBuilder &B) const {
3412   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3413     return true;
3414 
3415   Register Res = MI.getOperand(0).getReg();
3416   Register LHS = MI.getOperand(1).getReg();
3417   Register RHS = MI.getOperand(2).getReg();
3418   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3419   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3420 
3421   uint16_t Flags = MI.getFlags();
3422 
3423   LLT S32 = LLT::scalar(32);
3424   LLT S1 = LLT::scalar(1);
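  // Full precision expansion: scale both operands with amdgcn.div.scale,
  // refine the reciprocal of the scaled denominator with a chain of fmas,
  // combine with amdgcn.div.fmas, and let amdgcn.div.fixup undo the scaling
  // and handle the special cases.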
3425 
3426   auto One = B.buildFConstant(S32, 1.0f);
3427 
3428   auto DenominatorScaled =
3429     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3430       .addUse(LHS)
3431       .addUse(RHS)
3432       .addImm(0)
3433       .setMIFlags(Flags);
3434   auto NumeratorScaled =
3435     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3436       .addUse(LHS)
3437       .addUse(RHS)
3438       .addImm(1)
3439       .setMIFlags(Flags);
3440 
3441   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3442     .addUse(DenominatorScaled.getReg(0))
3443     .setMIFlags(Flags);
3444   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3445 
3446   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3447   // aren't modeled as reading it.
3448   if (!Mode.allFP32Denormals())
3449     toggleSPDenormMode(true, B, ST, Mode);
3450 
3451   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3452   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3453   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3454   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3455   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3456   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3457 
3458   if (!Mode.allFP32Denormals())
3459     toggleSPDenormMode(false, B, ST, Mode);
3460 
3461   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3462     .addUse(Fma4.getReg(0))
3463     .addUse(Fma1.getReg(0))
3464     .addUse(Fma3.getReg(0))
3465     .addUse(NumeratorScaled.getReg(1))
3466     .setMIFlags(Flags);
3467 
3468   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3469     .addUse(Fmas.getReg(0))
3470     .addUse(RHS)
3471     .addUse(LHS)
3472     .setMIFlags(Flags);
3473 
3474   MI.eraseFromParent();
3475   return true;
3476 }
3477 
3478 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3479                                          MachineRegisterInfo &MRI,
3480                                          MachineIRBuilder &B) const {
3481   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3482     return true;
3483 
3484   Register Res = MI.getOperand(0).getReg();
3485   Register LHS = MI.getOperand(1).getReg();
3486   Register RHS = MI.getOperand(2).getReg();
3487 
3488   uint16_t Flags = MI.getFlags();
3489 
3490   LLT S64 = LLT::scalar(64);
3491   LLT S1 = LLT::scalar(1);
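  // Same div_scale / rcp / div_fmas / div_fixup structure as the f32 path,
  // widened to f64 and without the denorm mode toggling.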
3492 
3493   auto One = B.buildFConstant(S64, 1.0);
3494 
3495   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3496     .addUse(LHS)
3497     .addUse(RHS)
3498     .addImm(0)
3499     .setMIFlags(Flags);
3500 
3501   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3502 
3503   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3504     .addUse(DivScale0.getReg(0))
3505     .setMIFlags(Flags);
3506 
3507   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3508   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3509   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3510 
3511   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3512     .addUse(LHS)
3513     .addUse(RHS)
3514     .addImm(1)
3515     .setMIFlags(Flags);
3516 
3517   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3518   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3519   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3520 
3521   Register Scale;
3522   if (!ST.hasUsableDivScaleConditionOutput()) {
3523     // Workaround a hardware bug on SI where the condition output from div_scale
3524     // is not usable.
3525 
3526     LLT S32 = LLT::scalar(32);
3527 
3528     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3529     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3530     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3531     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3532 
3533     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3534                               Scale1Unmerge.getReg(1));
3535     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3536                               Scale0Unmerge.getReg(1));
3537     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3538   } else {
3539     Scale = DivScale1.getReg(1);
3540   }
3541 
3542   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3543     .addUse(Fma4.getReg(0))
3544     .addUse(Fma3.getReg(0))
3545     .addUse(Mul.getReg(0))
3546     .addUse(Scale)
3547     .setMIFlags(Flags);
3548 
3549   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3550     .addUse(Fmas.getReg(0))
3551     .addUse(RHS)
3552     .addUse(LHS)
3553     .setMIFlags(Flags);
3554 
3555   MI.eraseFromParent();
3556   return true;
3557 }
3558 
3559 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3560                                                  MachineRegisterInfo &MRI,
3561                                                  MachineIRBuilder &B) const {
3562   Register Res = MI.getOperand(0).getReg();
3563   Register LHS = MI.getOperand(2).getReg();
3564   Register RHS = MI.getOperand(3).getReg();
3565   uint16_t Flags = MI.getFlags();
3566 
3567   LLT S32 = LLT::scalar(32);
3568   LLT S1 = LLT::scalar(1);
3569 
3570   auto Abs = B.buildFAbs(S32, RHS, Flags);
3571   const APFloat C0Val(1.0f);
3572 
3573   auto C0 = B.buildConstant(S32, 0x6f800000);
3574   auto C1 = B.buildConstant(S32, 0x2f800000);
3575   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
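  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: if |RHS| exceeds 2^96,
  // pre-scale it by 2^-32 so the rcp input stays in range, then multiply the
  // quotient by the same factor to compensate; otherwise the scale is 1.0.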
3576 
3577   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3578   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3579 
3580   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3581 
3582   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3583     .addUse(Mul0.getReg(0))
3584     .setMIFlags(Flags);
3585 
3586   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3587 
3588   B.buildFMul(Res, Sel, Mul1, Flags);
3589 
3590   MI.eraseFromParent();
3591   return true;
3592 }
3593 
3594 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3595 // FIXME: Why do we handle this one but not other removed instructions?
3596 //
3597 // Reciprocal square root.  The clamp prevents infinite results, clamping
3598 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3599 // +-max_float.
3600 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3601                                                     MachineRegisterInfo &MRI,
3602                                                     MachineIRBuilder &B) const {
3603   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3604     return true;
3605 
3606   Register Dst = MI.getOperand(0).getReg();
3607   Register Src = MI.getOperand(2).getReg();
3608   auto Flags = MI.getFlags();
3609 
3610   LLT Ty = MRI.getType(Dst);
3611 
3612   const fltSemantics *FltSemantics;
3613   if (Ty == LLT::scalar(32))
3614     FltSemantics = &APFloat::IEEEsingle();
3615   else if (Ty == LLT::scalar(64))
3616     FltSemantics = &APFloat::IEEEdouble();
3617   else
3618     return false;
3619 
3620   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3621     .addUse(Src)
3622     .setMIFlags(Flags);
3623 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted (or not) any snan; use the variant which will
  // directly select.
3626   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3627   const bool UseIEEE = MFI->getMode().IEEE;
3628 
3629   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3630   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3631                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3632 
3633   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3634 
3635   if (UseIEEE)
3636     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3637   else
3638     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3639   MI.eraseFromParent();
3640   return true;
3641 }
3642 
3643 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3644   switch (IID) {
3645   case Intrinsic::amdgcn_ds_fadd:
3646     return AMDGPU::G_ATOMICRMW_FADD;
3647   case Intrinsic::amdgcn_ds_fmin:
3648     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3649   case Intrinsic::amdgcn_ds_fmax:
3650     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3651   default:
3652     llvm_unreachable("not a DS FP intrinsic");
3653   }
3654 }
3655 
3656 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3657                                                       MachineInstr &MI,
3658                                                       Intrinsic::ID IID) const {
3659   GISelChangeObserver &Observer = Helper.Observer;
3660   Observer.changingInstr(MI);
3661 
3662   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3663 
3664   // The remaining operands were used to set fields in the MemOperand on
3665   // construction.
3666   for (int I = 6; I > 3; --I)
3667     MI.RemoveOperand(I);
3668 
3669   MI.RemoveOperand(1); // Remove the intrinsic ID.
3670   Observer.changedInstr(MI);
3671   return true;
3672 }
3673 
3674 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3675                                             MachineRegisterInfo &MRI,
3676                                             MachineIRBuilder &B) const {
3677   uint64_t Offset =
3678     ST.getTargetLowering()->getImplicitParameterOffset(
3679       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3680   LLT DstTy = MRI.getType(DstReg);
3681   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3682 
3683   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3684   if (!loadInputValue(KernargPtrReg, B,
3685                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3686     return false;
3687 
3688   // FIXME: This should be nuw
3689   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3690   return true;
3691 }
3692 
3693 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3694                                                  MachineRegisterInfo &MRI,
3695                                                  MachineIRBuilder &B) const {
3696   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3697   if (!MFI->isEntryFunction()) {
3698     return legalizePreloadedArgIntrin(MI, MRI, B,
3699                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3700   }
3701 
3702   Register DstReg = MI.getOperand(0).getReg();
3703   if (!getImplicitArgPtr(DstReg, MRI, B))
3704     return false;
3705 
3706   MI.eraseFromParent();
3707   return true;
3708 }
3709 
3710 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3711                                               MachineRegisterInfo &MRI,
3712                                               MachineIRBuilder &B,
3713                                               unsigned AddrSpace) const {
3714   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3715   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3716   Register Hi32 = Unmerge.getReg(1);
3717 
3718   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3719   MI.eraseFromParent();
3720   return true;
3721 }
3722 
3723 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3724 // offset (the offset that is included in bounds checking and swizzling, to be
3725 // split between the instruction's voffset and immoffset fields) and soffset
3726 // (the offset that is excluded from bounds checking and swizzling, to go in
3727 // the instruction's soffset field).  This function takes the first kind of
3728 // offset and figures out how to split it between voffset and immoffset.
3729 std::pair<Register, unsigned>
3730 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3731                                         Register OrigOffset) const {
3732   const unsigned MaxImm = 4095;
3733   Register BaseReg;
3734   unsigned ImmOffset;
3735   const LLT S32 = LLT::scalar(32);
3736   MachineRegisterInfo &MRI = *B.getMRI();
3737 
3738   std::tie(BaseReg, ImmOffset) =
3739       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3740 
3741   // If BaseReg is a pointer, convert it to int.
3742   if (MRI.getType(BaseReg).isPointer())
3743     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3744 
3745   // If the immediate value is too big for the immoffset field, put the value
3746   // and -4096 into the immoffset field so that the value that is copied/added
3747   // for the voffset field is a multiple of 4096, and it stands more chance
3748   // of being CSEd with the copy/add for another similar load/store.
3749   // However, do not do that rounding down to a multiple of 4096 if that is a
3750   // negative number, as it appears to be illegal to have a negative offset
3751   // in the vgpr, even if adding the immediate offset makes it positive.
3752   unsigned Overflow = ImmOffset & ~MaxImm;
3753   ImmOffset -= Overflow;
3754   if ((int32_t)Overflow < 0) {
3755     Overflow += ImmOffset;
3756     ImmOffset = 0;
3757   }
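  // For example, an original ImmOffset of 4100 splits into Overflow = 4096
  // (folded into the voffset add below) and ImmOffset = 4.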
3758 
3759   if (Overflow != 0) {
3760     if (!BaseReg) {
3761       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3762     } else {
3763       auto OverflowVal = B.buildConstant(S32, Overflow);
3764       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3765     }
3766   }
3767 
3768   if (!BaseReg)
3769     BaseReg = B.buildConstant(S32, 0).getReg(0);
3770 
3771   return std::make_pair(BaseReg, ImmOffset);
3772 }
3773 
3774 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3775 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3776                                           Register VOffset, Register SOffset,
3777                                           unsigned ImmOffset, Register VIndex,
3778                                           MachineRegisterInfo &MRI) const {
3779   Optional<ValueAndVReg> MaybeVOffsetVal =
3780       getIConstantVRegValWithLookThrough(VOffset, MRI);
3781   Optional<ValueAndVReg> MaybeSOffsetVal =
3782       getIConstantVRegValWithLookThrough(SOffset, MRI);
3783   Optional<ValueAndVReg> MaybeVIndexVal =
3784       getIConstantVRegValWithLookThrough(VIndex, MRI);
3785   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3786   // update the MMO with that offset. The stride is unknown so we can only do
3787   // this if VIndex is constant 0.
3788   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3789       MaybeVIndexVal->Value == 0) {
3790     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3791                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3792     MMO->setOffset(TotalOffset);
3793   } else {
3794     // We don't have a constant combined offset to use in the MMO. Give up.
3795     MMO->setValue((Value *)nullptr);
3796   }
3797 }
3798 
3799 /// Handle register layout difference for f16 images for some subtargets.
3800 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3801                                              MachineRegisterInfo &MRI,
3802                                              Register Reg,
3803                                              bool ImageStore) const {
3804   const LLT S16 = LLT::scalar(16);
3805   const LLT S32 = LLT::scalar(32);
3806   LLT StoreVT = MRI.getType(Reg);
3807   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3808 
3809   if (ST.hasUnpackedD16VMem()) {
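    // In the unpacked layout each 16-bit element occupies the low half of its
    // own 32-bit register, so any-extend every element to s32 and rebuild the
    // vector with 32-bit elements.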
3810     auto Unmerge = B.buildUnmerge(S16, Reg);
3811 
3812     SmallVector<Register, 4> WideRegs;
3813     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3814       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3815 
3816     int NumElts = StoreVT.getNumElements();
3817 
3818     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3819         .getReg(0);
3820   }
3821 
3822   if (ImageStore && ST.hasImageStoreD16Bug()) {
3823     if (StoreVT.getNumElements() == 2) {
3824       SmallVector<Register, 4> PackedRegs;
3825       Reg = B.buildBitcast(S32, Reg).getReg(0);
3826       PackedRegs.push_back(Reg);
3827       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3828       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3829           .getReg(0);
3830     }
3831 
3832     if (StoreVT.getNumElements() == 3) {
3833       SmallVector<Register, 4> PackedRegs;
3834       auto Unmerge = B.buildUnmerge(S16, Reg);
3835       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3836         PackedRegs.push_back(Unmerge.getReg(I));
3837       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3838       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3839       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3840     }
3841 
3842     if (StoreVT.getNumElements() == 4) {
3843       SmallVector<Register, 4> PackedRegs;
3844       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3845       auto Unmerge = B.buildUnmerge(S32, Reg);
3846       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3847         PackedRegs.push_back(Unmerge.getReg(I));
3848       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3849       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3850           .getReg(0);
3851     }
3852 
3853     llvm_unreachable("invalid data type");
3854   }
3855 
3856   return Reg;
3857 }
3858 
3859 Register AMDGPULegalizerInfo::fixStoreSourceType(
3860   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3861   MachineRegisterInfo *MRI = B.getMRI();
3862   LLT Ty = MRI->getType(VData);
3863 
3864   const LLT S16 = LLT::scalar(16);
3865 
  // Fix up illegal register types for 8-bit and 16-bit scalar stores.
3867   if (Ty == LLT::scalar(8) || Ty == S16) {
3868     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3869     return AnyExt;
3870   }
3871 
3872   if (Ty.isVector()) {
3873     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3874       if (IsFormat)
3875         return handleD16VData(B, *MRI, VData);
3876     }
3877   }
3878 
3879   return VData;
3880 }
3881 
3882 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3883                                               MachineRegisterInfo &MRI,
3884                                               MachineIRBuilder &B,
3885                                               bool IsTyped,
3886                                               bool IsFormat) const {
3887   Register VData = MI.getOperand(1).getReg();
3888   LLT Ty = MRI.getType(VData);
3889   LLT EltTy = Ty.getScalarType();
3890   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3891   const LLT S32 = LLT::scalar(32);
3892 
3893   VData = fixStoreSourceType(B, VData, IsFormat);
3894   Register RSrc = MI.getOperand(2).getReg();
3895 
3896   MachineMemOperand *MMO = *MI.memoperands_begin();
3897   const int MemSize = MMO->getSize();
3898 
3899   unsigned ImmOffset;
3900 
3901   // The typed intrinsics add an immediate after the registers.
3902   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3903 
3904   // The struct intrinsic variants add one additional operand over raw.
3905   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3906   Register VIndex;
3907   int OpOffset = 0;
3908   if (HasVIndex) {
3909     VIndex = MI.getOperand(3).getReg();
3910     OpOffset = 1;
3911   } else {
3912     VIndex = B.buildConstant(S32, 0).getReg(0);
3913   }
3914 
3915   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3916   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3917 
3918   unsigned Format = 0;
3919   if (IsTyped) {
3920     Format = MI.getOperand(5 + OpOffset).getImm();
3921     ++OpOffset;
3922   }
3923 
3924   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3925 
3926   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3927   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3928 
3929   unsigned Opc;
3930   if (IsTyped) {
3931     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3932                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3933   } else if (IsFormat) {
3934     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3935                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3936   } else {
3937     switch (MemSize) {
3938     case 1:
3939       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3940       break;
3941     case 2:
3942       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3943       break;
3944     default:
3945       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3946       break;
3947     }
3948   }
3949 
3950   auto MIB = B.buildInstr(Opc)
3951     .addUse(VData)              // vdata
3952     .addUse(RSrc)               // rsrc
3953     .addUse(VIndex)             // vindex
3954     .addUse(VOffset)            // voffset
3955     .addUse(SOffset)            // soffset
3956     .addImm(ImmOffset);         // offset(imm)
3957 
3958   if (IsTyped)
3959     MIB.addImm(Format);
3960 
3961   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3962      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3963      .addMemOperand(MMO);
3964 
3965   MI.eraseFromParent();
3966   return true;
3967 }
3968 
3969 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3970                                              MachineRegisterInfo &MRI,
3971                                              MachineIRBuilder &B,
3972                                              bool IsFormat,
3973                                              bool IsTyped) const {
3974   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3975   MachineMemOperand *MMO = *MI.memoperands_begin();
3976   const LLT MemTy = MMO->getMemoryType();
3977   const LLT S32 = LLT::scalar(32);
3978 
3979   Register Dst = MI.getOperand(0).getReg();
3980   Register RSrc = MI.getOperand(2).getReg();
3981 
3982   // The typed intrinsics add an immediate after the registers.
3983   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3984 
3985   // The struct intrinsic variants add one additional operand over raw.
3986   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3987   Register VIndex;
3988   int OpOffset = 0;
3989   if (HasVIndex) {
3990     VIndex = MI.getOperand(3).getReg();
3991     OpOffset = 1;
3992   } else {
3993     VIndex = B.buildConstant(S32, 0).getReg(0);
3994   }
3995 
3996   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3997   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3998 
3999   unsigned Format = 0;
4000   if (IsTyped) {
4001     Format = MI.getOperand(5 + OpOffset).getImm();
4002     ++OpOffset;
4003   }
4004 
4005   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4006   unsigned ImmOffset;
4007 
4008   LLT Ty = MRI.getType(Dst);
4009   LLT EltTy = Ty.getScalarType();
4010   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4011   const bool Unpacked = ST.hasUnpackedD16VMem();
4012 
4013   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4014   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4015 
4016   unsigned Opc;
4017 
4018   if (IsTyped) {
4019     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4020                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4021   } else if (IsFormat) {
4022     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4023                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4024   } else {
4025     switch (MemTy.getSizeInBits()) {
4026     case 8:
4027       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4028       break;
4029     case 16:
4030       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4031       break;
4032     default:
4033       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4034       break;
4035     }
4036   }
4037 
4038   Register LoadDstReg;
4039 
4040   bool IsExtLoad =
4041       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4042   LLT UnpackedTy = Ty.changeElementSize(32);
4043 
4044   if (IsExtLoad)
4045     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4046   else if (Unpacked && IsD16 && Ty.isVector())
4047     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4048   else
4049     LoadDstReg = Dst;
4050 
4051   auto MIB = B.buildInstr(Opc)
4052     .addDef(LoadDstReg)         // vdata
4053     .addUse(RSrc)               // rsrc
4054     .addUse(VIndex)             // vindex
4055     .addUse(VOffset)            // voffset
4056     .addUse(SOffset)            // soffset
4057     .addImm(ImmOffset);         // offset(imm)
4058 
4059   if (IsTyped)
4060     MIB.addImm(Format);
4061 
4062   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4063      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4064      .addMemOperand(MMO);
4065 
4066   if (LoadDstReg != Dst) {
4067     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4068 
    // The result was widened for an extending load; truncate it back down.
4070     if (IsExtLoad)
4071       B.buildTrunc(Dst, LoadDstReg);
4072     else {
4073       // Repack to original 16-bit vector result
4074       // FIXME: G_TRUNC should work, but legalization currently fails
4075       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4076       SmallVector<Register, 4> Repack;
4077       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4078         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4079       B.buildMerge(Dst, Repack);
4080     }
4081   }
4082 
4083   MI.eraseFromParent();
4084   return true;
4085 }
4086 
4087 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4088                                                MachineIRBuilder &B,
4089                                                bool IsInc) const {
4090   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4091                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4092   B.buildInstr(Opc)
4093     .addDef(MI.getOperand(0).getReg())
4094     .addUse(MI.getOperand(2).getReg())
4095     .addUse(MI.getOperand(3).getReg())
4096     .cloneMemRefs(MI);
4097   MI.eraseFromParent();
4098   return true;
4099 }
4100 
4101 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4102   switch (IntrID) {
4103   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4104   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4105     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4106   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4107   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4108     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4109   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4110   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4111     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4112   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4113   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4114     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4115   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4116   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4117     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4118   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4119   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4120     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4121   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4122   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4123     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4124   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4125   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4126     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4127   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4128   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4130   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4131   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4132     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4133   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4134   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4135     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4136   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4137   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4138     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4139   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4140   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4141     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4142   case Intrinsic::amdgcn_buffer_atomic_fadd:
4143   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4144   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4145     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4146   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4147   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4148     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4149   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4150   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4151     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4152   default:
4153     llvm_unreachable("unhandled atomic opcode");
4154   }
4155 }
4156 
4157 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4158                                                MachineIRBuilder &B,
4159                                                Intrinsic::ID IID) const {
4160   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4161                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4162   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4163 
4164   Register Dst;
4165 
4166   int OpOffset = 0;
4167   if (HasReturn) {
4168     // A few FP atomics do not support return values.
4169     Dst = MI.getOperand(0).getReg();
4170   } else {
4171     OpOffset = -1;
4172   }
4173 
4174   Register VData = MI.getOperand(2 + OpOffset).getReg();
4175   Register CmpVal;
4176 
4177   if (IsCmpSwap) {
4178     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4179     ++OpOffset;
4180   }
4181 
4182   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4183   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4184 
4185   // The struct intrinsic variants add one additional operand over raw.
4186   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4187   Register VIndex;
4188   if (HasVIndex) {
4189     VIndex = MI.getOperand(4 + OpOffset).getReg();
4190     ++OpOffset;
4191   } else {
4192     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4193   }
4194 
4195   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4196   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4197   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4198 
4199   MachineMemOperand *MMO = *MI.memoperands_begin();
4200 
4201   unsigned ImmOffset;
4202   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4203   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4204 
4205   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4206 
4207   if (HasReturn)
4208     MIB.addDef(Dst);
4209 
4210   MIB.addUse(VData); // vdata
4211 
4212   if (IsCmpSwap)
4213     MIB.addReg(CmpVal);
4214 
4215   MIB.addUse(RSrc)               // rsrc
4216      .addUse(VIndex)             // vindex
4217      .addUse(VOffset)            // voffset
4218      .addUse(SOffset)            // soffset
4219      .addImm(ImmOffset)          // offset(imm)
4220      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4221      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4222      .addMemOperand(MMO);
4223 
4224   MI.eraseFromParent();
4225   return true;
4226 }
4227 
4228 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4229 /// vector with s16 typed elements.
4230 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4231                                       SmallVectorImpl<Register> &PackedAddrs,
4232                                       unsigned ArgOffset,
4233                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4234                                       bool IsA16, bool IsG16) {
4235   const LLT S16 = LLT::scalar(16);
4236   const LLT V2S16 = LLT::fixed_vector(2, 16);
4237   auto EndIdx = Intr->VAddrEnd;
4238 
4239   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4240     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4241     if (!SrcOp.isReg())
4242       continue; // _L to _LZ may have eliminated this.
4243 
4244     Register AddrReg = SrcOp.getReg();
4245 
4246     if ((I < Intr->GradientStart) ||
4247         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4248         (I >= Intr->CoordStart && !IsA16)) {
4249       // Handle any gradient or coordinate operands that should not be packed
4250       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4251       PackedAddrs.push_back(AddrReg);
4252     } else {
4253       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4254       // derivatives dx/dh and dx/dv are packed with undef.
4255       if (((I + 1) >= EndIdx) ||
4256           ((Intr->NumGradients / 2) % 2 == 1 &&
4257            (I == static_cast<unsigned>(Intr->GradientStart +
4258                                        (Intr->NumGradients / 2) - 1) ||
4259             I == static_cast<unsigned>(Intr->GradientStart +
4260                                        Intr->NumGradients - 1))) ||
4261           // Check for _L to _LZ optimization
4262           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4263         PackedAddrs.push_back(
4264             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4265                 .getReg(0));
4266       } else {
4267         PackedAddrs.push_back(
4268             B.buildBuildVector(
4269                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4270                 .getReg(0));
4271         ++I;
4272       }
4273     }
4274   }
4275 }
4276 
4277 /// Convert from separate vaddr components to a single vector address register,
4278 /// and replace the remaining operands with $noreg.
4279 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4280                                      int DimIdx, int NumVAddrs) {
4281   const LLT S32 = LLT::scalar(32);
4282 
4283   SmallVector<Register, 8> AddrRegs;
4284   for (int I = 0; I != NumVAddrs; ++I) {
4285     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4286     if (SrcOp.isReg()) {
4287       AddrRegs.push_back(SrcOp.getReg());
4288       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4289     }
4290   }
4291 
4292   int NumAddrRegs = AddrRegs.size();
4293   if (NumAddrRegs != 1) {
4294     // Above 8 elements round up to next power of 2 (i.e. 16).
4295     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4296       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4297       auto Undef = B.buildUndef(S32);
4298       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4299       NumAddrRegs = RoundedNumRegs;
4300     }
4301 
4302     auto VAddr =
4303         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4304     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4305   }
4306 
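       // Clear the remaining scalar address operands; $noreg marks them as unused
       // for the rest of legalization and selection.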
4307   for (int I = 1; I != NumVAddrs; ++I) {
4308     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4309     if (SrcOp.isReg())
4310       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4311   }
4312 }
4313 
4314 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4315 ///
4316 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4317 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4318 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4319 /// registers.
4320 ///
4321 /// We don't want to directly select image instructions just yet, but we also
4322 /// want to expose all register repacking to the legalizer/combiners. We also
4323 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4324 /// defining a multitude of intermediate image instructions, directly hack on
4325 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4326 /// replacing the now-unnecessary arguments with $noreg.
4327 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4328     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4329     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4330 
4331   const unsigned NumDefs = MI.getNumExplicitDefs();
4332   const unsigned ArgOffset = NumDefs + 1;
4333   bool IsTFE = NumDefs == 2;
4334   // We are only processing the operands of d16 image operations on subtargets
4335   // that use the unpacked register layout, or need to repack the TFE result.
4336 
4337   // TODO: Do we need to guard against already legalized intrinsics?
4338   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4339       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4340 
4341   MachineRegisterInfo *MRI = B.getMRI();
4342   const LLT S32 = LLT::scalar(32);
4343   const LLT S16 = LLT::scalar(16);
4344   const LLT V2S16 = LLT::fixed_vector(2, 16);
4345 
4346   unsigned DMask = 0;
4347 
4348   // Check whether the gradient and coordinate operands are 16-bit.
4349   LLT GradTy =
4350       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4351   LLT AddrTy =
4352       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4353   const bool IsG16 = GradTy == S16;
4354   const bool IsA16 = AddrTy == S16;
4355 
4356   int DMaskLanes = 0;
4357   if (!BaseOpcode->Atomic) {
4358     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4359     if (BaseOpcode->Gather4) {
4360       DMaskLanes = 4;
4361     } else if (DMask != 0) {
4362       DMaskLanes = countPopulation(DMask);
4363     } else if (!IsTFE && !BaseOpcode->Store) {
4364       // If dmask is 0, this is a no-op load. This can be eliminated.
4365       B.buildUndef(MI.getOperand(0));
4366       MI.eraseFromParent();
4367       return true;
4368     }
4369   }
4370 
4371   Observer.changingInstr(MI);
4372   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4373 
4374   unsigned NewOpcode = NumDefs == 0 ?
4375     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4376 
4377   // Track that we legalized this
4378   MI.setDesc(B.getTII().get(NewOpcode));
4379 
4380   // Expecting to get an error flag since TFE is on and dmask is 0. Force dmask
4381   // to be at least 1, otherwise the instruction will fail.
4382   if (IsTFE && DMask == 0) {
4383     DMask = 0x1;
4384     DMaskLanes = 1;
4385     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4386   }
4387 
4388   if (BaseOpcode->Atomic) {
4389     Register VData0 = MI.getOperand(2).getReg();
4390     LLT Ty = MRI->getType(VData0);
4391 
4392     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4393     if (Ty.isVector())
4394       return false;
4395 
4396     if (BaseOpcode->AtomicX2) {
4397       Register VData1 = MI.getOperand(3).getReg();
4398       // The two values are packed in one register.
4399       LLT PackedTy = LLT::fixed_vector(2, Ty);
4400       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4401       MI.getOperand(2).setReg(Concat.getReg(0));
4402       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4403     }
4404   }
4405 
4406   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4407 
4408   // Optimize _L to _LZ when the constant LOD is zero (or negative).
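       // For example (illustrative): image_sample_l with lod == 0.0 becomes
       // image_sample_lz with the lod operand removed.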
4409   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4410           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4411     const ConstantFP *ConstantLod;
4412 
4413     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4414                  m_GFCst(ConstantLod))) {
4415       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4416         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4417         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4418             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4419                                                       Intr->Dim);
4420 
4421         // The starting indexes should remain in the same place.
4422         --CorrectedNumVAddrs;
4423 
4424         MI.getOperand(MI.getNumExplicitDefs())
4425             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4426         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4427         Intr = NewImageDimIntr;
4428       }
4429     }
4430   }
4431 
4432   // Optimize _mip away when 'lod' is zero.
4433   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4434     int64_t ConstantLod;
4435     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4436                  m_ICst(ConstantLod))) {
4437       if (ConstantLod == 0) {
4438         // TODO: Change the intrinsic opcode and remove the operand instead
4439         // of replacing it with 0, as is done for the _L to _LZ handling above.
4440         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4441         --CorrectedNumVAddrs;
4442       }
4443     }
4444   }
4445 
4446   // Rewrite the addressing register layout before doing anything else.
4447   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4448     // 16-bit gradients are supported, but are tied to the A16 control, so
4449     // both gradients and addresses must be 16-bit.
4450     return false;
4451   }
4452 
4453   if (IsA16 && !ST.hasA16()) {
4454     // A16 not supported
4455     return false;
4456   }
4457 
4458   if (IsA16 || IsG16) {
4459     if (Intr->NumVAddrs > 1) {
4460       SmallVector<Register, 4> PackedRegs;
4461 
4462       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4463                                 IsG16);
4464 
4465       // See also below in the non-a16 branch
4466       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4467                           PackedRegs.size() <= ST.getNSAMaxSize();
4468 
4469       if (!UseNSA && PackedRegs.size() > 1) {
4470         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4471         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4472         PackedRegs[0] = Concat.getReg(0);
4473         PackedRegs.resize(1);
4474       }
4475 
4476       const unsigned NumPacked = PackedRegs.size();
4477       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4478         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4479         if (!SrcOp.isReg()) {
4480           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4481           continue;
4482         }
4483 
4484         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4485 
4486         if (I - Intr->VAddrStart < NumPacked)
4487           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4488         else
4489           SrcOp.setReg(AMDGPU::NoRegister);
4490       }
4491     }
4492   } else {
4493     // If the register allocator cannot place the address registers contiguously
4494     // without introducing moves, then using the non-sequential address encoding
4495     // is always preferable, since it saves VALU instructions and is usually a
4496     // wash in terms of code size or even better.
4497     //
4498     // However, we currently have no way of hinting to the register allocator
4499     // that MIMG addresses should be placed contiguously when it is possible to
4500     // do so, so force non-NSA for the common 2-address case as a heuristic.
4501     //
4502     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4503     // allocation when possible.
4504     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4505                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4506 
4507     if (!UseNSA && Intr->NumVAddrs > 1)
4508       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4509                                Intr->NumVAddrs);
4510   }
4511 
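       // Record the 16-bit address modes as a trailing immediate: bit 0 = A16,
       // bit 1 = G16. This is read back when the image instruction is selected.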
4512   int Flags = 0;
4513   if (IsA16)
4514     Flags |= 1;
4515   if (IsG16)
4516     Flags |= 2;
4517   MI.addOperand(MachineOperand::CreateImm(Flags));
4518 
4519   if (BaseOpcode->Store) { // No TFE for stores?
4520     // TODO: Handle dmask trim
4521     Register VData = MI.getOperand(1).getReg();
4522     LLT Ty = MRI->getType(VData);
4523     if (!Ty.isVector() || Ty.getElementType() != S16)
4524       return true;
4525 
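         // Repack the d16 store data into the register layout the subtarget
         // expects (e.g. widened to 32-bit elements on unpacked-d16 subtargets).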
4526     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4527     if (RepackedReg != VData) {
4528       MI.getOperand(1).setReg(RepackedReg);
4529     }
4530 
4531     return true;
4532   }
4533 
4534   Register DstReg = MI.getOperand(0).getReg();
4535   LLT Ty = MRI->getType(DstReg);
4536   const LLT EltTy = Ty.getScalarType();
4537   const bool IsD16 = Ty.getScalarType() == S16;
4538   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4539 
4540   // Confirm that the return type is large enough for the dmask specified
4541   if (NumElts < DMaskLanes)
4542     return false;
4543 
4544   if (NumElts > 4 || DMaskLanes > 4)
4545     return false;
4546 
4547   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4548   const LLT AdjustedTy =
4549       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4550 
4551   // The raw dword-aligned data component of the load. The only legal cases
4552   // where this matters should be when using the packed D16 format, for
4553   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4554   LLT RoundedTy;
4555 
4556   // S32 vector to cover all data, plus the TFE result element.
4557   LLT TFETy;
4558 
4559   // Register type to use for each loaded component. Will be S32 or V2S16.
4560   LLT RegTy;
4561 
4562   if (IsD16 && ST.hasUnpackedD16VMem()) {
4563     RoundedTy =
4564         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4565     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4566     RegTy = S32;
4567   } else {
4568     unsigned EltSize = EltTy.getSizeInBits();
4569     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4570     unsigned RoundedSize = 32 * RoundedElts;
4571     RoundedTy = LLT::scalarOrVector(
4572         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4573     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4574     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4575   }
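
       // Worked example (illustrative): a d16 <3 x s16> result with TFE on a
       // packed-d16 subtarget gives RoundedTy = <4 x s16>, TFETy = <3 x s32>
       // (two data dwords plus the TFE dword) and RegTy = s32.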
4576 
4577   // The return type does not need adjustment.
4578   // TODO: Should we change s16 case to s32 or <2 x s16>?
4579   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4580     return true;
4581 
4582   Register Dst1Reg;
4583 
4584   // Insert after the instruction.
4585   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4586 
4587   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4588   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4589   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4590   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4591 
4592   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4593 
4594   MI.getOperand(0).setReg(NewResultReg);
4595 
4596   // In the IR, TFE is supposed to be used with a 2 element struct return
4597   // type. The instruction really returns these two values in one contiguous
4598   // register, with one additional dword beyond the loaded data. Rewrite the
4599   // return type to use a single register result.
4600 
4601   if (IsTFE) {
4602     Dst1Reg = MI.getOperand(1).getReg();
4603     if (MRI->getType(Dst1Reg) != S32)
4604       return false;
4605 
4606     // TODO: Make sure the TFE operand bit is set.
4607     MI.RemoveOperand(1);
4608 
4609     // Handle the easy case that requires no repack instructions.
4610     if (Ty == S32) {
4611       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4612       return true;
4613     }
4614   }
4615 
4616   // Now figure out how to copy the new result register back into the old
4617   // result.
4618   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4619 
4620   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4621 
4622   if (ResultNumRegs == 1) {
4623     assert(!IsTFE);
4624     ResultRegs[0] = NewResultReg;
4625   } else {
4626     // We have to repack into a new vector of some kind.
4627     for (int I = 0; I != NumDataRegs; ++I)
4628       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4629     B.buildUnmerge(ResultRegs, NewResultReg);
4630 
4631     // Drop the final TFE element to get the data part. The TFE result is
4632     // directly written to the right place already.
4633     if (IsTFE)
4634       ResultRegs.resize(NumDataRegs);
4635   }
4636 
4637   // For an s16 scalar result, we form an s32 result with a truncate regardless
4638   // of packed vs. unpacked.
4639   if (IsD16 && !Ty.isVector()) {
4640     B.buildTrunc(DstReg, ResultRegs[0]);
4641     return true;
4642   }
4643 
4644   // Avoid a build/concat_vector of 1 entry.
4645   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4646     B.buildBitcast(DstReg, ResultRegs[0]);
4647     return true;
4648   }
4649 
4650   assert(Ty.isVector());
4651 
4652   if (IsD16) {
4653     // For packed D16 results with TFE enabled, all the data components are
4654     // S32. Cast back to the expected type.
4655     //
4656     // TODO: We don't really need to load s32 elements. We would only need one
4657     // cast for the TFE result if a multiple of v2s16 was used.
4658     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4659       for (Register &Reg : ResultRegs)
4660         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4661     } else if (ST.hasUnpackedD16VMem()) {
4662       for (Register &Reg : ResultRegs)
4663         Reg = B.buildTrunc(S16, Reg).getReg(0);
4664     }
4665   }
4666 
4667   auto padWithUndef = [&](LLT Ty, int NumElts) {
4668     if (NumElts == 0)
4669       return;
4670     Register Undef = B.buildUndef(Ty).getReg(0);
4671     for (int I = 0; I != NumElts; ++I)
4672       ResultRegs.push_back(Undef);
4673   };
4674 
4675   // Pad out any elements eliminated due to the dmask.
4676   LLT ResTy = MRI->getType(ResultRegs[0]);
4677   if (!ResTy.isVector()) {
4678     padWithUndef(ResTy, NumElts - ResultRegs.size());
4679     B.buildBuildVector(DstReg, ResultRegs);
4680     return true;
4681   }
4682 
4683   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4684   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4685 
4686   // Deal with the one annoying legal case.
4687   const LLT V3S16 = LLT::fixed_vector(3, 16);
4688   if (Ty == V3S16) {
4689     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4690     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4691     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4692     return true;
4693   }
4694 
4695   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4696   B.buildConcatVectors(DstReg, ResultRegs);
4697   return true;
4698 }
4699 
4700 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4701   LegalizerHelper &Helper, MachineInstr &MI) const {
4702   MachineIRBuilder &B = Helper.MIRBuilder;
4703   GISelChangeObserver &Observer = Helper.Observer;
4704 
4705   Register Dst = MI.getOperand(0).getReg();
4706   LLT Ty = B.getMRI()->getType(Dst);
4707   unsigned Size = Ty.getSizeInBits();
4708   MachineFunction &MF = B.getMF();
4709 
4710   Observer.changingInstr(MI);
4711 
4712   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4713     Ty = getBitcastRegisterType(Ty);
4714     Helper.bitcastDst(MI, Ty, 0);
4715     Dst = MI.getOperand(0).getReg();
4716     B.setInsertPt(B.getMBB(), MI);
4717   }
4718 
4719   // FIXME: We don't really need this intermediate instruction. The intrinsic
4720   // should be fixed to have a memory operand. Since it's readnone, we're not
4721   // allowed to add one.
4722   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4723   MI.RemoveOperand(1); // Remove intrinsic ID
4724 
4725   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4726   // TODO: Should this use datalayout alignment?
4727   const unsigned MemSize = (Size + 7) / 8;
4728   const Align MemAlign(4);
4729   MachineMemOperand *MMO = MF.getMachineMemOperand(
4730       MachinePointerInfo(),
4731       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4732           MachineMemOperand::MOInvariant,
4733       MemSize, MemAlign);
4734   MI.addMemOperand(MF, MMO);
4735 
4736   // There are no 96-bit result scalar loads, but widening to 128-bit should
4737   // always be legal. We may need to restore this to a 96-bit result if it turns
4738   // out this needs to be converted to a vector load during RegBankSelect.
4739   if (!isPowerOf2_32(Size)) {
4740     if (Ty.isVector())
4741       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4742     else
4743       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4744   }
4745 
4746   Observer.changedInstr(MI);
4747   return true;
4748 }
4749 
4750 // TODO: Move to selection
4751 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4752                                                 MachineRegisterInfo &MRI,
4753                                                 MachineIRBuilder &B) const {
4754   if (!ST.isTrapHandlerEnabled() ||
4755       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4756     return legalizeTrapEndpgm(MI, MRI, B);
4757 
4758   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4759     switch (*HsaAbiVer) {
4760     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4761     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4762       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4763     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4764       return ST.supportsGetDoorbellID() ?
4765           legalizeTrapHsa(MI, MRI, B) :
4766           legalizeTrapHsaQueuePtr(MI, MRI, B);
4767     }
4768   }
4769 
4770   llvm_unreachable("Unknown trap handler");
4771 }
4772 
4773 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4774     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4776   MI.eraseFromParent();
4777   return true;
4778 }
4779 
4780 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4781     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4782   // Pass queue pointer to trap handler as input, and insert trap instruction
4783   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4784   Register LiveIn =
4785     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4786   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4787     return false;
4788 
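       // Per the trap handler ABI referenced above, the queue pointer is passed
       // to the trap handler in SGPR0_SGPR1.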
4789   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4790   B.buildCopy(SGPR01, LiveIn);
4791   B.buildInstr(AMDGPU::S_TRAP)
4792       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4793       .addReg(SGPR01, RegState::Implicit);
4794 
4795   MI.eraseFromParent();
4796   return true;
4797 }
4798 
4799 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4800     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4801   B.buildInstr(AMDGPU::S_TRAP)
4802       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4803   MI.eraseFromParent();
4804   return true;
4805 }
4806 
4807 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4808     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4809   // If this is a non-HSA path or the trap handler is disabled, report a
4810   // warning accordingly.
4811   if (!ST.isTrapHandlerEnabled() ||
4812       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4813     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4814                                      "debugtrap handler not supported",
4815                                      MI.getDebugLoc(), DS_Warning);
4816     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4817     Ctx.diagnose(NoTrap);
4818   } else {
4819     // Insert debug-trap instruction
4820     B.buildInstr(AMDGPU::S_TRAP)
4821         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4822   }
4823 
4824   MI.eraseFromParent();
4825   return true;
4826 }
4827 
4828 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4829                                                MachineIRBuilder &B) const {
4830   MachineRegisterInfo &MRI = *B.getMRI();
4831   const LLT S16 = LLT::scalar(16);
4832   const LLT S32 = LLT::scalar(32);
4833 
4834   Register DstReg = MI.getOperand(0).getReg();
4835   Register NodePtr = MI.getOperand(2).getReg();
4836   Register RayExtent = MI.getOperand(3).getReg();
4837   Register RayOrigin = MI.getOperand(4).getReg();
4838   Register RayDir = MI.getOperand(5).getReg();
4839   Register RayInvDir = MI.getOperand(6).getReg();
4840   Register TDescr = MI.getOperand(7).getReg();
4841 
4842   if (!ST.hasGFX10_AEncoding()) {
4843     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4844                                         "intrinsic not supported on subtarget",
4845                                         MI.getDebugLoc());
4846     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4847     return false;
4848   }
4849 
4850   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4851   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
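       // The result is always 4 dwords. The vaddr count is node_ptr (1 or 2) +
       // ray_extent (1) + ray_origin (3) + ray_dir / ray_inv_dir (3 dwords of
       // packed halves with a16, otherwise 6).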
4852   const unsigned NumVDataDwords = 4;
4853   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4854   const bool UseNSA =
4855       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4856   const unsigned BaseOpcodes[2][2] = {
4857       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4858       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4859        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4860   int Opcode;
4861   if (UseNSA) {
4862     Opcode =
4863         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4864                               NumVDataDwords, NumVAddrDwords);
4865   } else {
4866     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4867                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4868                                    PowerOf2Ceil(NumVAddrDwords));
4869   }
4870   assert(Opcode != -1);
4871 
4872   SmallVector<Register, 12> Ops;
4873   if (Is64) {
4874     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4875     Ops.push_back(Unmerge.getReg(0));
4876     Ops.push_back(Unmerge.getReg(1));
4877   } else {
4878     Ops.push_back(NodePtr);
4879   }
4880   Ops.push_back(RayExtent);
4881 
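       // Push the x, y and z lanes of a <4 x s32> source; the w lane is unused.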
4882   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4883     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4884     Ops.push_back(Unmerge.getReg(0));
4885     Ops.push_back(Unmerge.getReg(1));
4886     Ops.push_back(Unmerge.getReg(2));
4887   };
4888 
4889   packLanes(RayOrigin);
4890   if (IsA16) {
4891     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4892     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4893     Register R1 = MRI.createGenericVirtualRegister(S32);
4894     Register R2 = MRI.createGenericVirtualRegister(S32);
4895     Register R3 = MRI.createGenericVirtualRegister(S32);
4896     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4897     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4898     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4899     Ops.push_back(R1);
4900     Ops.push_back(R2);
4901     Ops.push_back(R3);
4902   } else {
4903     packLanes(RayDir);
4904     packLanes(RayInvDir);
4905   }
4906 
4907   if (!UseNSA) {
4908     // Build a single vector containing all the operands prepared so far.
4909     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4910     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4911     Ops.clear();
4912     Ops.push_back(MergedOps);
4913   }
4914 
4915   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4916     .addDef(DstReg)
4917     .addImm(Opcode);
4918 
4919   for (Register R : Ops) {
4920     MIB.addUse(R);
4921   }
4922 
4923   MIB.addUse(TDescr)
4924      .addImm(IsA16 ? 1 : 0)
4925      .cloneMemRefs(MI);
4926 
4927   MI.eraseFromParent();
4928   return true;
4929 }
4930 
4931 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4932                                             MachineInstr &MI) const {
4933   MachineIRBuilder &B = Helper.MIRBuilder;
4934   MachineRegisterInfo &MRI = *B.getMRI();
4935 
4936   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4937   auto IntrID = MI.getIntrinsicID();
4938   switch (IntrID) {
4939   case Intrinsic::amdgcn_if:
4940   case Intrinsic::amdgcn_else: {
4941     MachineInstr *Br = nullptr;
4942     MachineBasicBlock *UncondBrTarget = nullptr;
4943     bool Negated = false;
4944     if (MachineInstr *BrCond =
4945             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4946       const SIRegisterInfo *TRI
4947         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4948 
4949       Register Def = MI.getOperand(1).getReg();
4950       Register Use = MI.getOperand(3).getReg();
4951 
4952       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4953 
4954       if (Negated)
4955         std::swap(CondBrTarget, UncondBrTarget);
4956 
4957       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4958       if (IntrID == Intrinsic::amdgcn_if) {
4959         B.buildInstr(AMDGPU::SI_IF)
4960           .addDef(Def)
4961           .addUse(Use)
4962           .addMBB(UncondBrTarget);
4963       } else {
4964         B.buildInstr(AMDGPU::SI_ELSE)
4965             .addDef(Def)
4966             .addUse(Use)
4967             .addMBB(UncondBrTarget);
4968       }
4969 
4970       if (Br) {
4971         Br->getOperand(0).setMBB(CondBrTarget);
4972       } else {
4973         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4974         // since we're swapping branch targets it needs to be reinserted.
4975         // FIXME: IRTranslator should probably not do this
4976         B.buildBr(*CondBrTarget);
4977       }
4978 
4979       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4980       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4981       MI.eraseFromParent();
4982       BrCond->eraseFromParent();
4983       return true;
4984     }
4985 
4986     return false;
4987   }
4988   case Intrinsic::amdgcn_loop: {
4989     MachineInstr *Br = nullptr;
4990     MachineBasicBlock *UncondBrTarget = nullptr;
4991     bool Negated = false;
4992     if (MachineInstr *BrCond =
4993             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4994       const SIRegisterInfo *TRI
4995         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4996 
4997       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4998       Register Reg = MI.getOperand(2).getReg();
4999 
5000       if (Negated)
5001         std::swap(CondBrTarget, UncondBrTarget);
5002 
5003       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004       B.buildInstr(AMDGPU::SI_LOOP)
5005         .addUse(Reg)
5006         .addMBB(UncondBrTarget);
5007 
5008       if (Br)
5009         Br->getOperand(0).setMBB(CondBrTarget);
5010       else
5011         B.buildBr(*CondBrTarget);
5012 
5013       MI.eraseFromParent();
5014       BrCond->eraseFromParent();
5015       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5016       return true;
5017     }
5018 
5019     return false;
5020   }
5021   case Intrinsic::amdgcn_kernarg_segment_ptr:
5022     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5023       // This only makes sense to call in a kernel, so just lower to null.
5024       B.buildConstant(MI.getOperand(0).getReg(), 0);
5025       MI.eraseFromParent();
5026       return true;
5027     }
5028 
5029     return legalizePreloadedArgIntrin(
5030       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5031   case Intrinsic::amdgcn_implicitarg_ptr:
5032     return legalizeImplicitArgPtr(MI, MRI, B);
5033   case Intrinsic::amdgcn_workitem_id_x:
5034     return legalizePreloadedArgIntrin(MI, MRI, B,
5035                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5036   case Intrinsic::amdgcn_workitem_id_y:
5037     return legalizePreloadedArgIntrin(MI, MRI, B,
5038                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5039   case Intrinsic::amdgcn_workitem_id_z:
5040     return legalizePreloadedArgIntrin(MI, MRI, B,
5041                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5042   case Intrinsic::amdgcn_workgroup_id_x:
5043     return legalizePreloadedArgIntrin(MI, MRI, B,
5044                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5045   case Intrinsic::amdgcn_workgroup_id_y:
5046     return legalizePreloadedArgIntrin(MI, MRI, B,
5047                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5048   case Intrinsic::amdgcn_workgroup_id_z:
5049     return legalizePreloadedArgIntrin(MI, MRI, B,
5050                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5051   case Intrinsic::amdgcn_dispatch_ptr:
5052     return legalizePreloadedArgIntrin(MI, MRI, B,
5053                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5054   case Intrinsic::amdgcn_queue_ptr:
5055     return legalizePreloadedArgIntrin(MI, MRI, B,
5056                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5057   case Intrinsic::amdgcn_implicit_buffer_ptr:
5058     return legalizePreloadedArgIntrin(
5059       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5060   case Intrinsic::amdgcn_dispatch_id:
5061     return legalizePreloadedArgIntrin(MI, MRI, B,
5062                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5063   case Intrinsic::amdgcn_fdiv_fast:
5064     return legalizeFDIVFastIntrin(MI, MRI, B);
5065   case Intrinsic::amdgcn_is_shared:
5066     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5067   case Intrinsic::amdgcn_is_private:
5068     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5069   case Intrinsic::amdgcn_wavefrontsize: {
5070     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5071     MI.eraseFromParent();
5072     return true;
5073   }
5074   case Intrinsic::amdgcn_s_buffer_load:
5075     return legalizeSBufferLoad(Helper, MI);
5076   case Intrinsic::amdgcn_raw_buffer_store:
5077   case Intrinsic::amdgcn_struct_buffer_store:
5078     return legalizeBufferStore(MI, MRI, B, false, false);
5079   case Intrinsic::amdgcn_raw_buffer_store_format:
5080   case Intrinsic::amdgcn_struct_buffer_store_format:
5081     return legalizeBufferStore(MI, MRI, B, false, true);
5082   case Intrinsic::amdgcn_raw_tbuffer_store:
5083   case Intrinsic::amdgcn_struct_tbuffer_store:
5084     return legalizeBufferStore(MI, MRI, B, true, true);
5085   case Intrinsic::amdgcn_raw_buffer_load:
5086   case Intrinsic::amdgcn_struct_buffer_load:
5087     return legalizeBufferLoad(MI, MRI, B, false, false);
5088   case Intrinsic::amdgcn_raw_buffer_load_format:
5089   case Intrinsic::amdgcn_struct_buffer_load_format:
5090     return legalizeBufferLoad(MI, MRI, B, true, false);
5091   case Intrinsic::amdgcn_raw_tbuffer_load:
5092   case Intrinsic::amdgcn_struct_tbuffer_load:
5093     return legalizeBufferLoad(MI, MRI, B, true, true);
5094   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5095   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5096   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5097   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5098   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5099   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5100   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5101   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5102   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5103   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5104   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5105   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5106   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5107   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5108   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5109   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5110   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5111   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5112   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5113   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5114   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5115   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5116   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5117   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5118   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5119   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5120   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5121   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5122   case Intrinsic::amdgcn_buffer_atomic_fadd:
5123   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5124   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5125   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5126   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5127     return legalizeBufferAtomic(MI, B, IntrID);
5128   case Intrinsic::amdgcn_atomic_inc:
5129     return legalizeAtomicIncDec(MI, B, true);
5130   case Intrinsic::amdgcn_atomic_dec:
5131     return legalizeAtomicIncDec(MI, B, false);
5132   case Intrinsic::trap:
5133     return legalizeTrapIntrinsic(MI, MRI, B);
5134   case Intrinsic::debugtrap:
5135     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5136   case Intrinsic::amdgcn_rsq_clamp:
5137     return legalizeRsqClampIntrinsic(MI, MRI, B);
5138   case Intrinsic::amdgcn_ds_fadd:
5139   case Intrinsic::amdgcn_ds_fmin:
5140   case Intrinsic::amdgcn_ds_fmax:
5141     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5142   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5143     return legalizeBVHIntrinsic(MI, B);
5144   default: {
5145     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5146             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5147       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5148     return true;
5149   }
5150   }
5151 
5152   return true;
5153 }
5154