1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
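// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.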
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
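// e.g. s24 becomes s32 and s65 becomes s128.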
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
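// Break a wide vector down to pieces of at most 64 bits each,
// e.g. <4 x s32> is split into <2 x s32> pieces.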
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
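// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits), and <3 x s16>
// (48 bits) becomes <4 x s16> (64 bits).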
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
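  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>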
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
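// e.g. s32, s64, <2 x s16>, <4 x s16>, <2 x s32> and so on, up to 1024 bits.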
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
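// For example, an extending load from s16 memory into an s64 register is
// narrowed so that it produces an s32 result instead.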
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
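/// For example, a 96-bit load that is known to be 128-bit aligned may be
/// widened to a 128-bit load on subtargets without 96-bit memory operations.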
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .scalarize(0)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32);
936 
937   // The hardware instructions return a different result on 0 than the generic
938   // instructions expect. The hardware produces -1, but these produce the
939   // bitwidth.
940   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941     .scalarize(0)
942     .clampScalar(0, S32, S32)
943     .clampScalar(1, S32, S64)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32)
946     .custom();
947 
948   // The 64-bit versions produce 32-bit results, but only on the SALU.
949   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950     .legalFor({{S32, S32}, {S32, S64}})
951     .clampScalar(0, S32, S32)
952     .clampScalar(1, S32, S64)
953     .scalarize(0)
954     .widenScalarToNextPow2(0, 32)
955     .widenScalarToNextPow2(1, 32);
956 
957   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958   // RegBankSelect.
959   getActionDefinitionsBuilder(G_BITREVERSE)
960     .legalFor({S32, S64})
961     .clampScalar(0, S32, S64)
962     .scalarize(0)
963     .widenScalarToNextPow2(0);
964 
965   if (ST.has16BitInsts()) {
966     getActionDefinitionsBuilder(G_BSWAP)
967       .legalFor({S16, S32, V2S16})
968       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, S16, S32)
973       .scalarize(0);
974 
975     if (ST.hasVOP3PInsts()) {
976       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977         .legalFor({S32, S16, V2S16})
978         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979         .clampMaxNumElements(0, S16, 2)
980         .minScalar(0, S16)
981         .widenScalarToNextPow2(0)
982         .scalarize(0)
983         .lower();
984     } else {
985       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986         .legalFor({S32, S16})
987         .widenScalarToNextPow2(0)
988         .minScalar(0, S16)
989         .scalarize(0)
990         .lower();
991     }
992   } else {
993     // TODO: Should have same legality without v_perm_b32
994     getActionDefinitionsBuilder(G_BSWAP)
995       .legalFor({S32})
996       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
999       .widenScalarToNextPow2(0)
1000       .maxScalar(0, S32)
1001       .scalarize(0)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005       .legalFor({S32})
1006       .minScalar(0, S32)
1007       .widenScalarToNextPow2(0)
1008       .scalarize(0)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_INTTOPTR)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(1, 0),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022       })
1023     .narrowScalarIf(largerThan(1, 0),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_PTRTOINT)
1029     // List the common cases
1030     .legalForCartesianProduct(AddrSpaces64, {S64})
1031     .legalForCartesianProduct(AddrSpaces32, {S32})
1032     .scalarize(0)
1033     // Accept any address space as long as the size matches
1034     .legalIf(sameSize(0, 1))
1035     .widenScalarIf(smallerThan(0, 1),
1036       [](const LegalityQuery &Query) {
1037         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038       })
1039     .narrowScalarIf(
1040       largerThan(0, 1),
1041       [](const LegalityQuery &Query) {
1042         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043       });
1044 
1045   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046     .scalarize(0)
1047     .custom();
1048 
1049   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050                                     bool IsLoad) -> bool {
1051     const LLT DstTy = Query.Types[0];
1052 
1053     // Split vector extloads.
1054     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057       return true;
1058 
1059     const LLT PtrTy = Query.Types[1];
1060     unsigned AS = PtrTy.getAddressSpace();
1061     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062       return true;
1063 
1064     // Catch weird sized loads that don't evenly divide into the access sizes
1065     // TODO: May be able to widen depending on alignment etc.
1066     unsigned NumRegs = (MemSize + 31) / 32;
1067     if (NumRegs == 3) {
1068       if (!ST.hasDwordx3LoadStores())
1069         return true;
1070     } else {
1071       // If the alignment allows, these should have been widened.
1072       if (!isPowerOf2_32(NumRegs))
1073         return true;
1074     }
1075 
1076     return false;
1077   };
1078 
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
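  // A value of 0 here means the corresponding rules impose no minimum
  // alignment requirement on the memory access.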
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096                                       {S64, GlobalPtr, S64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099                                       {S32, GlobalPtr, S8, GlobalAlign8},
1100                                       {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, S32, 32},
1103                                       {S64, LocalPtr, S64, 32},
1104                                       {V2S32, LocalPtr, V2S32, 32},
1105                                       {S32, LocalPtr, S8, 8},
1106                                       {S32, LocalPtr, S16, 16},
1107                                       {V2S16, LocalPtr, S32, 32},
1108 
1109                                       {S32, PrivatePtr, S32, 32},
1110                                       {S32, PrivatePtr, S8, 8},
1111                                       {S32, PrivatePtr, S16, 16},
1112                                       {V2S16, PrivatePtr, S32, 32},
1113 
1114                                       {S32, ConstantPtr, S32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117                                       {S64, ConstantPtr, S64, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
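    // For example, a <4 x s8> access is bitcast to s32 and an <8 x s16>
    // access to <4 x s32>.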
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].MemoryTy);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               unsigned MaxSize = maxSizeForAddrSpace(ST,
1170                                                      PtrTy.getAddressSpace(),
1171                                                      Op == G_LOAD);
1172               if (MemSize > MaxSize)
1173                 return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1176               return std::make_pair(0, LLT::scalar(Align));
1177             })
1178         .fewerElementsIf(
1179             [=](const LegalityQuery &Query) -> bool {
1180               return Query.Types[0].isVector() &&
1181                      needToSplitMemOp(Query, Op == G_LOAD);
1182             },
1183             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184               const LLT DstTy = Query.Types[0];
1185               const LLT PtrTy = Query.Types[1];
1186 
1187               LLT EltTy = DstTy.getElementType();
1188               unsigned MaxSize = maxSizeForAddrSpace(ST,
1189                                                      PtrTy.getAddressSpace(),
1190                                                      Op == G_LOAD);
1191 
1192               // FIXME: Handle widened to power of 2 results better. This ends
1193               // up scalarizing.
1194               // FIXME: 3 element stores scalarized on SI
1195 
1196               // Split if it's too large for the address space.
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198               if (MemSize > MaxSize) {
1199                 unsigned NumElts = DstTy.getNumElements();
1200                 unsigned EltSize = EltTy.getSizeInBits();
1201 
1202                 if (MaxSize % EltSize == 0) {
1203                   return std::make_pair(
1204                       0, LLT::scalarOrVector(
1205                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = MemSize / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(
1217                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               if (DstTy.getSizeInBits() > MemSize)
1222                 return std::make_pair(0, EltTy);
1223 
1224               unsigned EltSize = EltTy.getSizeInBits();
1225               unsigned DstSize = DstTy.getSizeInBits();
1226               if (!isPowerOf2_32(DstSize)) {
1227                 // We're probably decomposing an odd sized store. Try to split
1228                 // to the widest type. TODO: Account for alignment. As-is it
1229                 // should be OK, since the new parts will be further legalized.
1230                 unsigned FloorSize = PowerOf2Floor(DstSize);
1231                 return std::make_pair(
1232                     0, LLT::scalarOrVector(
1233                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234               }
1235 
1236               // May need relegalization for the scalars.
1237               return std::make_pair(0, EltTy);
1238             })
1239     .minScalar(0, S32)
1240     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241     .widenScalarToNextPow2(0)
1242     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243     .lower();
1244   }
1245 
1246   // FIXME: Unaligned accesses not lowered.
1247   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249                                                   {S32, GlobalPtr, S16, 2 * 8},
1250                                                   {S32, LocalPtr, S8, 8},
1251                                                   {S32, LocalPtr, S16, 16},
1252                                                   {S32, PrivatePtr, S8, 8},
1253                                                   {S32, PrivatePtr, S16, 16},
1254                                                   {S32, ConstantPtr, S8, 8},
1255                                                   {S32, ConstantPtr, S16, 2 * 8}})
1256                        .legalIf(
1257                          [=](const LegalityQuery &Query) -> bool {
1258                            return isLoadStoreLegal(ST, Query);
1259                          });
1260 
1261   if (ST.hasFlatAddressSpace()) {
1262     ExtLoads.legalForTypesWithMemDesc(
1263         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264   }
1265 
1266   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267   // 64-bits.
1268   //
1269   // TODO: Should generalize bitcast action into coerce, which will also cover
1270   // inserting addrspacecasts.
1271   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273   ExtLoads.clampScalar(0, S32, S32)
1274           .widenScalarToNextPow2(0)
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290   if (ST.hasLDSFPAtomicAdd()) {
1291     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295   if (ST.hasAtomicFaddInsts())
1296     Atomic.legalFor({{S32, GlobalPtr}});
1297 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310                                  LocalPtr, FlatPtr, PrivatePtr,
1311                                  LLT::fixed_vector(2, LocalPtr),
1312                                  LLT::fixed_vector(2, PrivatePtr)},
1313                                 {S1, S32})
1314       .clampScalar(0, S16, S64)
1315       .scalarize(1)
1316       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318       .clampMaxNumElements(0, S32, 2)
1319       .clampMaxNumElements(0, LocalPtr, 2)
1320       .clampMaxNumElements(0, PrivatePtr, 2)
1321       .scalarize(0)
1322       .widenScalarToNextPow2(0)
1323       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326   // be more flexible with the shift amount type.
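  // (The hardware uses the low 4 bits of the amount for 16-bit shifts, 5
  // bits for 32-bit shifts and 6 bits for 64-bit shifts.)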
1327   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328     .legalFor({{S32, S32}, {S64, S32}});
1329   if (ST.has16BitInsts()) {
1330     if (ST.hasVOP3PInsts()) {
1331       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332             .clampMaxNumElements(0, S16, 2);
1333     } else
1334       Shifts.legalFor({{S16, S16}});
1335 
1336     // TODO: Support 16-bit shift amounts for all types
1337     Shifts.widenScalarIf(
1338       [=](const LegalityQuery &Query) {
1339         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340         // 32-bit amount.
1341         const LLT ValTy = Query.Types[0];
1342         const LLT AmountTy = Query.Types[1];
1343         return ValTy.getSizeInBits() <= 16 &&
1344                AmountTy.getSizeInBits() < 16;
1345       }, changeTo(1, S16));
1346     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347     Shifts.clampScalar(1, S32, S32);
1348     Shifts.clampScalar(0, S16, S64);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350 
1351     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352       .minScalar(0, S16)
1353       .scalarize(0)
1354       .lower();
1355   } else {
1356     // Make sure we legalize the shift amount type first, as the general
1357     // expansion for the shifted type will produce much worse code if it hasn't
1358     // been truncated already.
1359     Shifts.clampScalar(1, S32, S32);
1360     Shifts.clampScalar(0, S32, S64);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362 
1363     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364       .minScalar(0, S32)
1365       .scalarize(0)
1366       .lower();
1367   }
1368   Shifts.scalarize(0);
1369 
1370   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373     unsigned IdxTypeIdx = 2;
1374 
1375     getActionDefinitionsBuilder(Op)
1376       .customIf([=](const LegalityQuery &Query) {
1377           const LLT EltTy = Query.Types[EltTypeIdx];
1378           const LLT VecTy = Query.Types[VecTypeIdx];
1379           const LLT IdxTy = Query.Types[IdxTypeIdx];
1380           const unsigned EltSize = EltTy.getSizeInBits();
1381           return (EltSize == 32 || EltSize == 64) &&
1382                   VecTy.getSizeInBits() % 32 == 0 &&
1383                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1384                   IdxTy.getSizeInBits() == 32;
1385         })
1386       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387                  bitcastToVectorElement32(VecTypeIdx))
1388       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389       .bitcastIf(
1390         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391         [=](const LegalityQuery &Query) {
1392           // For > 64-bit element types, try to turn this into a 64-bit
1393           // element vector since we may be able to do better indexing
1394           // if this is scalar. If not, fall back to 32.
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const unsigned DstEltSize = EltTy.getSizeInBits();
1398           const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401           return std::make_pair(
1402               VecTypeIdx,
1403               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1430           // TODO: verify immediate offset here since lower only works with
1431           // whole elements.
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return BigTy.isVector();
1434         })
1435       // FIXME: Multiples of 16 should not be legal.
1436       .legalIf([=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (BigTy.getSizeInBits() % 32 == 0) &&
1440                  (LitTy.getSizeInBits() % 16 == 0);
1441         })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
1456 
1457   }
1458 
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472 
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1476     BuildVector.minScalarOrElt(0, S32);
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480 
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485 
1486   BuildVector.legalIf(isRegisterType(0));
1487 
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496   // pre-legalize.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520 
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525           const LLT BigTy = Query.Types[BigTyIdx];
1526           return BigTy.getSizeInBits() == 32;
1527         })
1528       // Try to widen to s16 first for small types.
1529       // TODO: Only do this on targets with legal s16 shifts
1530       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534                            elementTypeIs(1, S16)),
1535                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1539       .clampScalar(LitTyIdx, S32, S512)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541       // Break up vectors with weird elements into scalars
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544         scalarize(0))
1545       .fewerElementsIf(
1546         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547         scalarize(1))
1548       .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550     if (Op == G_MERGE_VALUES) {
1551       Builder.widenScalarIf(
1552         // TODO: Use 16-bit shifts if legal for 8-bit values?
1553         [=](const LegalityQuery &Query) {
1554           const LLT Ty = Query.Types[LitTyIdx];
1555           return Ty.getSizeInBits() < 32;
1556         },
1557         changeTo(LitTyIdx, S32));
1558     }
1559 
1560     Builder.widenScalarIf(
1561       [=](const LegalityQuery &Query) {
1562         const LLT Ty = Query.Types[BigTyIdx];
1563         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564           Ty.getSizeInBits() % 16 != 0;
1565       },
1566       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
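        // e.g. s65 widens to s128, but s129 widens to s192 rather than s256.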
1569         const LLT &Ty = Query.Types[BigTyIdx];
1570         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571         if (NewSizeInBits >= 256) {
1572           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573           if (RoundedTo < NewSizeInBits)
1574             NewSizeInBits = RoundedTo;
1575         }
1576         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577       })
1578       // Any vectors left are the wrong size. Scalarize them.
1579       .scalarize(0)
1580       .scalarize(1);
1581   }
1582 
1583   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584   // RegBankSelect.
1585   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586     .legalFor({{S32}, {S64}});
1587 
1588   if (ST.hasVOP3PInsts()) {
1589     SextInReg.lowerFor({{V2S16}})
1590       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591       // get more vector shift opportunities, since we'll get those when
1592       // expanded.
1593       .clampMaxNumElementsStrict(0, S16, 2);
1594   } else if (ST.has16BitInsts()) {
1595     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596   } else {
1597     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1599     SextInReg.lowerFor({{S32}, {S64}});
1600   }
1601 
1602   SextInReg
1603     .scalarize(0)
1604     .clampScalar(0, S32, S64)
1605     .lower();
1606 
1607   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608     .scalarize(0)
1609     .lower();
1610 
  // TODO: Only try to form v2s16 with legal packed instructions.
1612   getActionDefinitionsBuilder(G_FSHR)
1613     .legalFor({{S32, S32}})
1614     .lowerFor({{V2S16, V2S16}})
1615     .clampMaxNumElementsStrict(0, S16, 2)
1616     .scalarize(0)
1617     .lower();
1618 
1619   if (ST.hasVOP3PInsts()) {
1620     getActionDefinitionsBuilder(G_FSHL)
1621       .lowerFor({{V2S16, V2S16}})
1622       .clampMaxNumElementsStrict(0, S16, 2)
1623       .scalarize(0)
1624       .lower();
1625   } else {
1626     getActionDefinitionsBuilder(G_FSHL)
1627       .scalarize(0)
1628       .lower();
1629   }
1630 
1631   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632     .legalFor({S64});
1633 
1634   getActionDefinitionsBuilder(G_FENCE)
1635     .alwaysLegal();
1636 
1637   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638       .scalarize(0)
1639       .minScalar(0, S32)
1640       .lower();
1641 
1642   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643       .legalFor({{S32, S32}, {S64, S32}})
1644       .clampScalar(1, S32, S32)
1645       .clampScalar(0, S32, S64)
1646       .widenScalarToNextPow2(0)
1647       .scalarize(0);
1648 
1649   getActionDefinitionsBuilder({
1650       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651       G_FCOPYSIGN,
1652 
1653       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654       G_ATOMICRMW_NAND,
1655       G_ATOMICRMW_FSUB,
1656       G_READ_REGISTER,
1657       G_WRITE_REGISTER,
1658 
1659       G_SADDO, G_SSUBO,
1660 
      // TODO: Implement
1662       G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665       .lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   case TargetOpcode::G_CTLZ:
1747   case TargetOpcode::G_CTTZ:
1748     return legalizeCTLZ_CTTZ(MI, MRI, B);
1749   default:
1750     return false;
1751   }
1752 
1753   llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757   unsigned AS,
1758   MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762   const LLT S32 = LLT::scalar(32);
1763 
1764   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766   if (ST.hasApertureRegs()) {
1767     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768     // getreg.
1769     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775     unsigned Encoding =
1776         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782     B.buildInstr(AMDGPU::S_GETREG_B32)
1783       .addDef(GetReg)
1784       .addImm(Encoding);
1785     MRI.setType(GetReg, S32);
1786 
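    // The aperture is a 16-bit field read by s_getreg; shift it into the
    // upper half of the 32-bit result, which becomes the high half of the
    // 64-bit flat address.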
1787     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789   }
1790 
1791   Register QueuePtr = MRI.createGenericVirtualRegister(
1792     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795     return Register();
1796 
1797   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798   // private_segment_aperture_base_hi.
1799   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
1801   // TODO: can we be smarter about machine pointer info?
1802   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803   MachineMemOperand *MMO = MF.getMachineMemOperand(
1804       PtrInfo,
1805       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806           MachineMemOperand::MOInvariant,
1807       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809   Register LoadAddr;
1810 
1811   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 /// Return true if the value is a known valid address, such that a null check is
1816 /// not necessary.
1817 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1818                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1819   MachineInstr *Def = MRI.getVRegDef(Val);
1820   switch (Def->getOpcode()) {
1821   case AMDGPU::G_FRAME_INDEX:
1822   case AMDGPU::G_GLOBAL_VALUE:
1823   case AMDGPU::G_BLOCK_ADDR:
1824     return true;
1825   case AMDGPU::G_CONSTANT: {
1826     const ConstantInt *CI = Def->getOperand(1).getCImm();
1827     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1828   }
1829   default:
1830     return false;
1831   }
1832 
1833   return false;
1834 }
1835 
1836 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1837   MachineInstr &MI, MachineRegisterInfo &MRI,
1838   MachineIRBuilder &B) const {
1839   MachineFunction &MF = B.getMF();
1840 
1841   const LLT S32 = LLT::scalar(32);
1842   Register Dst = MI.getOperand(0).getReg();
1843   Register Src = MI.getOperand(1).getReg();
1844 
1845   LLT DstTy = MRI.getType(Dst);
1846   LLT SrcTy = MRI.getType(Src);
1847   unsigned DestAS = DstTy.getAddressSpace();
1848   unsigned SrcAS = SrcTy.getAddressSpace();
1849 
1850   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1851   // vector element.
1852   assert(!DstTy.isVector());
1853 
1854   const AMDGPUTargetMachine &TM
1855     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1856 
1857   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1858     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1859     return true;
1860   }
1861 
1862   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863     // Truncate.
1864     B.buildExtract(Dst, Src, 0);
1865     MI.eraseFromParent();
1866     return true;
1867   }
1868 
1869   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1870     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1871     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1872 
1873     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1874     // another. Merge operands are required to be the same type, but creating an
1875     // extra ptrtoint would be kind of pointless.
1876     auto HighAddr = B.buildConstant(
1877       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1878     B.buildMerge(Dst, {Src, HighAddr});
1879     MI.eraseFromParent();
1880     return true;
1881   }
1882 
1883   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1884     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1885            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1886 
1887     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888       // Extract low 32-bits of the pointer.
1889       B.buildExtract(Dst, Src, 0);
1890       MI.eraseFromParent();
1891       return true;
1892     }
1893 
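    // A possibly-null flat pointer: keep the low 32 bits of the source
    // unless the source is the flat null value, which must map to the
    // segment's null value.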
1894     unsigned NullVal = TM.getNullPointerValue(DestAS);
1895 
1896     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897     auto FlatNull = B.buildConstant(SrcTy, 0);
1898 
1899     // Extract low 32-bits of the pointer.
1900     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901 
1902     auto CmpRes =
1903         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905 
1906     MI.eraseFromParent();
1907     return true;
1908   }
1909 
1910   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1911     return false;
1912 
1913   if (!ST.hasFlatAddressSpace())
1914     return false;
1915 
1916   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917   if (!ApertureReg.isValid())
1918     return false;
1919 
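  // local/private -> flat: the 32-bit segment offset becomes the low half
  // and the segment aperture supplies the high half. A null segment pointer
  // must still map to the flat null value, hence the select below unless the
  // source is known non-null.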
1920   // Coerce the type of the low half of the result so we can use merge_values.
1921   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922 
1923   // TODO: Should we allow mismatched types but matching sizes in merges to
1924   // avoid the ptrtoint?
1925   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926 
1927   if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928     B.buildCopy(Dst, BuildPtr);
1929     MI.eraseFromParent();
1930     return true;
1931   }
1932 
1933   auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934   auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935 
1936   auto CmpRes =
1937       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1938 
1939   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940 
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeFrint(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   Register Src = MI.getOperand(1).getReg();
1949   LLT Ty = MRI.getType(Src);
1950   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1951 
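  // Lower f64 G_FRINT with the 2^52 add/subtract trick:
  //   tmp = (src + copysign(2^52, src)) - copysign(2^52, src)
  //   result = |src| > 0x1.fffffffffffffp+51 ? src : tmp
  // Adding 2^52 pushes the fractional bits out of the mantissa so they are
  // rounded away; the select keeps values already too large to have a
  // fractional part.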
1952   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1953   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1954 
1955   auto C1 = B.buildFConstant(Ty, C1Val);
1956   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1957 
1958   // TODO: Should this propagate fast-math-flags?
1959   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1960   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1961 
1962   auto C2 = B.buildFConstant(Ty, C2Val);
1963   auto Fabs = B.buildFAbs(Ty, Src);
1964 
1965   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1966   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1967   MI.eraseFromParent();
1968   return true;
1969 }
1970 
1971 bool AMDGPULegalizerInfo::legalizeFceil(
1972   MachineInstr &MI, MachineRegisterInfo &MRI,
1973   MachineIRBuilder &B) const {
1974 
1975   const LLT S1 = LLT::scalar(1);
1976   const LLT S64 = LLT::scalar(64);
1977 
1978   Register Src = MI.getOperand(1).getReg();
1979   assert(MRI.getType(Src) == S64);
1980 
1981   // result = trunc(src)
1982   // if (src > 0.0 && src != result)
1983   //   result += 1.0
1984 
1985   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1986 
1987   const auto Zero = B.buildFConstant(S64, 0.0);
1988   const auto One = B.buildFConstant(S64, 1.0);
1989   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1990   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1991   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1992   auto Add = B.buildSelect(S64, And, One, Zero);
1993 
1994   // TODO: Should this propagate fast-math-flags?
1995   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1996   MI.eraseFromParent();
1997   return true;
1998 }
1999 
2000 bool AMDGPULegalizerInfo::legalizeFrem(
2001   MachineInstr &MI, MachineRegisterInfo &MRI,
2002   MachineIRBuilder &B) const {
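  // frem(x, y) == x - trunc(x / y) * y, folded into a single fma:
  //   fma(-trunc(x / y), y, x)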
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
2015 }
2016 
2017 static MachineInstrBuilder extractF64Exponent(Register Hi,
2018                                               MachineIRBuilder &B) {
2019   const unsigned FractBits = 52;
2020   const unsigned ExpBits = 11;
2021   LLT S32 = LLT::scalar(32);
2022 
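  // The f64 exponent field occupies bits [62:52] of the value, i.e. bits
  // [30:20] of the high word. Extract the 11-bit field with ubfe and
  // subtract the bias (1023).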
2023   auto Const0 = B.buildConstant(S32, FractBits - 32);
2024   auto Const1 = B.buildConstant(S32, ExpBits);
2025 
2026   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2027     .addUse(Hi)
2028     .addUse(Const0.getReg(0))
2029     .addUse(Const1.getReg(0));
2030 
2031   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2032 }
2033 
2034 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2035   MachineInstr &MI, MachineRegisterInfo &MRI,
2036   MachineIRBuilder &B) const {
2037   const LLT S1 = LLT::scalar(1);
2038   const LLT S32 = LLT::scalar(32);
2039   const LLT S64 = LLT::scalar(64);
2040 
2041   Register Src = MI.getOperand(1).getReg();
2042   assert(MRI.getType(Src) == S64);
2043 
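  // f64 trunc is implemented by masking off the fraction bits that fall
  // below the binary point for the value's exponent. An exponent below 0
  // means |src| < 1, so the result is just the sign (+/-0.0); an exponent
  // above 51 means the value is already an integer.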
2044   // TODO: Should this use extract since the low half is unused?
2045   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2046   Register Hi = Unmerge.getReg(1);
2047 
2048   // Extract the upper half, since this is where we will find the sign and
2049   // exponent.
2050   auto Exp = extractF64Exponent(Hi, B);
2051 
2052   const unsigned FractBits = 52;
2053 
2054   // Extract the sign bit.
2055   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2056   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2057 
2058   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2059 
2060   const auto Zero32 = B.buildConstant(S32, 0);
2061 
2062   // Extend back to 64-bits.
2063   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2064 
2065   auto Shr = B.buildAShr(S64, FractMask, Exp);
2066   auto Not = B.buildNot(S64, Shr);
2067   auto Tmp0 = B.buildAnd(S64, Src, Not);
2068   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2069 
2070   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2071   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2072 
2073   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2074   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2075   MI.eraseFromParent();
2076   return true;
2077 }
2078 
2079 bool AMDGPULegalizerInfo::legalizeITOFP(
2080   MachineInstr &MI, MachineRegisterInfo &MRI,
2081   MachineIRBuilder &B, bool Signed) const {
2082 
2083   Register Dst = MI.getOperand(0).getReg();
2084   Register Src = MI.getOperand(1).getReg();
2085 
2086   const LLT S64 = LLT::scalar(64);
2087   const LLT S32 = LLT::scalar(32);
2088 
2089   assert(MRI.getType(Src) == S64);
2090 
2091   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2092   auto ThirtyTwo = B.buildConstant(S32, 32);
2093 
2094   if (MRI.getType(Dst) == S64) {
2095     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2096                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2097 
2098     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2099     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2100                      .addUse(CvtHi.getReg(0))
2101                      .addUse(ThirtyTwo.getReg(0));
2102 
2103     // TODO: Should this propagate fast-math-flags?
2104     B.buildFAdd(Dst, LdExp, CvtLo);
2105     MI.eraseFromParent();
2106     return true;
2107   }
2108 
2109   assert(MRI.getType(Dst) == S32);
2110 
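  // s64 -> f32: shift the value left so its significant bits land in the
  // high 32 bits, OR a sticky bit into the result if any set bits remain in
  // the low half (to keep rounding correct), convert that 32-bit value, and
  // rescale the result with ldexp(fval, 32 - shift).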
2111   auto One = B.buildConstant(S32, 1);
2112 
2113   MachineInstrBuilder ShAmt;
2114   if (Signed) {
2115     auto ThirtyOne = B.buildConstant(S32, 31);
2116     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2117     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2118     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2119     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2120                                /*HasSideEffects=*/false)
2121                   .addUse(Unmerge.getReg(1));
2122     auto LS2 = B.buildSub(S32, LS, One);
2123     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2124   } else
2125     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2126   auto Norm = B.buildShl(S64, Src, ShAmt);
2127   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2128   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2129   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2130   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2131   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2132   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2133                    /*HasSideEffects=*/false)
2134       .addUse(FVal.getReg(0))
2135       .addUse(Scale.getReg(0));
2136   MI.eraseFromParent();
2137   return true;
2138 }
2139 
2140 // TODO: Copied from DAG implementation. Verify logic and document how this
2141 // actually works.
2142 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2143                                         MachineRegisterInfo &MRI,
2144                                         MachineIRBuilder &B,
2145                                         bool Signed) const {
2146 
2147   Register Dst = MI.getOperand(0).getReg();
2148   Register Src = MI.getOperand(1).getReg();
2149 
2150   const LLT S64 = LLT::scalar(64);
2151   const LLT S32 = LLT::scalar(32);
2152 
2153   const LLT SrcLT = MRI.getType(Src);
2154   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2155 
2156   unsigned Flags = MI.getFlags();
2157 
2158   // The basic idea of converting a floating point number into a pair of 32-bit
2159   // integers is illustrated as follows:
2160   //
2161   //     tf := trunc(val);
2162   //    hif := floor(tf * 2^-32);
2163   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2164   //     hi := fptoi(hif);
2165   //     lo := fptoi(lof);
2166   //
2167   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2168   MachineInstrBuilder Sign;
2169   if (Signed && SrcLT == S32) {
2170     // However, a 32-bit floating point number has only 23 bits mantissa and
2171     // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
2173     // value after truncating and flip the result back based on the original
2174     // signedness.
2175     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2176     Trunc = B.buildFAbs(S32, Trunc, Flags);
2177   }
2178   MachineInstrBuilder K0, K1;
2179   if (SrcLT == S64) {
2180     K0 = B.buildFConstant(S64,
2181                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2182     K1 = B.buildFConstant(S64,
2183                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2184   } else {
2185     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2186     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2187   }
2188 
2189   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2190   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2191   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2192 
2193   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2194                                      : B.buildFPTOUI(S32, FloorMul);
2195   auto Lo = B.buildFPTOUI(S32, Fma);
2196 
2197   if (Signed && SrcLT == S32) {
2198     // Flip the result based on the signedness, which is either all 0s or 1s.
2199     Sign = B.buildMerge(S64, {Sign, Sign});
2200     // r := xor({lo, hi}, sign) - sign;
2201     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2202   } else
2203     B.buildMerge(Dst, {Lo, Hi});
2204   MI.eraseFromParent();
2205 
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2210                                                MachineInstr &MI) const {
2211   MachineFunction &MF = Helper.MIRBuilder.getMF();
2212   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2213 
2214   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2215                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2216 
2217   // With ieee_mode disabled, the instructions have the correct behavior
2218   // already for G_FMINNUM/G_FMAXNUM
2219   if (!MFI->getMode().IEEE)
2220     return !IsIEEEOp;
2221 
2222   if (IsIEEEOp)
2223     return true;
2224 
2225   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2226 }
2227 
2228 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2229   MachineInstr &MI, MachineRegisterInfo &MRI,
2230   MachineIRBuilder &B) const {
2231   // TODO: Should move some of this into LegalizerHelper.
2232 
2233   // TODO: Promote dynamic indexing of s16 to s32
2234 
2235   // FIXME: Artifact combiner probably should have replaced the truncated
2236   // constant before this, so we shouldn't need
2237   // getIConstantVRegValWithLookThrough.
2238   Optional<ValueAndVReg> MaybeIdxVal =
2239       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2240   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2241     return true;
2242   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2243 
2244   Register Dst = MI.getOperand(0).getReg();
2245   Register Vec = MI.getOperand(1).getReg();
2246 
2247   LLT VecTy = MRI.getType(Vec);
2248   LLT EltTy = VecTy.getElementType();
2249   assert(EltTy == MRI.getType(Dst));
2250 
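  // A constant in-bounds index just unmerges the vector and copies the
  // selected element; an out-of-bounds constant index produces undef.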
2251   if (IdxVal < VecTy.getNumElements()) {
2252     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2253     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2254   } else {
2255     B.buildUndef(Dst);
2256   }
2257 
2258   MI.eraseFromParent();
2259   return true;
2260 }
2261 
2262 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2263   MachineInstr &MI, MachineRegisterInfo &MRI,
2264   MachineIRBuilder &B) const {
2265   // TODO: Should move some of this into LegalizerHelper.
2266 
2267   // TODO: Promote dynamic indexing of s16 to s32
2268 
2269   // FIXME: Artifact combiner probably should have replaced the truncated
2270   // constant before this, so we shouldn't need
2271   // getIConstantVRegValWithLookThrough.
2272   Optional<ValueAndVReg> MaybeIdxVal =
2273       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2274   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2275     return true;
2276 
2277   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2278   Register Dst = MI.getOperand(0).getReg();
2279   Register Vec = MI.getOperand(1).getReg();
2280   Register Ins = MI.getOperand(2).getReg();
2281 
2282   LLT VecTy = MRI.getType(Vec);
2283   LLT EltTy = VecTy.getElementType();
2284   assert(EltTy == MRI.getType(Ins));
2285   (void)Ins;
2286 
2287   unsigned NumElts = VecTy.getNumElements();
2288   if (IdxVal < NumElts) {
2289     SmallVector<Register, 8> SrcRegs;
2290     for (unsigned i = 0; i < NumElts; ++i)
2291       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2292     B.buildUnmerge(SrcRegs, Vec);
2293 
2294     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2295     B.buildMerge(Dst, SrcRegs);
2296   } else {
2297     B.buildUndef(Dst);
2298   }
2299 
2300   MI.eraseFromParent();
2301   return true;
2302 }
2303 
2304 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2305   MachineInstr &MI, MachineRegisterInfo &MRI,
2306   MachineIRBuilder &B) const {
2307   const LLT V2S16 = LLT::fixed_vector(2, 16);
2308 
2309   Register Dst = MI.getOperand(0).getReg();
2310   Register Src0 = MI.getOperand(1).getReg();
2311   LLT DstTy = MRI.getType(Dst);
2312   LLT SrcTy = MRI.getType(Src0);
2313 
2314   if (SrcTy == V2S16 && DstTy == V2S16 &&
2315       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2316     return true;
2317 
2318   MachineIRBuilder HelperBuilder(MI);
2319   GISelObserverWrapper DummyObserver;
2320   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2321   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::legalizeSinCos(
2325   MachineInstr &MI, MachineRegisterInfo &MRI,
2326   MachineIRBuilder &B) const {
2327 
2328   Register DstReg = MI.getOperand(0).getReg();
2329   Register SrcReg = MI.getOperand(1).getReg();
2330   LLT Ty = MRI.getType(DstReg);
2331   unsigned Flags = MI.getFlags();
2332 
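  // The hardware sin/cos take an input pre-scaled by 1/(2*pi). Subtargets
  // with only a reduced input range also need an explicit fract of the
  // scaled value.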
2333   Register TrigVal;
2334   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2335   if (ST.hasTrigReducedRange()) {
2336     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2337     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2338       .addUse(MulVal.getReg(0))
2339       .setMIFlags(Flags).getReg(0);
2340   } else
2341     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2342 
2343   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2344     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2345   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2346     .addUse(TrigVal)
2347     .setMIFlags(Flags);
2348   MI.eraseFromParent();
2349   return true;
2350 }
2351 
2352 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2353                                                   MachineIRBuilder &B,
2354                                                   const GlobalValue *GV,
2355                                                   int64_t Offset,
2356                                                   unsigned GAFlags) const {
2357   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2358   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2359   // to the following code sequence:
2360   //
2361   // For constant address space:
2362   //   s_getpc_b64 s[0:1]
2363   //   s_add_u32 s0, s0, $symbol
2364   //   s_addc_u32 s1, s1, 0
2365   //
2366   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2367   //   a fixup or relocation is emitted to replace $symbol with a literal
2368   //   constant, which is a pc-relative offset from the encoding of the $symbol
2369   //   operand to the global variable.
2370   //
2371   // For global address space:
2372   //   s_getpc_b64 s[0:1]
2373   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2374   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2375   //
2376   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2377   //   fixups or relocations are emitted to replace $symbol@*@lo and
2378   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2379   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2380   //   operand to the global variable.
2381   //
2382   // What we want here is an offset from the value returned by s_getpc
2383   // (which is the address of the s_add_u32 instruction) to the global
2384   // variable, but since the encoding of $symbol starts 4 bytes after the start
2385   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2386   // small. This requires us to add 4 to the global variable offset in order to
2387   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2388   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2389   // instruction.
2390 
2391   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2392 
2393   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2394     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2395 
2396   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2397     .addDef(PCReg);
2398 
2399   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2400   if (GAFlags == SIInstrInfo::MO_NONE)
2401     MIB.addImm(0);
2402   else
2403     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2404 
2405   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2406 
2407   if (PtrTy.getSizeInBits() == 32)
2408     B.buildExtract(DstReg, PCReg, 0);
2409   return true;
}
2411 
2412 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2413   MachineInstr &MI, MachineRegisterInfo &MRI,
2414   MachineIRBuilder &B) const {
2415   Register DstReg = MI.getOperand(0).getReg();
2416   LLT Ty = MRI.getType(DstReg);
2417   unsigned AS = Ty.getAddressSpace();
2418 
2419   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2420   MachineFunction &MF = B.getMF();
2421   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2422 
2423   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2424     if (!MFI->isModuleEntryFunction() &&
2425         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2426       const Function &Fn = MF.getFunction();
2427       DiagnosticInfoUnsupported BadLDSDecl(
2428         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2429         DS_Warning);
2430       Fn.getContext().diagnose(BadLDSDecl);
2431 
2432       // We currently don't have a way to correctly allocate LDS objects that
2433       // aren't directly associated with a kernel. We do force inlining of
2434       // functions that use local objects. However, if these dead functions are
2435       // not eliminated, we don't want a compile time error. Just emit a warning
2436       // and a trap, since there should be no callable path here.
2437       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2438       B.buildUndef(DstReg);
2439       MI.eraseFromParent();
2440       return true;
2441     }
2442 
2443     // TODO: We could emit code to handle the initialization somewhere.
2444     // We ignore the initializer for now and legalize it to allow selection.
    // Any initializer is rejected during assembly emission anyway.
2446     const SITargetLowering *TLI = ST.getTargetLowering();
2447     if (!TLI->shouldUseLDSConstAddress(GV)) {
2448       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2449       return true; // Leave in place;
2450     }
2451 
2452     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2453       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
      // zero-sized type in other languages) to declare dynamic shared memory
      // whose size is not known at compile time. These variables are
      // allocated by the runtime directly after the statically allocated
      // ones, so they all share the same offset.
2459       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2460         // Adjust alignment for that dynamic shared memory array.
2461         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2462         LLT S32 = LLT::scalar(32);
2463         auto Sz =
2464             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2465         B.buildIntToPtr(DstReg, Sz);
2466         MI.eraseFromParent();
2467         return true;
2468       }
2469     }
2470 
2471     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2472                                                    *cast<GlobalVariable>(GV)));
2473     MI.eraseFromParent();
2474     return true;
2475   }
2476 
2477   const SITargetLowering *TLI = ST.getTargetLowering();
2478 
2479   if (TLI->shouldEmitFixup(GV)) {
2480     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2481     MI.eraseFromParent();
2482     return true;
2483   }
2484 
2485   if (TLI->shouldEmitPCReloc(GV)) {
2486     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2487     MI.eraseFromParent();
2488     return true;
2489   }
2490 
2491   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2492   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2493 
2494   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2495   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2496       MachinePointerInfo::getGOT(MF),
2497       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2498           MachineMemOperand::MOInvariant,
2499       LoadTy, Align(8));
2500 
2501   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2502 
2503   if (Ty.getSizeInBits() == 32) {
2504     // Truncate if this is a 32-bit constant address.
2505     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2506     B.buildExtract(DstReg, Load, 0);
2507   } else
2508     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2509 
2510   MI.eraseFromParent();
2511   return true;
2512 }
2513 
2514 static LLT widenToNextPowerOf2(LLT Ty) {
2515   if (Ty.isVector())
2516     return Ty.changeElementCount(
2517         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2518   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2519 }
2520 
2521 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2522                                        MachineInstr &MI) const {
2523   MachineIRBuilder &B = Helper.MIRBuilder;
2524   MachineRegisterInfo &MRI = *B.getMRI();
2525   GISelChangeObserver &Observer = Helper.Observer;
2526 
2527   Register PtrReg = MI.getOperand(1).getReg();
2528   LLT PtrTy = MRI.getType(PtrReg);
2529   unsigned AddrSpace = PtrTy.getAddressSpace();
2530 
2531   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2532     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2533     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2534     Observer.changingInstr(MI);
2535     MI.getOperand(1).setReg(Cast.getReg(0));
2536     Observer.changedInstr(MI);
2537     return true;
2538   }
2539 
2540   if (MI.getOpcode() != AMDGPU::G_LOAD)
2541     return false;
2542 
2543   Register ValReg = MI.getOperand(0).getReg();
2544   LLT ValTy = MRI.getType(ValReg);
2545 
2546   MachineMemOperand *MMO = *MI.memoperands_begin();
2547   const unsigned ValSize = ValTy.getSizeInBits();
2548   const LLT MemTy = MMO->getMemoryType();
2549   const Align MemAlign = MMO->getAlign();
2550   const unsigned MemSize = MemTy.getSizeInBits();
2551   const uint64_t AlignInBits = 8 * MemAlign.value();
2552 
2553   // Widen non-power-of-2 loads to the alignment if needed
2554   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2555     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2556 
2557     // This was already the correct extending load result type, so just adjust
2558     // the memory type.
2559     if (WideMemSize == ValSize) {
2560       MachineFunction &MF = B.getMF();
2561 
2562       MachineMemOperand *WideMMO =
2563           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2564       Observer.changingInstr(MI);
2565       MI.setMemRefs(MF, {WideMMO});
2566       Observer.changedInstr(MI);
2567       return true;
2568     }
2569 
    // Don't bother handling an edge case that should probably never arise.
2571     if (ValSize > WideMemSize)
2572       return false;
2573 
2574     LLT WideTy = widenToNextPowerOf2(ValTy);
2575 
2576     Register WideLoad;
2577     if (!WideTy.isVector()) {
2578       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2579       B.buildTrunc(ValReg, WideLoad).getReg(0);
2580     } else {
2581       // Extract the subvector.
2582 
2583       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2585         // (e.g. <3 x s32> -> <4 x s32>)
2586         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2587         B.buildExtract(ValReg, WideLoad, 0);
2588       } else {
2589         // For cases where the widened type isn't a nice register value, unmerge
2590         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2591         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2592         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2593       }
2594     }
2595 
2596     MI.eraseFromParent();
2597     return true;
2598   }
2599 
2600   return false;
2601 }
2602 
2603 bool AMDGPULegalizerInfo::legalizeFMad(
2604   MachineInstr &MI, MachineRegisterInfo &MRI,
2605   MachineIRBuilder &B) const {
2606   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2607   assert(Ty.isScalar());
2608 
2609   MachineFunction &MF = B.getMF();
2610   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2611 
2612   // TODO: Always legal with future ftz flag.
2613   // FIXME: Do we need just output?
2614   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2615     return true;
2616   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2617     return true;
2618 
2619   MachineIRBuilder HelperBuilder(MI);
2620   GISelObserverWrapper DummyObserver;
2621   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2622   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2626   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627   Register DstReg = MI.getOperand(0).getReg();
2628   Register PtrReg = MI.getOperand(1).getReg();
2629   Register CmpVal = MI.getOperand(2).getReg();
2630   Register NewVal = MI.getOperand(3).getReg();
2631 
2632   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2633          "this should not have been custom lowered");
2634 
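  // The target G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the
  // compare value packed into a single vector operand, with the new value
  // in element 0.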
2635   LLT ValTy = MRI.getType(CmpVal);
2636   LLT VecTy = LLT::fixed_vector(2, ValTy);
2637 
2638   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2639 
2640   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2641     .addDef(DstReg)
2642     .addUse(PtrReg)
2643     .addUse(PackedVal)
2644     .setMemRefs(MI.memoperands());
2645 
2646   MI.eraseFromParent();
2647   return true;
2648 }
2649 
2650 bool AMDGPULegalizerInfo::legalizeFlog(
2651   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2652   Register Dst = MI.getOperand(0).getReg();
2653   Register Src = MI.getOperand(1).getReg();
2654   LLT Ty = B.getMRI()->getType(Dst);
2655   unsigned Flags = MI.getFlags();
2656 
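  // log_b(x) == log2(x) * (1 / log2(b)); callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted.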
2657   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2658   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2659 
2660   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2661   MI.eraseFromParent();
2662   return true;
2663 }
2664 
2665 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2666                                        MachineIRBuilder &B) const {
2667   Register Dst = MI.getOperand(0).getReg();
2668   Register Src = MI.getOperand(1).getReg();
2669   unsigned Flags = MI.getFlags();
2670   LLT Ty = B.getMRI()->getType(Dst);
2671 
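  // exp(x) == exp2(x * log2(e)).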
2672   auto K = B.buildFConstant(Ty, numbers::log2e);
2673   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2674   B.buildFExp2(Dst, Mul, Flags);
2675   MI.eraseFromParent();
2676   return true;
2677 }
2678 
2679 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2680                                        MachineIRBuilder &B) const {
2681   Register Dst = MI.getOperand(0).getReg();
2682   Register Src0 = MI.getOperand(1).getReg();
2683   Register Src1 = MI.getOperand(2).getReg();
2684   unsigned Flags = MI.getFlags();
2685   LLT Ty = B.getMRI()->getType(Dst);
2686   const LLT S16 = LLT::scalar(16);
2687   const LLT S32 = LLT::scalar(32);
2688 
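  // pow(x, y) is expanded as exp2(y * log2(x)); the multiply uses
  // fmul_legacy, which returns 0 for 0 * anything (DX9-style rules).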
2689   if (Ty == S32) {
2690     auto Log = B.buildFLog2(S32, Src0, Flags);
2691     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2692       .addUse(Log.getReg(0))
2693       .addUse(Src1)
2694       .setMIFlags(Flags);
2695     B.buildFExp2(Dst, Mul, Flags);
2696   } else if (Ty == S16) {
2697     // There's no f16 fmul_legacy, so we need to convert for it.
2698     auto Log = B.buildFLog2(S16, Src0, Flags);
2699     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2700     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2701     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2702       .addUse(Ext0.getReg(0))
2703       .addUse(Ext1.getReg(0))
2704       .setMIFlags(Flags);
2705 
2706     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2707   } else
2708     return false;
2709 
2710   MI.eraseFromParent();
2711   return true;
2712 }
2713 
2714 // Find a source register, ignoring any possible source modifiers.
2715 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2716   Register ModSrc = OrigSrc;
2717   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2718     ModSrc = SrcFNeg->getOperand(1).getReg();
2719     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2720       ModSrc = SrcFAbs->getOperand(1).getReg();
2721   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2722     ModSrc = SrcFAbs->getOperand(1).getReg();
2723   return ModSrc;
2724 }
2725 
2726 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2727                                          MachineRegisterInfo &MRI,
2728                                          MachineIRBuilder &B) const {
2729 
2730   const LLT S1 = LLT::scalar(1);
2731   const LLT S64 = LLT::scalar(64);
2732   Register Dst = MI.getOperand(0).getReg();
2733   Register OrigSrc = MI.getOperand(1).getReg();
2734   unsigned Flags = MI.getFlags();
2735   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2736          "this should not have been custom lowered");
2737 
2738   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2739   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2740   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2741   // V_FRACT bug is:
2742   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2743   //
2744   // Convert floor(x) to (x - fract(x))
2745 
2746   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2747     .addUse(OrigSrc)
2748     .setMIFlags(Flags);
2749 
2750   // Give source modifier matching some assistance before obscuring a foldable
2751   // pattern.
2752 
2753   // TODO: We can avoid the neg on the fract? The input sign to fract
2754   // shouldn't matter?
2755   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2756 
2757   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2758 
2759   Register Min = MRI.createGenericVirtualRegister(S64);
2760 
2761   // We don't need to concern ourselves with the snan handling difference, so
2762   // use the one which will directly select.
2763   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2764   if (MFI->getMode().IEEE)
2765     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2766   else
2767     B.buildFMinNum(Min, Fract, Const, Flags);
2768 
2769   Register CorrectedFract = Min;
2770   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2771     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2772     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2773   }
2774 
2775   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2776   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2777 
2778   MI.eraseFromParent();
2779   return true;
2780 }
2781 
2782 // Turn an illegal packed v2s16 build vector into bit operations.
2783 // TODO: This should probably be a bitcast action in LegalizerHelper.
2784 bool AMDGPULegalizerInfo::legalizeBuildVector(
2785   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2786   Register Dst = MI.getOperand(0).getReg();
2787   const LLT S32 = LLT::scalar(32);
2788   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2789 
2790   Register Src0 = MI.getOperand(1).getReg();
2791   Register Src1 = MI.getOperand(2).getReg();
2792   assert(MRI.getType(Src0) == LLT::scalar(16));
2793 
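  //   %merge:s32 = G_MERGE_VALUES %src0:s16, %src1:s16
  //   %dst:v2s16 = G_BITCAST %merge
  // The merge itself is lowered to shift/or, which gives the bit operations
  // mentioned above.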
2794   auto Merge = B.buildMerge(S32, {Src0, Src1});
2795   B.buildBitcast(Dst, Merge);
2796 
2797   MI.eraseFromParent();
2798   return true;
2799 }
2800 
2801 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2802 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2803 // case with a single min instruction instead of a compare+select.
2804 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2805                                             MachineRegisterInfo &MRI,
2806                                             MachineIRBuilder &B) const {
2807   Register Dst = MI.getOperand(0).getReg();
2808   Register Src = MI.getOperand(1).getReg();
2809   LLT DstTy = MRI.getType(Dst);
2810   LLT SrcTy = MRI.getType(Src);
2811 
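  // e.g. ctlz(x:s32) becomes umin(ffbh(x), 32); ffbh/ffbl return -1 for a
  // zero input, which the umin clamps to the bit width.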
2812   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2813                         ? AMDGPU::G_AMDGPU_FFBH_U32
2814                         : AMDGPU::G_AMDGPU_FFBL_B32;
2815   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2816   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2817 
2818   MI.eraseFromParent();
2819   return true;
2820 }
2821 
2822 // Check that this is a G_XOR x, -1
2823 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2824   if (MI.getOpcode() != TargetOpcode::G_XOR)
2825     return false;
2826   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2827   return ConstVal && *ConstVal == -1;
2828 }
2829 
// Return the use branch instruction, or null if the usage is invalid.
2831 static MachineInstr *
2832 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2833                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2834   Register CondDef = MI.getOperand(0).getReg();
2835   if (!MRI.hasOneNonDBGUse(CondDef))
2836     return nullptr;
2837 
2838   MachineBasicBlock *Parent = MI.getParent();
2839   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2840 
2841   if (isNot(MRI, *UseMI)) {
2842     Register NegatedCond = UseMI->getOperand(0).getReg();
2843     if (!MRI.hasOneNonDBGUse(NegatedCond))
2844       return nullptr;
2845 
2846     // We're deleting the def of this value, so we need to remove it.
2847     eraseInstr(*UseMI, MRI);
2848 
2849     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2850     Negated = true;
2851   }
2852 
2853   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2854     return nullptr;
2855 
2856   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2857   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2858   if (Next == Parent->end()) {
2859     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2860     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2861       return nullptr;
2862     UncondBrTarget = &*NextMBB;
2863   } else {
2864     if (Next->getOpcode() != AMDGPU::G_BR)
2865       return nullptr;
2866     Br = &*Next;
2867     UncondBrTarget = Br->getOperand(0).getMBB();
2868   }
2869 
2870   return UseMI;
2871 }
2872 
2873 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2874                                          const ArgDescriptor *Arg,
2875                                          const TargetRegisterClass *ArgRC,
2876                                          LLT ArgTy) const {
2877   MCRegister SrcReg = Arg->getRegister();
2878   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2879   assert(DstReg.isVirtual() && "Virtual register expected");
2880 
2881   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2882                                              *ArgRC, B.getDebugLoc(), ArgTy);
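  // A masked argument occupies a bitfield of its input register (e.g. the
  // packed workitem IDs), so shift the field down to bit 0 and mask it to
  // its width.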
2883   if (Arg->isMasked()) {
2884     // TODO: Should we try to emit this once in the entry block?
2885     const LLT S32 = LLT::scalar(32);
2886     const unsigned Mask = Arg->getMask();
2887     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2888 
2889     Register AndMaskSrc = LiveIn;
2890 
2891     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2892     // 0.
2893     if (Shift != 0) {
2894       auto ShiftAmt = B.buildConstant(S32, Shift);
2895       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2896     }
2897 
2898     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2899   } else {
2900     B.buildCopy(DstReg, LiveIn);
2901   }
2902 
2903   return true;
2904 }
2905 
2906 bool AMDGPULegalizerInfo::loadInputValue(
2907     Register DstReg, MachineIRBuilder &B,
2908     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2909   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2910   const ArgDescriptor *Arg;
2911   const TargetRegisterClass *ArgRC;
2912   LLT ArgTy;
2913   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2914 
2915   if (!Arg) {
2916     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
2919       B.buildConstant(DstReg, 0);
2920       return true;
2921     }
2922 
2923     // It's undefined behavior if a function marked with the amdgpu-no-*
2924     // attributes uses the corresponding intrinsic.
2925     B.buildUndef(DstReg);
2926     return true;
2927   }
2928 
2929   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2930     return false; // TODO: Handle these
2931   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2932 }
2933 
2934 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2935     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2936     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2937   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2938     return false;
2939 
2940   MI.eraseFromParent();
2941   return true;
2942 }
2943 
2944 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2945                                        MachineRegisterInfo &MRI,
2946                                        MachineIRBuilder &B) const {
2947   Register Dst = MI.getOperand(0).getReg();
2948   LLT DstTy = MRI.getType(Dst);
2949   LLT S16 = LLT::scalar(16);
2950   LLT S32 = LLT::scalar(32);
2951   LLT S64 = LLT::scalar(64);
2952 
2953   if (DstTy == S16)
2954     return legalizeFDIV16(MI, MRI, B);
2955   if (DstTy == S32)
2956     return legalizeFDIV32(MI, MRI, B);
2957   if (DstTy == S64)
2958     return legalizeFDIV64(MI, MRI, B);
2959 
2960   return false;
2961 }
2962 
2963 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2964                                                         Register DstDivReg,
2965                                                         Register DstRemReg,
2966                                                         Register X,
2967                                                         Register Y) const {
2968   const LLT S1 = LLT::scalar(1);
2969   const LLT S32 = LLT::scalar(32);
2970 
2971   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2972   // algorithm used here.
2973 
2974   // Initial estimate of inv(y).
2975   auto FloatY = B.buildUITOFP(S32, Y);
2976   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
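  // 0x4f7ffffe is just below 2^32 as a float, so Z is roughly 2^32 / Y in
  // 32-bit fixed point, erring low so the correction steps below only need
  // to adjust upward.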
2977   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2978   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2979   auto Z = B.buildFPTOUI(S32, ScaledY);
2980 
2981   // One round of UNR.
2982   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2983   auto NegYZ = B.buildMul(S32, NegY, Z);
2984   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2985 
2986   // Quotient/remainder estimate.
2987   auto Q = B.buildUMulH(S32, X, Z);
2988   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2989 
2990   // First quotient/remainder refinement.
2991   auto One = B.buildConstant(S32, 1);
2992   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2993   if (DstDivReg)
2994     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2995   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2996 
2997   // Second quotient/remainder refinement.
2998   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2999   if (DstDivReg)
3000     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3001 
3002   if (DstRemReg)
3003     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3004 }
3005 
3006 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3007 //
3008 // Return lo, hi of result
3009 //
3010 // %cvt.lo = G_UITOFP Val.lo
3011 // %cvt.hi = G_UITOFP Val.hi
3012 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3013 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3014 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3015 // %mul2 = G_FMUL %mul1, 2**(-32)
3016 // %trunc = G_INTRINSIC_TRUNC %mul2
3017 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3018 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3019 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3020                                                        Register Val) {
3021   const LLT S32 = LLT::scalar(32);
3022   auto Unmerge = B.buildUnmerge(S32, Val);
3023 
3024   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3025   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3026 
3027   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3028                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3029 
3030   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
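  // 0x5f7ffffc is just below 2^64 as a float, so Mul1 ~= 2^64 / Val; its
  // high and low 32-bit halves are separated below.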
3031   auto Mul1 =
3032       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3033 
3034   // 2**(-32)
3035   auto Mul2 =
3036       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3037   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3038 
3039   // -(2**32)
3040   auto Mad2 = B.buildFMAD(S32, Trunc,
3041                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3042 
3043   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3044   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3045 
3046   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3047 }
3048 
3049 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3050                                                         Register DstDivReg,
3051                                                         Register DstRemReg,
3052                                                         Register Numer,
3053                                                         Register Denom) const {
3054   const LLT S32 = LLT::scalar(32);
3055   const LLT S64 = LLT::scalar(64);
3056   const LLT S1 = LLT::scalar(1);
3057   Register RcpLo, RcpHi;
3058 
3059   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3060 
3061   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
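  // Rcp now approximates 2^64 / Denom in 64-bit fixed point; the two rounds
  // of multiply/add below are Newton-Raphson style refinements of it.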
3062 
3063   auto Zero64 = B.buildConstant(S64, 0);
3064   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3065 
3066   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3067   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3068 
3069   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3070   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3071   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3072 
3073   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3074   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3075   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3076 
3077   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3078   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3079   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3080   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3081   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3082 
3083   auto Zero32 = B.buildConstant(S32, 0);
3084   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3085   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3086   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3087 
3088   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3089   Register NumerLo = UnmergeNumer.getReg(0);
3090   Register NumerHi = UnmergeNumer.getReg(1);
3091 
3092   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3093   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3094   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3095   Register Mul3_Lo = UnmergeMul3.getReg(0);
3096   Register Mul3_Hi = UnmergeMul3.getReg(1);
3097   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3098   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3099   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3100   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3101 
3102   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3103   Register DenomLo = UnmergeDenom.getReg(0);
3104   Register DenomHi = UnmergeDenom.getReg(1);
3105 
3106   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3107   auto C1 = B.buildSExt(S32, CmpHi);
3108 
3109   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3110   auto C2 = B.buildSExt(S32, CmpLo);
3111 
3112   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3113   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3114 
3115   // TODO: Here and below portions of the code can be enclosed into if/endif.
3116   // Currently control flow is unconditional and we have 4 selects after
3117   // potential endif to substitute PHIs.
3118 
3119   // if C3 != 0 ...
3120   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3121   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3122   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3123   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3124 
3125   auto One64 = B.buildConstant(S64, 1);
3126   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3127 
3128   auto C4 =
3129       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3130   auto C5 =
3131       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3132   auto C6 = B.buildSelect(
3133       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3134 
3135   // if (C6 != 0)
3136   auto Add4 = B.buildAdd(S64, Add3, One64);
3137   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3138 
3139   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3140   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3141   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3142 
3143   // endif C6
3144   // endif C3
3145 
3146   if (DstDivReg) {
3147     auto Sel1 = B.buildSelect(
3148         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3149     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3150                   Sel1, MulHi3);
3151   }
3152 
3153   if (DstRemReg) {
3154     auto Sel2 = B.buildSelect(
3155         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3156     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3157                   Sel2, Sub1);
3158   }
3159 }
3160 
3161 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3162                                                   MachineRegisterInfo &MRI,
3163                                                   MachineIRBuilder &B) const {
3164   Register DstDivReg, DstRemReg;
3165   switch (MI.getOpcode()) {
3166   default:
3167     llvm_unreachable("Unexpected opcode!");
3168   case AMDGPU::G_UDIV: {
3169     DstDivReg = MI.getOperand(0).getReg();
3170     break;
3171   }
3172   case AMDGPU::G_UREM: {
3173     DstRemReg = MI.getOperand(0).getReg();
3174     break;
3175   }
3176   case AMDGPU::G_UDIVREM: {
3177     DstDivReg = MI.getOperand(0).getReg();
3178     DstRemReg = MI.getOperand(1).getReg();
3179     break;
3180   }
3181   }
3182 
3183   const LLT S64 = LLT::scalar(64);
3184   const LLT S32 = LLT::scalar(32);
3185   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3186   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3187   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3188   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3189 
3190   if (Ty == S32)
3191     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3192   else if (Ty == S64)
3193     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3194   else
3195     return false;
3196 
3197   MI.eraseFromParent();
3198   return true;
3199 }
3200 
3201 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3202                                                 MachineRegisterInfo &MRI,
3203                                                 MachineIRBuilder &B) const {
3204   const LLT S64 = LLT::scalar(64);
3205   const LLT S32 = LLT::scalar(32);
3206 
3207   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3208   if (Ty != S32 && Ty != S64)
3209     return false;
3210 
3211   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3212   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3213   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3214 
3215   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3216   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3217   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
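  // abs(x) is computed as (x + sign) ^ sign with sign = x >> (bitwidth - 1),
  // i.e. 0 or -1. The unsigned div/rem below runs on the magnitudes and the
  // signs are reapplied afterwards.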
3218 
3219   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3220   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3221 
3222   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3223   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3224 
3225   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3226   switch (MI.getOpcode()) {
3227   default:
3228     llvm_unreachable("Unexpected opcode!");
3229   case AMDGPU::G_SDIV: {
3230     DstDivReg = MI.getOperand(0).getReg();
3231     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3232     break;
3233   }
3234   case AMDGPU::G_SREM: {
3235     DstRemReg = MI.getOperand(0).getReg();
3236     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3237     break;
3238   }
3239   case AMDGPU::G_SDIVREM: {
3240     DstDivReg = MI.getOperand(0).getReg();
3241     DstRemReg = MI.getOperand(1).getReg();
3242     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3243     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3244     break;
3245   }
3246   }
3247 
3248   if (Ty == S32)
3249     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3250   else
3251     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3252 
3253   if (DstDivReg) {
3254     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3255     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3256     B.buildSub(DstDivReg, SignXor, Sign);
3257   }
3258 
3259   if (DstRemReg) {
3260     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3261     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3262     B.buildSub(DstRemReg, SignXor, Sign);
3263   }
3264 
3265   MI.eraseFromParent();
3266   return true;
3267 }
3268 
3269 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3270                                                  MachineRegisterInfo &MRI,
3271                                                  MachineIRBuilder &B) const {
3272   Register Res = MI.getOperand(0).getReg();
3273   Register LHS = MI.getOperand(1).getReg();
3274   Register RHS = MI.getOperand(2).getReg();
3275   uint16_t Flags = MI.getFlags();
3276   LLT ResTy = MRI.getType(Res);
3277 
3278   const MachineFunction &MF = B.getMF();
3279   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3280                             MI.getFlag(MachineInstr::FmAfn);
3281 
3282   if (!AllowInaccurateRcp)
3283     return false;
3284 
3285   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3286     // 1 / x -> RCP(x)
3287     if (CLHS->isExactlyValue(1.0)) {
3288       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3289         .addUse(RHS)
3290         .setMIFlags(Flags);
3291 
3292       MI.eraseFromParent();
3293       return true;
3294     }
3295 
3296     // -1 / x -> RCP( FNEG(x) )
3297     if (CLHS->isExactlyValue(-1.0)) {
3298       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3299       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3300         .addUse(FNeg.getReg(0))
3301         .setMIFlags(Flags);
3302 
3303       MI.eraseFromParent();
3304       return true;
3305     }
3306   }
3307 
3308   // x / y -> x * (1.0 / y)
3309   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3310     .addUse(RHS)
3311     .setMIFlags(Flags);
3312   B.buildFMul(Res, LHS, RCP, Flags);
3313 
3314   MI.eraseFromParent();
3315   return true;
3316 }
3317 
3318 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3319                                                    MachineRegisterInfo &MRI,
3320                                                    MachineIRBuilder &B) const {
3321   Register Res = MI.getOperand(0).getReg();
3322   Register X = MI.getOperand(1).getReg();
3323   Register Y = MI.getOperand(2).getReg();
3324   uint16_t Flags = MI.getFlags();
3325   LLT ResTy = MRI.getType(Res);
3326 
3327   const MachineFunction &MF = B.getMF();
3328   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3329                             MI.getFlag(MachineInstr::FmAfn);
3330 
3331   if (!AllowInaccurateRcp)
3332     return false;
3333 
3334   auto NegY = B.buildFNeg(ResTy, Y);
3335   auto One = B.buildFConstant(ResTy, 1.0);
3336 
3337   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3338     .addUse(Y)
3339     .setMIFlags(Flags);
3340 
3341   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3342   R = B.buildFMA(ResTy, Tmp0, R, R);
3343 
3344   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3345   R = B.buildFMA(ResTy, Tmp1, R, R);
3346 
3347   auto Ret = B.buildFMul(ResTy, X, R);
3348   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3349 
3350   B.buildFMA(Res, Tmp2, R, Ret);
3351   MI.eraseFromParent();
3352   return true;
3353 }
3354 
3355 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3356                                          MachineRegisterInfo &MRI,
3357                                          MachineIRBuilder &B) const {
3358   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3359     return true;
3360 
3361   Register Res = MI.getOperand(0).getReg();
3362   Register LHS = MI.getOperand(1).getReg();
3363   Register RHS = MI.getOperand(2).getReg();
3364 
3365   uint16_t Flags = MI.getFlags();
3366 
3367   LLT S16 = LLT::scalar(16);
3368   LLT S32 = LLT::scalar(32);
3369 
3370   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3371   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3372 
3373   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3374     .addUse(RHSExt.getReg(0))
3375     .setMIFlags(Flags);
3376 
3377   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3378   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3379 
3380   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3381     .addUse(RDst.getReg(0))
3382     .addUse(RHS)
3383     .addUse(LHS)
3384     .setMIFlags(Flags);
3385 
3386   MI.eraseFromParent();
3387   return true;
3388 }
3389 
3390 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3391 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3392 static void toggleSPDenormMode(bool Enable,
3393                                MachineIRBuilder &B,
3394                                const GCNSubtarget &ST,
3395                                AMDGPU::SIModeRegisterDefaults Mode) {
3396   // Set SP denorm mode to this value.
3397   unsigned SPDenormMode =
3398     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3399 
3400   if (ST.hasDenormModeInst()) {
3401     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3402     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3403 
3404     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3405     B.buildInstr(AMDGPU::S_DENORM_MODE)
3406       .addImm(NewDenormModeValue);
3407 
3408   } else {
3409     // Select FP32 bit field in mode register.
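    // hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 covers the two FP32 denorm
    // control bits of the MODE register and leaves the FP64/FP16 bits alone.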
3410     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3411                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3412                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3413 
3414     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3415       .addImm(SPDenormMode)
3416       .addImm(SPDenormModeBitField);
3417   }
3418 }
3419 
3420 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3421                                          MachineRegisterInfo &MRI,
3422                                          MachineIRBuilder &B) const {
3423   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3424     return true;
3425 
3426   Register Res = MI.getOperand(0).getReg();
3427   Register LHS = MI.getOperand(1).getReg();
3428   Register RHS = MI.getOperand(2).getReg();
3429   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3430   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3431 
3432   uint16_t Flags = MI.getFlags();
3433 
3434   LLT S32 = LLT::scalar(32);
3435   LLT S1 = LLT::scalar(1);
3436 
3437   auto One = B.buildFConstant(S32, 1.0f);
3438 
3439   auto DenominatorScaled =
3440     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3441       .addUse(LHS)
3442       .addUse(RHS)
3443       .addImm(0)
3444       .setMIFlags(Flags);
3445   auto NumeratorScaled =
3446     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3447       .addUse(LHS)
3448       .addUse(RHS)
3449       .addImm(1)
3450       .setMIFlags(Flags);
3451 
3452   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3453     .addUse(DenominatorScaled.getReg(0))
3454     .setMIFlags(Flags);
3455   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
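  // The FMA chain below does one Newton-Raphson refinement of ApproxRcp
  // (Fma0/Fma1), forms the scaled quotient (Mul) and refines it through its
  // residuals (Fma2..Fma4); div_fmas applies the scale chosen by div_scale
  // and div_fixup handles the special cases.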
3456 
3457   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3458   // aren't modeled as reading it.
3459   if (!Mode.allFP32Denormals())
3460     toggleSPDenormMode(true, B, ST, Mode);
3461 
3462   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3463   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3464   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3465   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3466   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3467   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3468 
3469   if (!Mode.allFP32Denormals())
3470     toggleSPDenormMode(false, B, ST, Mode);
3471 
3472   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3473     .addUse(Fma4.getReg(0))
3474     .addUse(Fma1.getReg(0))
3475     .addUse(Fma3.getReg(0))
3476     .addUse(NumeratorScaled.getReg(1))
3477     .setMIFlags(Flags);
3478 
3479   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3480     .addUse(Fmas.getReg(0))
3481     .addUse(RHS)
3482     .addUse(LHS)
3483     .setMIFlags(Flags);
3484 
3485   MI.eraseFromParent();
3486   return true;
3487 }
3488 
3489 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3490                                          MachineRegisterInfo &MRI,
3491                                          MachineIRBuilder &B) const {
3492   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3493     return true;
3494 
3495   Register Res = MI.getOperand(0).getReg();
3496   Register LHS = MI.getOperand(1).getReg();
3497   Register RHS = MI.getOperand(2).getReg();
3498 
3499   uint16_t Flags = MI.getFlags();
3500 
3501   LLT S64 = LLT::scalar(64);
3502   LLT S1 = LLT::scalar(1);
3503 
3504   auto One = B.buildFConstant(S64, 1.0);
3505 
3506   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3507     .addUse(LHS)
3508     .addUse(RHS)
3509     .addImm(0)
3510     .setMIFlags(Flags);
3511 
3512   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3513 
3514   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3515     .addUse(DivScale0.getReg(0))
3516     .setMIFlags(Flags);
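  // As in the f32 path, but with two reciprocal refinement steps
  // (Fma0..Fma3); Mul is the scaled quotient and Fma4 its residual, combined
  // by div_fmas and finished by div_fixup.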
3517 
3518   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3519   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3520   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3521 
3522   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3523     .addUse(LHS)
3524     .addUse(RHS)
3525     .addImm(1)
3526     .setMIFlags(Flags);
3527 
3528   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3529   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3530   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3531 
3532   Register Scale;
3533   if (!ST.hasUsableDivScaleConditionOutput()) {
3534     // Workaround a hardware bug on SI where the condition output from div_scale
3535     // is not usable.
3536 
3537     LLT S32 = LLT::scalar(32);
3538 
3539     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3540     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3541     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3542     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3543 
3544     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3545                               Scale1Unmerge.getReg(1));
3546     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3547                               Scale0Unmerge.getReg(1));
3548     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3549   } else {
3550     Scale = DivScale1.getReg(1);
3551   }
3552 
3553   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3554     .addUse(Fma4.getReg(0))
3555     .addUse(Fma3.getReg(0))
3556     .addUse(Mul.getReg(0))
3557     .addUse(Scale)
3558     .setMIFlags(Flags);
3559 
3560   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3561     .addUse(Fmas.getReg(0))
3562     .addUse(RHS)
3563     .addUse(LHS)
3564     .setMIFlags(Flags);
3565 
3566   MI.eraseFromParent();
3567   return true;
3568 }
3569 
3570 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3571                                                  MachineRegisterInfo &MRI,
3572                                                  MachineIRBuilder &B) const {
3573   Register Res = MI.getOperand(0).getReg();
3574   Register LHS = MI.getOperand(2).getReg();
3575   Register RHS = MI.getOperand(3).getReg();
3576   uint16_t Flags = MI.getFlags();
3577 
3578   LLT S32 = LLT::scalar(32);
3579   LLT S1 = LLT::scalar(1);
3580 
3581   auto Abs = B.buildFAbs(S32, RHS, Flags);
3582   const APFloat C0Val(1.0f);
3583 
3584   auto C0 = B.buildConstant(S32, 0x6f800000);
3585   auto C1 = B.buildConstant(S32, 0x2f800000);
3586   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
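  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. For very large |RHS| the
  // denominator is pre-scaled by 2^-32 to keep its reciprocal well clear of
  // the denormal range; the final multiply by the same Sel factor cancels
  // the scaling, since Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS.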
3587 
3588   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3589   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3590 
3591   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3592 
3593   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3594     .addUse(Mul0.getReg(0))
3595     .setMIFlags(Flags);
3596 
3597   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3598 
3599   B.buildFMul(Res, Sel, Mul1, Flags);
3600 
3601   MI.eraseFromParent();
3602   return true;
3603 }
3604 
3605 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3606 // FIXME: Why do we handle this one but not other removed instructions?
3607 //
3608 // Reciprocal square root.  The clamp prevents infinite results, clamping
3609 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3610 // +-max_float.
3611 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3612                                                     MachineRegisterInfo &MRI,
3613                                                     MachineIRBuilder &B) const {
3614   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3615     return true;
3616 
3617   Register Dst = MI.getOperand(0).getReg();
3618   Register Src = MI.getOperand(2).getReg();
3619   auto Flags = MI.getFlags();
3620 
3621   LLT Ty = MRI.getType(Dst);
3622 
3623   const fltSemantics *FltSemantics;
3624   if (Ty == LLT::scalar(32))
3625     FltSemantics = &APFloat::IEEEsingle();
3626   else if (Ty == LLT::scalar(64))
3627     FltSemantics = &APFloat::IEEEdouble();
3628   else
3629     return false;
3630 
3631   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3632     .addUse(Src)
3633     .setMIFlags(Flags);
3634 
  // We don't need to concern ourselves with the snan handling difference; the
  // rsq has already quieted (or not) the input, so use whichever min/max will
  // directly select.
3637   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3638   const bool UseIEEE = MFI->getMode().IEEE;
3639 
3640   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3641   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3642                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3643 
3644   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3645 
3646   if (UseIEEE)
3647     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3648   else
3649     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3650   MI.eraseFromParent();
3651   return true;
3652 }
3653 
3654 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3655   switch (IID) {
3656   case Intrinsic::amdgcn_ds_fadd:
3657     return AMDGPU::G_ATOMICRMW_FADD;
3658   case Intrinsic::amdgcn_ds_fmin:
3659     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3660   case Intrinsic::amdgcn_ds_fmax:
3661     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3662   default:
3663     llvm_unreachable("not a DS FP intrinsic");
3664   }
3665 }
3666 
3667 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3668                                                       MachineInstr &MI,
3669                                                       Intrinsic::ID IID) const {
3670   GISelChangeObserver &Observer = Helper.Observer;
3671   Observer.changingInstr(MI);
3672 
3673   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3674 
3675   // The remaining operands were used to set fields in the MemOperand on
3676   // construction.
3677   for (int I = 6; I > 3; --I)
3678     MI.RemoveOperand(I);
3679 
3680   MI.RemoveOperand(1); // Remove the intrinsic ID.
3681   Observer.changedInstr(MI);
3682   return true;
3683 }
3684 
3685 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3686                                             MachineRegisterInfo &MRI,
3687                                             MachineIRBuilder &B) const {
3688   uint64_t Offset =
3689     ST.getTargetLowering()->getImplicitParameterOffset(
3690       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3691   LLT DstTy = MRI.getType(DstReg);
3692   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3693 
3694   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3695   if (!loadInputValue(KernargPtrReg, B,
3696                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3697     return false;
3698 
3699   // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
3701   return true;
3702 }
3703 
3704 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3705                                                  MachineRegisterInfo &MRI,
3706                                                  MachineIRBuilder &B) const {
3707   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3708   if (!MFI->isEntryFunction()) {
3709     return legalizePreloadedArgIntrin(MI, MRI, B,
3710                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3711   }
3712 
3713   Register DstReg = MI.getOperand(0).getReg();
3714   if (!getImplicitArgPtr(DstReg, MRI, B))
3715     return false;
3716 
3717   MI.eraseFromParent();
3718   return true;
3719 }
3720 
3721 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3722                                               MachineRegisterInfo &MRI,
3723                                               MachineIRBuilder &B,
3724                                               unsigned AddrSpace) const {
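  // A generic (flat) pointer lies in the given segment iff the high 32 bits
  // of its address match that segment's aperture base.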
3725   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3726   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3727   Register Hi32 = Unmerge.getReg(1);
3728 
3729   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3730   MI.eraseFromParent();
3731   return true;
3732 }
3733 
3734 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3735 // offset (the offset that is included in bounds checking and swizzling, to be
3736 // split between the instruction's voffset and immoffset fields) and soffset
3737 // (the offset that is excluded from bounds checking and swizzling, to go in
3738 // the instruction's soffset field).  This function takes the first kind of
3739 // offset and figures out how to split it between voffset and immoffset.
3740 std::pair<Register, unsigned>
3741 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3742                                         Register OrigOffset) const {
3743   const unsigned MaxImm = 4095;
3744   Register BaseReg;
3745   unsigned ImmOffset;
3746   const LLT S32 = LLT::scalar(32);
3747   MachineRegisterInfo &MRI = *B.getMRI();
3748 
3749   std::tie(BaseReg, ImmOffset) =
3750       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3751 
3752   // If BaseReg is a pointer, convert it to int.
3753   if (MRI.getType(BaseReg).isPointer())
3754     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3755 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
3760   // However, do not do that rounding down to a multiple of 4096 if that is a
3761   // negative number, as it appears to be illegal to have a negative offset
3762   // in the vgpr, even if adding the immediate offset makes it positive.
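  // For example, a constant offset of 8204 becomes ImmOffset = 12 plus a
  // voffset add of 8192, while an offset of 4095 or less stays entirely in
  // ImmOffset.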
3763   unsigned Overflow = ImmOffset & ~MaxImm;
3764   ImmOffset -= Overflow;
3765   if ((int32_t)Overflow < 0) {
3766     Overflow += ImmOffset;
3767     ImmOffset = 0;
3768   }
3769 
3770   if (Overflow != 0) {
3771     if (!BaseReg) {
3772       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3773     } else {
3774       auto OverflowVal = B.buildConstant(S32, Overflow);
3775       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3776     }
3777   }
3778 
3779   if (!BaseReg)
3780     BaseReg = B.buildConstant(S32, 0).getReg(0);
3781 
3782   return std::make_pair(BaseReg, ImmOffset);
3783 }
3784 
3785 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3786 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3787                                           Register VOffset, Register SOffset,
3788                                           unsigned ImmOffset, Register VIndex,
3789                                           MachineRegisterInfo &MRI) const {
3790   Optional<ValueAndVReg> MaybeVOffsetVal =
3791       getIConstantVRegValWithLookThrough(VOffset, MRI);
3792   Optional<ValueAndVReg> MaybeSOffsetVal =
3793       getIConstantVRegValWithLookThrough(SOffset, MRI);
3794   Optional<ValueAndVReg> MaybeVIndexVal =
3795       getIConstantVRegValWithLookThrough(VIndex, MRI);
3796   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3797   // update the MMO with that offset. The stride is unknown so we can only do
3798   // this if VIndex is constant 0.
3799   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3800       MaybeVIndexVal->Value == 0) {
3801     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3802                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3803     MMO->setOffset(TotalOffset);
3804   } else {
3805     // We don't have a constant combined offset to use in the MMO. Give up.
3806     MMO->setValue((Value *)nullptr);
3807   }
3808 }
3809 
3810 /// Handle register layout difference for f16 images for some subtargets.
3811 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3812                                              MachineRegisterInfo &MRI,
3813                                              Register Reg,
3814                                              bool ImageStore) const {
3815   const LLT S16 = LLT::scalar(16);
3816   const LLT S32 = LLT::scalar(32);
3817   LLT StoreVT = MRI.getType(Reg);
3818   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3819 
3820   if (ST.hasUnpackedD16VMem()) {
3821     auto Unmerge = B.buildUnmerge(S16, Reg);
3822 
3823     SmallVector<Register, 4> WideRegs;
3824     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3825       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3826 
3827     int NumElts = StoreVT.getNumElements();
3828 
3829     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3830         .getReg(0);
3831   }
3832 
3833   if (ImageStore && ST.hasImageStoreD16Bug()) {
3834     if (StoreVT.getNumElements() == 2) {
3835       SmallVector<Register, 4> PackedRegs;
3836       Reg = B.buildBitcast(S32, Reg).getReg(0);
3837       PackedRegs.push_back(Reg);
3838       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3839       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3840           .getReg(0);
3841     }
3842 
3843     if (StoreVT.getNumElements() == 3) {
3844       SmallVector<Register, 4> PackedRegs;
3845       auto Unmerge = B.buildUnmerge(S16, Reg);
3846       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3847         PackedRegs.push_back(Unmerge.getReg(I));
3848       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3849       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3850       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3851     }
3852 
3853     if (StoreVT.getNumElements() == 4) {
3854       SmallVector<Register, 4> PackedRegs;
3855       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3856       auto Unmerge = B.buildUnmerge(S32, Reg);
3857       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3858         PackedRegs.push_back(Unmerge.getReg(I));
3859       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3860       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3861           .getReg(0);
3862     }
3863 
3864     llvm_unreachable("invalid data type");
3865   }
3866 
3867   if (StoreVT == LLT::fixed_vector(3, S16)) {
3868     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3869               .getReg(0);
3870   }
3871   return Reg;
3872 }
3873 
3874 Register AMDGPULegalizerInfo::fixStoreSourceType(
3875   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3876   MachineRegisterInfo *MRI = B.getMRI();
3877   LLT Ty = MRI->getType(VData);
3878 
3879   const LLT S16 = LLT::scalar(16);
3880 
3881   // Fixup illegal register types for i8 stores.
3882   if (Ty == LLT::scalar(8) || Ty == S16) {
3883     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3884     return AnyExt;
3885   }
3886 
3887   if (Ty.isVector()) {
3888     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3889       if (IsFormat)
3890         return handleD16VData(B, *MRI, VData);
3891     }
3892   }
3893 
3894   return VData;
3895 }
3896 
3897 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3898                                               MachineRegisterInfo &MRI,
3899                                               MachineIRBuilder &B,
3900                                               bool IsTyped,
3901                                               bool IsFormat) const {
3902   Register VData = MI.getOperand(1).getReg();
3903   LLT Ty = MRI.getType(VData);
3904   LLT EltTy = Ty.getScalarType();
3905   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3906   const LLT S32 = LLT::scalar(32);
3907 
3908   VData = fixStoreSourceType(B, VData, IsFormat);
3909   Register RSrc = MI.getOperand(2).getReg();
3910 
3911   MachineMemOperand *MMO = *MI.memoperands_begin();
3912   const int MemSize = MMO->getSize();
3913 
3914   unsigned ImmOffset;
3915 
3916   // The typed intrinsics add an immediate after the registers.
3917   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3918 
3919   // The struct intrinsic variants add one additional operand over raw.
3920   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3921   Register VIndex;
3922   int OpOffset = 0;
3923   if (HasVIndex) {
3924     VIndex = MI.getOperand(3).getReg();
3925     OpOffset = 1;
3926   } else {
3927     VIndex = B.buildConstant(S32, 0).getReg(0);
3928   }
3929 
3930   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3931   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3932 
3933   unsigned Format = 0;
3934   if (IsTyped) {
3935     Format = MI.getOperand(5 + OpOffset).getImm();
3936     ++OpOffset;
3937   }
3938 
3939   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3940 
3941   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3942   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3943 
3944   unsigned Opc;
3945   if (IsTyped) {
3946     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3947                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3948   } else if (IsFormat) {
3949     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3950                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3951   } else {
3952     switch (MemSize) {
3953     case 1:
3954       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3955       break;
3956     case 2:
3957       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3958       break;
3959     default:
3960       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3961       break;
3962     }
3963   }
3964 
3965   auto MIB = B.buildInstr(Opc)
3966     .addUse(VData)              // vdata
3967     .addUse(RSrc)               // rsrc
3968     .addUse(VIndex)             // vindex
3969     .addUse(VOffset)            // voffset
3970     .addUse(SOffset)            // soffset
3971     .addImm(ImmOffset);         // offset(imm)
3972 
3973   if (IsTyped)
3974     MIB.addImm(Format);
3975 
3976   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3977      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3978      .addMemOperand(MMO);
3979 
3980   MI.eraseFromParent();
3981   return true;
3982 }
3983 
3984 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3985                                              MachineRegisterInfo &MRI,
3986                                              MachineIRBuilder &B,
3987                                              bool IsFormat,
3988                                              bool IsTyped) const {
3989   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3990   MachineMemOperand *MMO = *MI.memoperands_begin();
3991   const LLT MemTy = MMO->getMemoryType();
3992   const LLT S32 = LLT::scalar(32);
3993 
3994   Register Dst = MI.getOperand(0).getReg();
3995   Register RSrc = MI.getOperand(2).getReg();
3996 
3997   // The typed intrinsics add an immediate after the registers.
3998   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3999 
4000   // The struct intrinsic variants add one additional operand over raw.
4001   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4002   Register VIndex;
4003   int OpOffset = 0;
4004   if (HasVIndex) {
4005     VIndex = MI.getOperand(3).getReg();
4006     OpOffset = 1;
4007   } else {
4008     VIndex = B.buildConstant(S32, 0).getReg(0);
4009   }
4010 
4011   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4012   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4013 
4014   unsigned Format = 0;
4015   if (IsTyped) {
4016     Format = MI.getOperand(5 + OpOffset).getImm();
4017     ++OpOffset;
4018   }
4019 
4020   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4021   unsigned ImmOffset;
4022 
4023   LLT Ty = MRI.getType(Dst);
4024   LLT EltTy = Ty.getScalarType();
4025   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4026   const bool Unpacked = ST.hasUnpackedD16VMem();
4027 
4028   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4029   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4030 
4031   unsigned Opc;
4032 
4033   if (IsTyped) {
4034     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4035                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4036   } else if (IsFormat) {
4037     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4038                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4039   } else {
4040     switch (MemTy.getSizeInBits()) {
4041     case 8:
4042       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4043       break;
4044     case 16:
4045       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4046       break;
4047     default:
4048       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4049       break;
4050     }
4051   }
4052 
4053   Register LoadDstReg;
4054 
4055   bool IsExtLoad =
4056       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4057   LLT UnpackedTy = Ty.changeElementSize(32);
4058 
4059   if (IsExtLoad)
4060     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4061   else if (Unpacked && IsD16 && Ty.isVector())
4062     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4063   else
4064     LoadDstReg = Dst;
4065 
4066   auto MIB = B.buildInstr(Opc)
4067     .addDef(LoadDstReg)         // vdata
4068     .addUse(RSrc)               // rsrc
4069     .addUse(VIndex)             // vindex
4070     .addUse(VOffset)            // voffset
4071     .addUse(SOffset)            // soffset
4072     .addImm(ImmOffset);         // offset(imm)
4073 
4074   if (IsTyped)
4075     MIB.addImm(Format);
4076 
4077   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4078      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4079      .addMemOperand(MMO);
4080 
4081   if (LoadDstReg != Dst) {
4082     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4083 
    // The result was widened for an extending load; truncate back down.
4085     if (IsExtLoad)
4086       B.buildTrunc(Dst, LoadDstReg);
4087     else {
4088       // Repack to original 16-bit vector result
4089       // FIXME: G_TRUNC should work, but legalization currently fails
4090       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4091       SmallVector<Register, 4> Repack;
4092       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4093         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4094       B.buildMerge(Dst, Repack);
4095     }
4096   }
4097 
4098   MI.eraseFromParent();
4099   return true;
4100 }
4101 
4102 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4103                                                MachineIRBuilder &B,
4104                                                bool IsInc) const {
4105   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4106                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4107   B.buildInstr(Opc)
4108     .addDef(MI.getOperand(0).getReg())
4109     .addUse(MI.getOperand(2).getReg())
4110     .addUse(MI.getOperand(3).getReg())
4111     .cloneMemRefs(MI);
4112   MI.eraseFromParent();
4113   return true;
4114 }
4115 
4116 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4117   switch (IntrID) {
4118   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4119   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4120     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4121   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4122   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4123     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4124   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4125   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4126     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4127   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4128   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4130   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4131   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4132     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4133   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4134   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4135     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4136   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4137   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4138     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4139   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4140   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4141     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4142   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4143   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4144     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4145   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4146   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4147     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4148   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4149   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4150     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4151   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4152   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4153     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4154   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4155   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4156     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4157   case Intrinsic::amdgcn_buffer_atomic_fadd:
4158   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4159   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4160     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4161   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4162   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4163     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4164   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4165   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4166     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4167   default:
4168     llvm_unreachable("unhandled atomic opcode");
4169   }
4170 }
4171 
4172 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4173                                                MachineIRBuilder &B,
4174                                                Intrinsic::ID IID) const {
4175   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4176                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4177   const bool HasReturn = MI.getNumExplicitDefs() != 0;
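  // The operands being decoded below are laid out as:
  //   [dst], intrinsic id, vdata, [cmp], rsrc, [vindex], voffset, soffset,
  //   aux
  // where the bracketed operands are optional; OpOffset tracks which of them
  // are present.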
4178 
4179   Register Dst;
4180 
4181   int OpOffset = 0;
4182   if (HasReturn) {
4183     // A few FP atomics do not support return values.
4184     Dst = MI.getOperand(0).getReg();
4185   } else {
4186     OpOffset = -1;
4187   }
4188 
4189   Register VData = MI.getOperand(2 + OpOffset).getReg();
4190   Register CmpVal;
4191 
4192   if (IsCmpSwap) {
4193     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4194     ++OpOffset;
4195   }
4196 
4197   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4198   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4199 
4200   // The struct intrinsic variants add one additional operand over raw.
4201   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4202   Register VIndex;
4203   if (HasVIndex) {
4204     VIndex = MI.getOperand(4 + OpOffset).getReg();
4205     ++OpOffset;
4206   } else {
4207     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4208   }
4209 
4210   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4211   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4212   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4213 
4214   MachineMemOperand *MMO = *MI.memoperands_begin();
4215 
4216   unsigned ImmOffset;
4217   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4218   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4219 
4220   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4221 
4222   if (HasReturn)
4223     MIB.addDef(Dst);
4224 
4225   MIB.addUse(VData); // vdata
4226 
4227   if (IsCmpSwap)
4228     MIB.addReg(CmpVal);
4229 
4230   MIB.addUse(RSrc)               // rsrc
4231      .addUse(VIndex)             // vindex
4232      .addUse(VOffset)            // voffset
4233      .addUse(SOffset)            // soffset
4234      .addImm(ImmOffset)          // offset(imm)
4235      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4236      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4237      .addMemOperand(MMO);
4238 
4239   MI.eraseFromParent();
4240   return true;
4241 }
4242 
4243 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4244 /// vector with s16 typed elements.
4245 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4246                                       SmallVectorImpl<Register> &PackedAddrs,
4247                                       unsigned ArgOffset,
4248                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4249                                       bool IsA16, bool IsG16) {
4250   const LLT S16 = LLT::scalar(16);
4251   const LLT V2S16 = LLT::fixed_vector(2, 16);
4252   auto EndIdx = Intr->VAddrEnd;
4253 
4254   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4255     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4256     if (!SrcOp.isReg())
4257       continue; // _L to _LZ may have eliminated this.
4258 
4259     Register AddrReg = SrcOp.getReg();
4260 
4261     if ((I < Intr->GradientStart) ||
4262         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4263         (I >= Intr->CoordStart && !IsA16)) {
4264       // Handle any gradient or coordinate operands that should not be packed
4265       if ((I < Intr->GradientStart) && IsA16 &&
4266           (B.getMRI()->getType(AddrReg) == S16)) {
4267         // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit slot.
4269         PackedAddrs.push_back(
4270             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4271                 .getReg(0));
4272       } else {
4273         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4274         PackedAddrs.push_back(AddrReg);
4275       }
4276     } else {
4277       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4278       // derivatives dx/dh and dx/dv are packed with undef.
4279       if (((I + 1) >= EndIdx) ||
4280           ((Intr->NumGradients / 2) % 2 == 1 &&
4281            (I == static_cast<unsigned>(Intr->GradientStart +
4282                                        (Intr->NumGradients / 2) - 1) ||
4283             I == static_cast<unsigned>(Intr->GradientStart +
4284                                        Intr->NumGradients - 1))) ||
4285           // Check for _L to _LZ optimization
4286           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4287         PackedAddrs.push_back(
4288             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4289                 .getReg(0));
4290       } else {
4291         PackedAddrs.push_back(
4292             B.buildBuildVector(
4293                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4294                 .getReg(0));
4295         ++I;
4296       }
4297     }
4298   }
4299 }
4300 
4301 /// Convert from separate vaddr components to a single vector address register,
4302 /// and replace the remaining operands with $noreg.
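     ///
     /// For example, three s32 address operands become a single <3 x s32>
     /// G_BUILD_VECTOR placed in the first vaddr slot, and the remaining two
     /// operands are set to $noreg.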
4303 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4304                                      int DimIdx, int NumVAddrs) {
4305   const LLT S32 = LLT::scalar(32);
4306 
4307   SmallVector<Register, 8> AddrRegs;
4308   for (int I = 0; I != NumVAddrs; ++I) {
4309     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4310     if (SrcOp.isReg()) {
4311       AddrRegs.push_back(SrcOp.getReg());
4312       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4313     }
4314   }
4315 
4316   int NumAddrRegs = AddrRegs.size();
4317   if (NumAddrRegs != 1) {
4318     // Above 8 elements, round up to the next power of 2 (i.e. 16).
4319     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4320       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4321       auto Undef = B.buildUndef(S32);
4322       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4323       NumAddrRegs = RoundedNumRegs;
4324     }
4325 
4326     auto VAddr =
4327         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4328     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4329   }
4330 
4331   for (int I = 1; I != NumVAddrs; ++I) {
4332     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4333     if (SrcOp.isReg())
4334       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4335   }
4336 }
4337 
4338 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4339 ///
4340 /// Depending on the subtarget, load/store with 16-bit element data need to be
4341 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4342 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4343 /// registers.
4344 ///
4345 /// We don't want to directly select image instructions just yet, but also want
4346 /// to expose all register repacking to the legalizer/combiners. We also don't
4347 /// want a selected instruction entering RegBankSelect. In order to avoid
4348 /// defining a multitude of intermediate image instructions, directly hack on
4349 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4350 /// padding now unnecessary arguments with $noreg.
4351 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4352     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4353     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4354 
4355   const unsigned NumDefs = MI.getNumExplicitDefs();
4356   const unsigned ArgOffset = NumDefs + 1;
4357   bool IsTFE = NumDefs == 2;
4358   // We are only processing the operands of d16 image operations on subtargets
4359   // that use the unpacked register layout, or need to repack the TFE result.
4360 
4361   // TODO: Do we need to guard against already legalized intrinsics?
4362   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4363       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4364 
4365   MachineRegisterInfo *MRI = B.getMRI();
4366   const LLT S32 = LLT::scalar(32);
4367   const LLT S16 = LLT::scalar(16);
4368   const LLT V2S16 = LLT::fixed_vector(2, 16);
4369 
4370   unsigned DMask = 0;
4371   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4372   LLT Ty = MRI->getType(VData);
4373 
4374   // Check for 16 bit addresses and pack if true.
4375   LLT GradTy =
4376       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4377   LLT AddrTy =
4378       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4379   const bool IsG16 = GradTy == S16;
4380   const bool IsA16 = AddrTy == S16;
4381   const bool IsD16 = Ty.getScalarType() == S16;
4382 
4383   int DMaskLanes = 0;
4384   if (!BaseOpcode->Atomic) {
4385     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4386     if (BaseOpcode->Gather4) {
4387       DMaskLanes = 4;
4388     } else if (DMask != 0) {
4389       DMaskLanes = countPopulation(DMask);
4390     } else if (!IsTFE && !BaseOpcode->Store) {
4391       // If dmask is 0, this is a no-op load. This can be eliminated.
4392       B.buildUndef(MI.getOperand(0));
4393       MI.eraseFromParent();
4394       return true;
4395     }
4396   }
4397 
4398   Observer.changingInstr(MI);
4399   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4400 
4401   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4402                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4403   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4404                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4405   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4406 
4407   // Track that we legalized this
4408   MI.setDesc(B.getTII().get(NewOpcode));
4409 
4410   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4411   // dmask to be at least 1; otherwise the instruction will fail.
4412   if (IsTFE && DMask == 0) {
4413     DMask = 0x1;
4414     DMaskLanes = 1;
4415     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4416   }
4417 
4418   if (BaseOpcode->Atomic) {
4419     Register VData0 = MI.getOperand(2).getReg();
4420     LLT Ty = MRI->getType(VData0);
4421 
4422     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4423     if (Ty.isVector())
4424       return false;
4425 
4426     if (BaseOpcode->AtomicX2) {
4427       Register VData1 = MI.getOperand(3).getReg();
4428       // The two values are packed in one register.
4429       LLT PackedTy = LLT::fixed_vector(2, Ty);
4430       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4431       MI.getOperand(2).setReg(Concat.getReg(0));
4432       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4433     }
4434   }
4435 
4436   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
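       // Number of vaddr operands after the _L -> _LZ and _mip optimizations
       // below; used to decide between NSA and packed addressing on the
       // non-A16/G16 path.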
4437 
4438   // Optimize _L to _LZ when _L is zero
4439   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4440           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4441     const ConstantFP *ConstantLod;
4442 
4443     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4444                  m_GFCst(ConstantLod))) {
4445       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4446         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4447         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4448             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4449                                                      Intr->Dim);
4450 
4451         // The starting indexes should remain in the same place.
4452         --CorrectedNumVAddrs;
4453 
4454         MI.getOperand(NumDefs).setIntrinsicID(
4455             static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4456         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4457         Intr = NewImageDimIntr;
4458       }
4459     }
4460   }
4461 
4462   // Optimize _mip away when 'lod' is zero.
4463   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4464     int64_t ConstantLod;
4465     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4466                  m_ICst(ConstantLod))) {
4467       if (ConstantLod == 0) {
4468         // TODO: Change the intrinsic opcode and remove the operand instead of
4469         // replacing it with 0, as is done for the _L to _LZ handling above.
4470         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4471         --CorrectedNumVAddrs;
4472       }
4473     }
4474   }
4475 
4476   // Rewrite the addressing register layout before doing anything else.
4477   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4478     // 16-bit gradients are supported, but they are tied to the A16 control,
4479     // so both gradients and addresses must be 16-bit.
4480     return false;
4481   }
4482 
4483   if (IsA16 && !ST.hasA16()) {
4484     // A16 not supported
4485     return false;
4486   }
4487 
4488   if (IsA16 || IsG16) {
4489     if (Intr->NumVAddrs > 1) {
4490       SmallVector<Register, 4> PackedRegs;
4491 
4492       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4493                                 IsG16);
4494 
4495       // See also below in the non-a16 branch
4496       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4497                           PackedRegs.size() <= ST.getNSAMaxSize();
4498 
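           // Without NSA, all address dwords must live in one contiguous vector
           // register, so concatenate the packed halves.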
4499       if (!UseNSA && PackedRegs.size() > 1) {
4500         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4501         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4502         PackedRegs[0] = Concat.getReg(0);
4503         PackedRegs.resize(1);
4504       }
4505 
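           // Rewrite the vaddr operands in place: the first NumPacked operands
           // take the packed dwords and the rest are cleared to $noreg.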
4506       const unsigned NumPacked = PackedRegs.size();
4507       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4508         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4509         if (!SrcOp.isReg()) {
4510           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4511           continue;
4512         }
4513 
4514         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4515 
4516         if (I - Intr->VAddrStart < NumPacked)
4517           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4518         else
4519           SrcOp.setReg(AMDGPU::NoRegister);
4520       }
4521     }
4522   } else {
4523     // If the register allocator cannot place the address registers contiguously
4524     // without introducing moves, then using the non-sequential address encoding
4525     // is always preferable, since it saves VALU instructions and is usually a
4526     // wash in terms of code size or even better.
4527     //
4528     // However, we currently have no way of hinting to the register allocator
4529     // that MIMG addresses should be placed contiguously when it is possible to
4530     // do so, so force non-NSA for the common 2-address case as a heuristic.
4531     //
4532     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4533     // allocation when possible.
4534     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4535                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4536 
4537     if (!UseNSA && Intr->NumVAddrs > 1)
4538       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4539                                Intr->NumVAddrs);
4540   }
4541 
4542   int Flags = 0;
4543   if (IsA16)
4544     Flags |= 1;
4545   if (IsG16)
4546     Flags |= 2;
4547   MI.addOperand(MachineOperand::CreateImm(Flags));
4548 
4549   if (BaseOpcode->Store) { // No TFE for stores?
4550     // TODO: Handle dmask trim
4551     if (!Ty.isVector() || !IsD16)
4552       return true;
4553 
4554     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4555     if (RepackedReg != VData) {
4556       MI.getOperand(1).setReg(RepackedReg);
4557     }
4558 
4559     return true;
4560   }
4561 
4562   Register DstReg = MI.getOperand(0).getReg();
4563   const LLT EltTy = Ty.getScalarType();
4564   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4565 
4566   // Confirm that the return type is large enough for the dmask specified
4567   if (NumElts < DMaskLanes)
4568     return false;
4569 
4570   if (NumElts > 4 || DMaskLanes > 4)
4571     return false;
4572 
4573   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4574   const LLT AdjustedTy =
4575       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4576 
4577   // The raw dword aligned data component of the load. The only legal cases
4578   // where this matters should be when using the packed D16 format, for
4579   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4580   LLT RoundedTy;
4581 
4582   // S32 vector to cover all data, plus the TFE result element.
4583   LLT TFETy;
4584 
4585   // Register type to use for each loaded component. Will be S32 or V2S16.
4586   LLT RegTy;
4587 
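       // For example, a d16 <3 x s16> load on a packed subtarget ends up with
       // RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = <2 x s16>
       // (s32 if TFE is used).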
4588   if (IsD16 && ST.hasUnpackedD16VMem()) {
4589     RoundedTy =
4590         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4591     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4592     RegTy = S32;
4593   } else {
4594     unsigned EltSize = EltTy.getSizeInBits();
4595     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4596     unsigned RoundedSize = 32 * RoundedElts;
4597     RoundedTy = LLT::scalarOrVector(
4598         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4599     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4600     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4601   }
4602 
4603   // The return type does not need adjustment.
4604   // TODO: Should we change s16 case to s32 or <2 x s16>?
4605   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4606     return true;
4607 
4608   Register Dst1Reg;
4609 
4610   // Insert after the instruction.
4611   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4612 
4613   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4614   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4615   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4616   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4617 
4618   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4619 
4620   MI.getOperand(0).setReg(NewResultReg);
4621 
4622   // In the IR, TFE is supposed to be used with a 2 element struct return
4623   // type. The instruction really returns these two values in one contiguous
4624   // register, with one additional dword beyond the loaded data. Rewrite the
4625   // return type to use a single register result.
4626 
4627   if (IsTFE) {
4628     Dst1Reg = MI.getOperand(1).getReg();
4629     if (MRI->getType(Dst1Reg) != S32)
4630       return false;
4631 
4632     // TODO: Make sure the TFE operand bit is set.
4633     MI.RemoveOperand(1);
4634 
4635     // Handle the easy case that requires no repack instructions.
4636     if (Ty == S32) {
4637       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4638       return true;
4639     }
4640   }
4641 
4642   // Now figure out how to copy the new result register back into the old
4643   // result.
4644   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
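       // Initialize every slot to Dst1Reg so that, with TFE, the status dword
       // element of the unmerge below lands directly in its final register.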
4645 
4646   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4647 
4648   if (ResultNumRegs == 1) {
4649     assert(!IsTFE);
4650     ResultRegs[0] = NewResultReg;
4651   } else {
4652     // We have to repack into a new vector of some kind.
4653     for (int I = 0; I != NumDataRegs; ++I)
4654       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4655     B.buildUnmerge(ResultRegs, NewResultReg);
4656 
4657     // Drop the final TFE element to get the data part. The TFE result is
4658     // directly written to the right place already.
4659     if (IsTFE)
4660       ResultRegs.resize(NumDataRegs);
4661   }
4662 
4663   // For an s16 scalar result, we form an s32 result with a truncate regardless
4664   // of packed vs. unpacked.
4665   if (IsD16 && !Ty.isVector()) {
4666     B.buildTrunc(DstReg, ResultRegs[0]);
4667     return true;
4668   }
4669 
4670   // Avoid a build/concat_vector of 1 entry.
4671   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4672     B.buildBitcast(DstReg, ResultRegs[0]);
4673     return true;
4674   }
4675 
4676   assert(Ty.isVector());
4677 
4678   if (IsD16) {
4679     // For packed D16 results with TFE enabled, all the data components are
4680     // S32. Cast back to the expected type.
4681     //
4682     // TODO: We don't really need to load s32 elements. We would only need one
4683     // cast for the TFE result if a multiple of v2s16 was used.
4684     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4685       for (Register &Reg : ResultRegs)
4686         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4687     } else if (ST.hasUnpackedD16VMem()) {
4688       for (Register &Reg : ResultRegs)
4689         Reg = B.buildTrunc(S16, Reg).getReg(0);
4690     }
4691   }
4692 
4693   auto padWithUndef = [&](LLT Ty, int NumElts) {
4694     if (NumElts == 0)
4695       return;
4696     Register Undef = B.buildUndef(Ty).getReg(0);
4697     for (int I = 0; I != NumElts; ++I)
4698       ResultRegs.push_back(Undef);
4699   };
4700 
4701   // Pad out any elements eliminated due to the dmask.
4702   LLT ResTy = MRI->getType(ResultRegs[0]);
4703   if (!ResTy.isVector()) {
4704     padWithUndef(ResTy, NumElts - ResultRegs.size());
4705     B.buildBuildVector(DstReg, ResultRegs);
4706     return true;
4707   }
4708 
4709   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4710   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4711 
4712   // Deal with the one annoying legal case.
4713   const LLT V3S16 = LLT::fixed_vector(3, 16);
4714   if (Ty == V3S16) {
4715     if (IsTFE) {
4716       if (ResultRegs.size() == 1) {
4717         NewResultReg = ResultRegs[0];
4718       } else if (ResultRegs.size() == 2) {
4719         LLT V4S16 = LLT::fixed_vector(4, 16);
4720         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4721       } else {
4722         return false;
4723       }
4724     }
4725 
4726     if (MRI->getType(DstReg).getNumElements() <
4727         MRI->getType(NewResultReg).getNumElements()) {
4728       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4729     } else {
4730       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4731     }
4732     return true;
4733   }
4734 
4735   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4736   B.buildConcatVectors(DstReg, ResultRegs);
4737   return true;
4738 }
4739 
4740 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4741   LegalizerHelper &Helper, MachineInstr &MI) const {
4742   MachineIRBuilder &B = Helper.MIRBuilder;
4743   GISelChangeObserver &Observer = Helper.Observer;
4744 
4745   Register Dst = MI.getOperand(0).getReg();
4746   LLT Ty = B.getMRI()->getType(Dst);
4747   unsigned Size = Ty.getSizeInBits();
4748   MachineFunction &MF = B.getMF();
4749 
4750   Observer.changingInstr(MI);
4751 
4752   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4753     Ty = getBitcastRegisterType(Ty);
4754     Helper.bitcastDst(MI, Ty, 0);
4755     Dst = MI.getOperand(0).getReg();
4756     B.setInsertPt(B.getMBB(), MI);
4757   }
4758 
4759   // FIXME: We don't really need this intermediate instruction. The intrinsic
4760   // should be fixed to have a memory operand. Since it's readnone, we're not
4761   // allowed to add one.
4762   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4763   MI.RemoveOperand(1); // Remove intrinsic ID
4764 
4765   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4766   // TODO: Should this use datalayout alignment?
4767   const unsigned MemSize = (Size + 7) / 8;
4768   const Align MemAlign(4);
4769   MachineMemOperand *MMO = MF.getMachineMemOperand(
4770       MachinePointerInfo(),
4771       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4772           MachineMemOperand::MOInvariant,
4773       MemSize, MemAlign);
4774   MI.addMemOperand(MF, MMO);
4775 
4776   // There are no 96-bit result scalar loads, but widening to 128-bit should
4777   // always be legal. We may need to restore this to a 96-bit result if it turns
4778   // out this needs to be converted to a vector load during RegBankSelect.
4779   if (!isPowerOf2_32(Size)) {
4780     if (Ty.isVector())
4781       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4782     else
4783       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4784   }
4785 
4786   Observer.changedInstr(MI);
4787   return true;
4788 }
4789 
4790 // TODO: Move to selection
4791 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4792                                                 MachineRegisterInfo &MRI,
4793                                                 MachineIRBuilder &B) const {
4794   if (!ST.isTrapHandlerEnabled() ||
4795       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4796     return legalizeTrapEndpgm(MI, MRI, B);
4797 
4798   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4799     switch (*HsaAbiVer) {
4800     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4801     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4802       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4803     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4804       return ST.supportsGetDoorbellID() ?
4805           legalizeTrapHsa(MI, MRI, B) :
4806           legalizeTrapHsaQueuePtr(MI, MRI, B);
4807     }
4808   }
4809 
4810   llvm_unreachable("Unknown trap handler");
4811 }
4812 
4813 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4814     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4815   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4816   MI.eraseFromParent();
4817   return true;
4818 }
4819 
4820 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4821     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4822   // Pass queue pointer to trap handler as input, and insert trap instruction
4823   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4824   Register LiveIn =
4825     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4826   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4827     return false;
4828 
4829   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4830   B.buildCopy(SGPR01, LiveIn);
4831   B.buildInstr(AMDGPU::S_TRAP)
4832       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4833       .addReg(SGPR01, RegState::Implicit);
4834 
4835   MI.eraseFromParent();
4836   return true;
4837 }
4838 
4839 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4840     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4841   B.buildInstr(AMDGPU::S_TRAP)
4842       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4843   MI.eraseFromParent();
4844   return true;
4845 }
4846 
4847 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4848     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4849   // If this is a non-HSA path or the trap handler is disabled, report a
4850   // warning accordingly.
4851   if (!ST.isTrapHandlerEnabled() ||
4852       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4853     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4854                                      "debugtrap handler not supported",
4855                                      MI.getDebugLoc(), DS_Warning);
4856     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4857     Ctx.diagnose(NoTrap);
4858   } else {
4859     // Insert debug-trap instruction
4860     B.buildInstr(AMDGPU::S_TRAP)
4861         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4862   }
4863 
4864   MI.eraseFromParent();
4865   return true;
4866 }
4867 
4868 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4869                                                MachineIRBuilder &B) const {
4870   MachineRegisterInfo &MRI = *B.getMRI();
4871   const LLT S16 = LLT::scalar(16);
4872   const LLT S32 = LLT::scalar(32);
4873 
4874   Register DstReg = MI.getOperand(0).getReg();
4875   Register NodePtr = MI.getOperand(2).getReg();
4876   Register RayExtent = MI.getOperand(3).getReg();
4877   Register RayOrigin = MI.getOperand(4).getReg();
4878   Register RayDir = MI.getOperand(5).getReg();
4879   Register RayInvDir = MI.getOperand(6).getReg();
4880   Register TDescr = MI.getOperand(7).getReg();
4881 
4882   if (!ST.hasGFX10_AEncoding()) {
4883     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4884                                         "intrinsic not supported on subtarget",
4885                                         MI.getDebugLoc());
4886     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4887     return false;
4888   }
4889 
4890   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4891   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4892   const unsigned NumVDataDwords = 4;
4893   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
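       // vaddr dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
       // ray_dir and ray_inv_dir (3 dwords each, or 3 packed dwords total with
       // A16).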
4894   const bool UseNSA =
4895       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4896   const unsigned BaseOpcodes[2][2] = {
4897       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4898       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4899        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4900   int Opcode;
4901   if (UseNSA) {
4902     Opcode =
4903         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4904                               NumVDataDwords, NumVAddrDwords);
4905   } else {
4906     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4907                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4908                                    PowerOf2Ceil(NumVAddrDwords));
4909   }
4910   assert(Opcode != -1);
4911 
4912   SmallVector<Register, 12> Ops;
4913   if (Is64) {
4914     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4915     Ops.push_back(Unmerge.getReg(0));
4916     Ops.push_back(Unmerge.getReg(1));
4917   } else {
4918     Ops.push_back(NodePtr);
4919   }
4920   Ops.push_back(RayExtent);
4921 
4922   auto packLanes = [&Ops, &S32, &B](Register Src) {
4923     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4924     Ops.push_back(Unmerge.getReg(0));
4925     Ops.push_back(Unmerge.getReg(1));
4926     Ops.push_back(Unmerge.getReg(2));
4927   };
4928 
4929   packLanes(RayOrigin);
4930   if (IsA16) {
4931     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4932     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4933     Register R1 = MRI.createGenericVirtualRegister(S32);
4934     Register R2 = MRI.createGenericVirtualRegister(S32);
4935     Register R3 = MRI.createGenericVirtualRegister(S32);
4936     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4937     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4938     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4939     Ops.push_back(R1);
4940     Ops.push_back(R2);
4941     Ops.push_back(R3);
4942   } else {
4943     packLanes(RayDir);
4944     packLanes(RayInvDir);
4945   }
4946 
4947   if (!UseNSA) {
4948     // Build a single vector containing all of the operands prepared so far.
4949     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4950     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4951     Ops.clear();
4952     Ops.push_back(MergedOps);
4953   }
4954 
4955   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4956     .addDef(DstReg)
4957     .addImm(Opcode);
4958 
4959   for (Register R : Ops) {
4960     MIB.addUse(R);
4961   }
4962 
4963   MIB.addUse(TDescr)
4964      .addImm(IsA16 ? 1 : 0)
4965      .cloneMemRefs(MI);
4966 
4967   MI.eraseFromParent();
4968   return true;
4969 }
4970 
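     // Define MI's result register with the constant C, then erase MI.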
4971 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4972   B.buildConstant(MI.getOperand(0).getReg(), C);
4973   MI.eraseFromParent();
4974   return true;
4975 }
4976 
4977 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4978                                             MachineInstr &MI) const {
4979   MachineIRBuilder &B = Helper.MIRBuilder;
4980   MachineRegisterInfo &MRI = *B.getMRI();
4981 
4982   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4983   auto IntrID = MI.getIntrinsicID();
4984   switch (IntrID) {
4985   case Intrinsic::amdgcn_if:
4986   case Intrinsic::amdgcn_else: {
4987     MachineInstr *Br = nullptr;
4988     MachineBasicBlock *UncondBrTarget = nullptr;
4989     bool Negated = false;
4990     if (MachineInstr *BrCond =
4991             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4992       const SIRegisterInfo *TRI
4993         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4994 
4995       Register Def = MI.getOperand(1).getReg();
4996       Register Use = MI.getOperand(3).getReg();
4997 
4998       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4999 
5000       if (Negated)
5001         std::swap(CondBrTarget, UncondBrTarget);
5002 
5003       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004       if (IntrID == Intrinsic::amdgcn_if) {
5005         B.buildInstr(AMDGPU::SI_IF)
5006           .addDef(Def)
5007           .addUse(Use)
5008           .addMBB(UncondBrTarget);
5009       } else {
5010         B.buildInstr(AMDGPU::SI_ELSE)
5011             .addDef(Def)
5012             .addUse(Use)
5013             .addMBB(UncondBrTarget);
5014       }
5015 
5016       if (Br) {
5017         Br->getOperand(0).setMBB(CondBrTarget);
5018       } else {
5019         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5020         // since we're swapping branch targets it needs to be reinserted.
5021         // FIXME: IRTranslator should probably not do this
5022         B.buildBr(*CondBrTarget);
5023       }
5024 
5025       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5026       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5027       MI.eraseFromParent();
5028       BrCond->eraseFromParent();
5029       return true;
5030     }
5031 
5032     return false;
5033   }
5034   case Intrinsic::amdgcn_loop: {
5035     MachineInstr *Br = nullptr;
5036     MachineBasicBlock *UncondBrTarget = nullptr;
5037     bool Negated = false;
5038     if (MachineInstr *BrCond =
5039             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5040       const SIRegisterInfo *TRI
5041         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5042 
5043       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5044       Register Reg = MI.getOperand(2).getReg();
5045 
5046       if (Negated)
5047         std::swap(CondBrTarget, UncondBrTarget);
5048 
5049       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5050       B.buildInstr(AMDGPU::SI_LOOP)
5051         .addUse(Reg)
5052         .addMBB(UncondBrTarget);
5053 
5054       if (Br)
5055         Br->getOperand(0).setMBB(CondBrTarget);
5056       else
5057         B.buildBr(*CondBrTarget);
5058 
5059       MI.eraseFromParent();
5060       BrCond->eraseFromParent();
5061       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5062       return true;
5063     }
5064 
5065     return false;
5066   }
5067   case Intrinsic::amdgcn_kernarg_segment_ptr:
5068     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5069       // This only makes sense to call in a kernel, so just lower to null.
5070       B.buildConstant(MI.getOperand(0).getReg(), 0);
5071       MI.eraseFromParent();
5072       return true;
5073     }
5074 
5075     return legalizePreloadedArgIntrin(
5076       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5077   case Intrinsic::amdgcn_implicitarg_ptr:
5078     return legalizeImplicitArgPtr(MI, MRI, B);
5079   case Intrinsic::amdgcn_workitem_id_x:
5080     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5081       return replaceWithConstant(B, MI, 0);
5082     return legalizePreloadedArgIntrin(MI, MRI, B,
5083                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5084   case Intrinsic::amdgcn_workitem_id_y:
5085     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5086       return replaceWithConstant(B, MI, 0);
5087 
5088     return legalizePreloadedArgIntrin(MI, MRI, B,
5089                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5090   case Intrinsic::amdgcn_workitem_id_z:
5091     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5092       return replaceWithConstant(B, MI, 0);
5093 
5094     return legalizePreloadedArgIntrin(MI, MRI, B,
5095                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5096   case Intrinsic::amdgcn_workgroup_id_x:
5097     return legalizePreloadedArgIntrin(MI, MRI, B,
5098                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5099   case Intrinsic::amdgcn_workgroup_id_y:
5100     return legalizePreloadedArgIntrin(MI, MRI, B,
5101                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5102   case Intrinsic::amdgcn_workgroup_id_z:
5103     return legalizePreloadedArgIntrin(MI, MRI, B,
5104                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5105   case Intrinsic::amdgcn_dispatch_ptr:
5106     return legalizePreloadedArgIntrin(MI, MRI, B,
5107                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5108   case Intrinsic::amdgcn_queue_ptr:
5109     return legalizePreloadedArgIntrin(MI, MRI, B,
5110                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5111   case Intrinsic::amdgcn_implicit_buffer_ptr:
5112     return legalizePreloadedArgIntrin(
5113       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5114   case Intrinsic::amdgcn_dispatch_id:
5115     return legalizePreloadedArgIntrin(MI, MRI, B,
5116                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5117   case Intrinsic::amdgcn_fdiv_fast:
5118     return legalizeFDIVFastIntrin(MI, MRI, B);
5119   case Intrinsic::amdgcn_is_shared:
5120     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5121   case Intrinsic::amdgcn_is_private:
5122     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5123   case Intrinsic::amdgcn_wavefrontsize: {
5124     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5125     MI.eraseFromParent();
5126     return true;
5127   }
5128   case Intrinsic::amdgcn_s_buffer_load:
5129     return legalizeSBufferLoad(Helper, MI);
5130   case Intrinsic::amdgcn_raw_buffer_store:
5131   case Intrinsic::amdgcn_struct_buffer_store:
5132     return legalizeBufferStore(MI, MRI, B, false, false);
5133   case Intrinsic::amdgcn_raw_buffer_store_format:
5134   case Intrinsic::amdgcn_struct_buffer_store_format:
5135     return legalizeBufferStore(MI, MRI, B, false, true);
5136   case Intrinsic::amdgcn_raw_tbuffer_store:
5137   case Intrinsic::amdgcn_struct_tbuffer_store:
5138     return legalizeBufferStore(MI, MRI, B, true, true);
5139   case Intrinsic::amdgcn_raw_buffer_load:
5140   case Intrinsic::amdgcn_struct_buffer_load:
5141     return legalizeBufferLoad(MI, MRI, B, false, false);
5142   case Intrinsic::amdgcn_raw_buffer_load_format:
5143   case Intrinsic::amdgcn_struct_buffer_load_format:
5144     return legalizeBufferLoad(MI, MRI, B, true, false);
5145   case Intrinsic::amdgcn_raw_tbuffer_load:
5146   case Intrinsic::amdgcn_struct_tbuffer_load:
5147     return legalizeBufferLoad(MI, MRI, B, true, true);
5148   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5149   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5150   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5151   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5152   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5153   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5154   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5155   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5156   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5157   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5158   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5159   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5160   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5161   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5162   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5163   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5164   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5165   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5166   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5167   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5168   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5169   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5170   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5171   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5172   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5173   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5174   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5175   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5176   case Intrinsic::amdgcn_buffer_atomic_fadd:
5177   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5178   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5179   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5180   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5181     return legalizeBufferAtomic(MI, B, IntrID);
5182   case Intrinsic::amdgcn_atomic_inc:
5183     return legalizeAtomicIncDec(MI, B, true);
5184   case Intrinsic::amdgcn_atomic_dec:
5185     return legalizeAtomicIncDec(MI, B, false);
5186   case Intrinsic::trap:
5187     return legalizeTrapIntrinsic(MI, MRI, B);
5188   case Intrinsic::debugtrap:
5189     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5190   case Intrinsic::amdgcn_rsq_clamp:
5191     return legalizeRsqClampIntrinsic(MI, MRI, B);
5192   case Intrinsic::amdgcn_ds_fadd:
5193   case Intrinsic::amdgcn_ds_fmin:
5194   case Intrinsic::amdgcn_ds_fmax:
5195     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5196   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5197     return legalizeBVHIntrinsic(MI, B);
5198   default: {
5199     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5200             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5201       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5202     return true;
5203   }
5204   }
5205 
5206   return true;
5207 }
5208