1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
  if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
#endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .widenScalarToNextPow2(1, 32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32);
936 
937 
938   // The hardware instructions return a different result on 0 than the generic
939   // instructions expect. The hardware produces -1, but these produce the
940   // bitwidth.
941   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
942     .scalarize(0)
943     .clampScalar(0, S32, S32)
944     .clampScalar(1, S32, S64)
945     .widenScalarToNextPow2(0, 32)
946     .widenScalarToNextPow2(1, 32)
947     .custom();
948 
949   // The 64-bit versions produce 32-bit results, but only on the SALU.
950   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
951     .legalFor({{S32, S32}, {S32, S64}})
952     .clampScalar(0, S32, S32)
953     .clampScalar(1, S32, S64)
954     .scalarize(0)
955     .widenScalarToNextPow2(0, 32)
956     .widenScalarToNextPow2(1, 32);
957 
958   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
959   // RegBankSelect.
960   getActionDefinitionsBuilder(G_BITREVERSE)
961     .legalFor({S32, S64})
962     .clampScalar(0, S32, S64)
963     .scalarize(0)
964     .widenScalarToNextPow2(0);
965 
966   if (ST.has16BitInsts()) {
967     getActionDefinitionsBuilder(G_BSWAP)
968       .legalFor({S16, S32, V2S16})
969       .clampMaxNumElementsStrict(0, S16, 2)
970       // FIXME: Fixing non-power-of-2 before clamp is workaround for
971       // narrowScalar limitation.
972       .widenScalarToNextPow2(0)
973       .clampScalar(0, S16, S32)
974       .scalarize(0);
975 
976     if (ST.hasVOP3PInsts()) {
977       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
978         .legalFor({S32, S16, V2S16})
979         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
980         .clampMaxNumElements(0, S16, 2)
981         .minScalar(0, S16)
982         .widenScalarToNextPow2(0)
983         .scalarize(0)
984         .lower();
985     } else {
986       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
987         .legalFor({S32, S16})
988         .widenScalarToNextPow2(0)
989         .minScalar(0, S16)
990         .scalarize(0)
991         .lower();
992     }
993   } else {
994     // TODO: Should have same legality without v_perm_b32
995     getActionDefinitionsBuilder(G_BSWAP)
996       .legalFor({S32})
997       .lowerIf(scalarNarrowerThan(0, 32))
998       // FIXME: Fixing non-power-of-2 before clamp is workaround for
999       // narrowScalar limitation.
1000       .widenScalarToNextPow2(0)
1001       .maxScalar(0, S32)
1002       .scalarize(0)
1003       .lower();
1004 
1005     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1006       .legalFor({S32})
1007       .minScalar(0, S32)
1008       .widenScalarToNextPow2(0)
1009       .scalarize(0)
1010       .lower();
1011   }
1012 
1013   getActionDefinitionsBuilder(G_INTTOPTR)
1014     // List the common cases
1015     .legalForCartesianProduct(AddrSpaces64, {S64})
1016     .legalForCartesianProduct(AddrSpaces32, {S32})
1017     .scalarize(0)
1018     // Accept any address space as long as the size matches
1019     .legalIf(sameSize(0, 1))
1020     .widenScalarIf(smallerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       })
1024     .narrowScalarIf(largerThan(1, 0),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1027       });
1028 
1029   getActionDefinitionsBuilder(G_PTRTOINT)
1030     // List the common cases
1031     .legalForCartesianProduct(AddrSpaces64, {S64})
1032     .legalForCartesianProduct(AddrSpaces32, {S32})
1033     .scalarize(0)
1034     // Accept any address space as long as the size matches
1035     .legalIf(sameSize(0, 1))
1036     .widenScalarIf(smallerThan(0, 1),
1037       [](const LegalityQuery &Query) {
1038         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1039       })
1040     .narrowScalarIf(
1041       largerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       });
1045 
1046   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1047     .scalarize(0)
1048     .custom();
1049 
1050   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1051                                     bool IsLoad) -> bool {
1052     const LLT DstTy = Query.Types[0];
1053 
1054     // Split vector extloads.
1055     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     return false;
1078   };
1079 
1080   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1081   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1082   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1083 
1084   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1085   // LDS
1086   // TODO: Unsupported flat for SI.
1087 
1088   for (unsigned Op : {G_LOAD, G_STORE}) {
1089     const bool IsStore = Op == G_STORE;
1090 
1091     auto &Actions = getActionDefinitionsBuilder(Op);
1092     // Explicitly list some common cases.
1093     // TODO: Does this help compile time at all?
1094     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1095                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1096                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1097                                       {S64, GlobalPtr, S64, GlobalAlign32},
1098                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1099                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1100                                       {S32, GlobalPtr, S8, GlobalAlign8},
1101                                       {S32, GlobalPtr, S16, GlobalAlign16},
1102 
1103                                       {S32, LocalPtr, S32, 32},
1104                                       {S64, LocalPtr, S64, 32},
1105                                       {V2S32, LocalPtr, V2S32, 32},
1106                                       {S32, LocalPtr, S8, 8},
1107                                       {S32, LocalPtr, S16, 16},
1108                                       {V2S16, LocalPtr, S32, 32},
1109 
1110                                       {S32, PrivatePtr, S32, 32},
1111                                       {S32, PrivatePtr, S8, 8},
1112                                       {S32, PrivatePtr, S16, 16},
1113                                       {V2S16, PrivatePtr, S32, 32},
1114 
1115                                       {S32, ConstantPtr, S32, GlobalAlign32},
1116                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1117                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1118                                       {S64, ConstantPtr, S64, GlobalAlign32},
1119                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1120     Actions.legalIf(
1121       [=](const LegalityQuery &Query) -> bool {
1122         return isLoadStoreLegal(ST, Query);
1123       });
1124 
1125     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1126     // 64-bits.
1127     //
1128     // TODO: Should generalize bitcast action into coerce, which will also cover
1129     // inserting addrspacecasts.
1130     Actions.customIf(typeIs(1, Constant32Ptr));
1131 
1132     // Turn any illegal element vectors into something easier to deal
1133     // with. These will ultimately produce 32-bit scalar shifts to extract the
1134     // parts anyway.
1135     //
1136     // For odd 16-bit element vectors, prefer to split those into pieces with
1137     // 16-bit vector parts.
1138     Actions.bitcastIf(
1139       [=](const LegalityQuery &Query) -> bool {
1140         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1141                                           Query.MMODescrs[0].MemoryTy);
1142       }, bitcastToRegisterType(0));
1143 
1144     if (!IsStore) {
1145       // Widen suitably aligned loads by loading extra bytes. The standard
1146       // legalization actions can't properly express widening memory operands.
1147       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1148         return shouldWidenLoad(ST, Query, G_LOAD);
1149       });
1150     }
1151 
1152     // FIXME: load/store narrowing should be moved to lower action
1153     Actions
1154         .narrowScalarIf(
1155             [=](const LegalityQuery &Query) -> bool {
1156               return !Query.Types[0].isVector() &&
1157                      needToSplitMemOp(Query, Op == G_LOAD);
1158             },
1159             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1160               const LLT DstTy = Query.Types[0];
1161               const LLT PtrTy = Query.Types[1];
1162 
1163               const unsigned DstSize = DstTy.getSizeInBits();
1164               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1165 
1166               // Split extloads.
1167               if (DstSize > MemSize)
1168                 return std::make_pair(0, LLT::scalar(MemSize));
1169 
1170               unsigned MaxSize = maxSizeForAddrSpace(ST,
1171                                                      PtrTy.getAddressSpace(),
1172                                                      Op == G_LOAD);
1173               if (MemSize > MaxSize)
1174                 return std::make_pair(0, LLT::scalar(MaxSize));
1175 
1176               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1177               return std::make_pair(0, LLT::scalar(Align));
1178             })
1179         .fewerElementsIf(
1180             [=](const LegalityQuery &Query) -> bool {
1181               return Query.Types[0].isVector() &&
1182                      needToSplitMemOp(Query, Op == G_LOAD);
1183             },
1184             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1185               const LLT DstTy = Query.Types[0];
1186               const LLT PtrTy = Query.Types[1];
1187 
1188               LLT EltTy = DstTy.getElementType();
1189               unsigned MaxSize = maxSizeForAddrSpace(ST,
1190                                                      PtrTy.getAddressSpace(),
1191                                                      Op == G_LOAD);
1192 
1193               // FIXME: Handle widened to power of 2 results better. This ends
1194               // up scalarizing.
1195               // FIXME: 3 element stores scalarized on SI
1196 
1197               // Split if it's too large for the address space.
1198               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1199               if (MemSize > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                       0, LLT::scalarOrVector(
1206                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1207                 }
1208 
1209                 unsigned NumPieces = MemSize / MaxSize;
1210 
1211                 // FIXME: Refine when odd breakdowns handled
1212                 // The scalars will need to be re-legalized.
1213                 if (NumPieces == 1 || NumPieces >= NumElts ||
1214                     NumElts % NumPieces != 0)
1215                   return std::make_pair(0, EltTy);
1216 
1217                 return std::make_pair(
1218                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1219               }
1220 
1221               // FIXME: We could probably handle weird extending loads better.
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                     0, LLT::scalarOrVector(
1234                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1235               }
1236 
1237               // May need relegalization for the scalars.
1238               return std::make_pair(0, EltTy);
1239             })
1240     .minScalar(0, S32)
1241     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1242     .widenScalarToNextPow2(0)
1243     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1244     .lower();
1245   }
1246 
1247   // FIXME: Unaligned accesses not lowered.
1248   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1249                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1250                                                   {S32, GlobalPtr, S16, 2 * 8},
1251                                                   {S32, LocalPtr, S8, 8},
1252                                                   {S32, LocalPtr, S16, 16},
1253                                                   {S32, PrivatePtr, S8, 8},
1254                                                   {S32, PrivatePtr, S16, 16},
1255                                                   {S32, ConstantPtr, S8, 8},
1256                                                   {S32, ConstantPtr, S16, 2 * 8}})
1257                        .legalIf(
1258                          [=](const LegalityQuery &Query) -> bool {
1259                            return isLoadStoreLegal(ST, Query);
1260                          });
1261 
1262   if (ST.hasFlatAddressSpace()) {
1263     ExtLoads.legalForTypesWithMemDesc(
1264         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1265   }
1266 
1267   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1268   // 64-bits.
1269   //
1270   // TODO: Should generalize bitcast action into coerce, which will also cover
1271   // inserting addrspacecasts.
1272   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1273 
1274   ExtLoads.clampScalar(0, S32, S32)
1275           .widenScalarToNextPow2(0)
1276           .lower();
1277 
1278   auto &Atomics = getActionDefinitionsBuilder(
1279     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1280      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1281      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1282      G_ATOMICRMW_UMIN})
1283     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1284                {S64, GlobalPtr}, {S64, LocalPtr},
1285                {S32, RegionPtr}, {S64, RegionPtr}});
1286   if (ST.hasFlatAddressSpace()) {
1287     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1288   }
1289 
1290   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1291   if (ST.hasLDSFPAtomicAdd()) {
1292     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1293     if (ST.hasGFX90AInsts())
1294       Atomic.legalFor({{S64, LocalPtr}});
1295   }
1296   if (ST.hasAtomicFaddInsts())
1297     Atomic.legalFor({{S32, GlobalPtr}});
1298 
1299   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1300   // demarshalling
1301   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1302     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1303                 {S32, FlatPtr}, {S64, FlatPtr}})
1304     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1305                {S32, RegionPtr}, {S64, RegionPtr}});
1306   // TODO: Pointer types, any 32-bit or 64-bit vector
1307 
1308   // Condition should be s32 for scalar, s1 for vector.
1309   getActionDefinitionsBuilder(G_SELECT)
1310       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1311                                  LocalPtr, FlatPtr, PrivatePtr,
1312                                  LLT::fixed_vector(2, LocalPtr),
1313                                  LLT::fixed_vector(2, PrivatePtr)},
1314                                 {S1, S32})
1315       .clampScalar(0, S16, S64)
1316       .scalarize(1)
1317       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1318       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1319       .clampMaxNumElements(0, S32, 2)
1320       .clampMaxNumElements(0, LocalPtr, 2)
1321       .clampMaxNumElements(0, PrivatePtr, 2)
1322       .scalarize(0)
1323       .widenScalarToNextPow2(0)
1324       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1325 
1326   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1327   // be more flexible with the shift amount type.
1328   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1329     .legalFor({{S32, S32}, {S64, S32}});
1330   if (ST.has16BitInsts()) {
1331     if (ST.hasVOP3PInsts()) {
1332       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1333             .clampMaxNumElements(0, S16, 2);
1334     } else
1335       Shifts.legalFor({{S16, S16}});
1336 
1337     // TODO: Support 16-bit shift amounts for all types
1338     Shifts.widenScalarIf(
1339       [=](const LegalityQuery &Query) {
1340         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1341         // 32-bit amount.
1342         const LLT ValTy = Query.Types[0];
1343         const LLT AmountTy = Query.Types[1];
1344         return ValTy.getSizeInBits() <= 16 &&
1345                AmountTy.getSizeInBits() < 16;
1346       }, changeTo(1, S16));
1347     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1348     Shifts.clampScalar(1, S32, S32);
1349     Shifts.widenScalarToNextPow2(0, 16);
1350     Shifts.clampScalar(0, S16, S64);
1351 
1352     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1353       .minScalar(0, S16)
1354       .scalarize(0)
1355       .lower();
1356   } else {
1357     // Make sure we legalize the shift amount type first, as the general
1358     // expansion for the shifted type will produce much worse code if it hasn't
1359     // been truncated already.
1360     Shifts.clampScalar(1, S32, S32);
1361     Shifts.widenScalarToNextPow2(0, 32);
1362     Shifts.clampScalar(0, S32, S64);
1363 
1364     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1365       .minScalar(0, S32)
1366       .scalarize(0)
1367       .lower();
1368   }
1369   Shifts.scalarize(0);
1370 
1371   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1372     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1373     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1374     unsigned IdxTypeIdx = 2;
1375 
1376     getActionDefinitionsBuilder(Op)
1377       .customIf([=](const LegalityQuery &Query) {
1378           const LLT EltTy = Query.Types[EltTypeIdx];
1379           const LLT VecTy = Query.Types[VecTypeIdx];
1380           const LLT IdxTy = Query.Types[IdxTypeIdx];
1381           const unsigned EltSize = EltTy.getSizeInBits();
1382           return (EltSize == 32 || EltSize == 64) &&
1383                   VecTy.getSizeInBits() % 32 == 0 &&
1384                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1385                   IdxTy.getSizeInBits() == 32;
1386         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1389       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1390       .bitcastIf(
1391         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1392         [=](const LegalityQuery &Query) {
1393           // For > 64-bit element types, try to turn this into a 64-bit
1394           // element vector since we may be able to do better indexing
1395           // if this is scalar. If not, fall back to 32.
1396           const LLT EltTy = Query.Types[EltTypeIdx];
1397           const LLT VecTy = Query.Types[VecTypeIdx];
1398           const unsigned DstEltSize = EltTy.getSizeInBits();
1399           const unsigned VecSize = VecTy.getSizeInBits();
1400 
1401           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1402           return std::make_pair(
1403               VecTypeIdx,
1404               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1405         })
1406       .clampScalar(EltTypeIdx, S32, S64)
1407       .clampScalar(VecTypeIdx, S32, S64)
1408       .clampScalar(IdxTypeIdx, S32, S32)
1409       .clampMaxNumElements(VecTypeIdx, S32, 32)
1410       // TODO: Clamp elements for 64-bit vectors?
1411       // It should only be necessary with variable indexes.
1412       // As a last resort, lower to the stack
1413       .lower();
1414   }
1415 
1416   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1417     .unsupportedIf([=](const LegalityQuery &Query) {
1418         const LLT &EltTy = Query.Types[1].getElementType();
1419         return Query.Types[0] != EltTy;
1420       });
1421 
1422   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1423     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1424     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1425 
1426     // FIXME: Doesn't handle extract of illegal sizes.
1427     getActionDefinitionsBuilder(Op)
1428       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1429       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1431           // TODO: verify immediate offset here since lower only works with
1432           // whole elements.
1433           const LLT BigTy = Query.Types[BigTyIdx];
1434           return BigTy.isVector();
1435         })
1436       // FIXME: Multiples of 16 should not be legal.
1437       .legalIf([=](const LegalityQuery &Query) {
1438           const LLT BigTy = Query.Types[BigTyIdx];
1439           const LLT LitTy = Query.Types[LitTyIdx];
1440           return (BigTy.getSizeInBits() % 32 == 0) &&
1441                  (LitTy.getSizeInBits() % 16 == 0);
1442         })
1443       .widenScalarIf(
1444         [=](const LegalityQuery &Query) {
1445           const LLT BigTy = Query.Types[BigTyIdx];
1446           return (BigTy.getScalarSizeInBits() < 16);
1447         },
1448         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1449       .widenScalarIf(
1450         [=](const LegalityQuery &Query) {
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (LitTy.getScalarSizeInBits() < 16);
1453         },
1454         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1455       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1456       .widenScalarToNextPow2(BigTyIdx, 32);
1457 
1458   }
1459 
1460   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1461     .legalForCartesianProduct(AllS32Vectors, {S32})
1462     .legalForCartesianProduct(AllS64Vectors, {S64})
1463     .clampNumElements(0, V16S32, V32S32)
1464     .clampNumElements(0, V2S64, V16S64)
1465     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1466 
1467   if (ST.hasScalarPackInsts()) {
1468     BuildVector
1469       // FIXME: Should probably widen s1 vectors straight to s32
1470       .minScalarOrElt(0, S16)
1471       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1472       .minScalar(1, S32);
1473 
1474     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1475       .legalFor({V2S16, S32})
1476       .lower();
1477     BuildVector.minScalarOrElt(0, S32);
1478   } else {
1479     BuildVector.customFor({V2S16, S16});
1480     BuildVector.minScalarOrElt(0, S32);
1481 
1482     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1483       .customFor({V2S16, S32})
1484       .lower();
1485   }
1486 
1487   BuildVector.legalIf(isRegisterType(0));
1488 
1489   // FIXME: Clamp maximum size
1490   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1491     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1492     .clampMaxNumElements(0, S32, 32)
1493     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1494     .clampMaxNumElements(0, S16, 64);
1495 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1497   // pre-legalize.
1498   if (ST.hasVOP3PInsts()) {
1499     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1500       .customFor({V2S16, V2S16})
1501       .lower();
1502   } else
1503     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1504 
1505   // Merge/Unmerge
1506   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1507     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1508     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1509 
1510     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1511       const LLT Ty = Query.Types[TypeIdx];
1512       if (Ty.isVector()) {
1513         const LLT &EltTy = Ty.getElementType();
1514         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1515           return true;
1516         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1517           return true;
1518       }
1519       return false;
1520     };
1521 
1522     auto &Builder = getActionDefinitionsBuilder(Op)
1523       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1524       .lowerFor({{S16, V2S16}})
1525       .lowerIf([=](const LegalityQuery &Query) {
1526           const LLT BigTy = Query.Types[BigTyIdx];
1527           return BigTy.getSizeInBits() == 32;
1528         })
1529       // Try to widen to s16 first for small types.
1530       // TODO: Only do this on targets with legal s16 shifts
1531       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1532       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1533       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1534       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1535                            elementTypeIs(1, S16)),
1536                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1540       .clampScalar(LitTyIdx, S32, S512)
1541       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1542       // Break up vectors with weird elements into scalars
1543       .fewerElementsIf(
1544         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1545         scalarize(0))
1546       .fewerElementsIf(
1547         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1548         scalarize(1))
1549       .clampScalar(BigTyIdx, S32, MaxScalar);
1550 
1551     if (Op == G_MERGE_VALUES) {
1552       Builder.widenScalarIf(
1553         // TODO: Use 16-bit shifts if legal for 8-bit values?
1554         [=](const LegalityQuery &Query) {
1555           const LLT Ty = Query.Types[LitTyIdx];
1556           return Ty.getSizeInBits() < 32;
1557         },
1558         changeTo(LitTyIdx, S32));
1559     }
1560 
1561     Builder.widenScalarIf(
1562       [=](const LegalityQuery &Query) {
1563         const LLT Ty = Query.Types[BigTyIdx];
1564         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1565           Ty.getSizeInBits() % 16 != 0;
1566       },
1567       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
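        // E.g. s65 widens to s128, and s129 widens to s192 rather than s256.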
1570         const LLT &Ty = Query.Types[BigTyIdx];
1571         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1572         if (NewSizeInBits >= 256) {
1573           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1574           if (RoundedTo < NewSizeInBits)
1575             NewSizeInBits = RoundedTo;
1576         }
1577         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1578       })
1579       // Any vectors left are the wrong size. Scalarize them.
1580       .scalarize(0)
1581       .scalarize(1);
1582   }
1583 
1584   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1585   // RegBankSelect.
1586   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1587     .legalFor({{S32}, {S64}});
1588 
1589   if (ST.hasVOP3PInsts()) {
1590     SextInReg.lowerFor({{V2S16}})
1591       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1592       // get more vector shift opportunities, since we'll get those when
1593       // expanded.
1594       .clampMaxNumElementsStrict(0, S16, 2);
1595   } else if (ST.has16BitInsts()) {
1596     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1597   } else {
1598     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1600     SextInReg.lowerFor({{S32}, {S64}});
1601   }
1602 
1603   SextInReg
1604     .scalarize(0)
1605     .clampScalar(0, S32, S64)
1606     .lower();
1607 
1608   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1609     .scalarize(0)
1610     .lower();
1611 
  // TODO: Only try to form v2s16 with legal packed instructions.
1613   getActionDefinitionsBuilder(G_FSHR)
1614     .legalFor({{S32, S32}})
1615     .lowerFor({{V2S16, V2S16}})
1616     .clampMaxNumElementsStrict(0, S16, 2)
1617     .scalarize(0)
1618     .lower();
1619 
1620   if (ST.hasVOP3PInsts()) {
1621     getActionDefinitionsBuilder(G_FSHL)
1622       .lowerFor({{V2S16, V2S16}})
1623       .clampMaxNumElementsStrict(0, S16, 2)
1624       .scalarize(0)
1625       .lower();
1626   } else {
1627     getActionDefinitionsBuilder(G_FSHL)
1628       .scalarize(0)
1629       .lower();
1630   }
1631 
1632   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1633     .legalFor({S64});
1634 
1635   getActionDefinitionsBuilder(G_FENCE)
1636     .alwaysLegal();
1637 
1638   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1639       .scalarize(0)
1640       .minScalar(0, S32)
1641       .lower();
1642 
1643   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1644       .legalFor({{S32, S32}, {S64, S32}})
1645       .clampScalar(1, S32, S32)
1646       .clampScalar(0, S32, S64)
1647       .widenScalarToNextPow2(0)
1648       .scalarize(0);
1649 
1650   getActionDefinitionsBuilder({
1651       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1652       G_FCOPYSIGN,
1653 
1654       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1655       G_ATOMICRMW_NAND,
1656       G_ATOMICRMW_FSUB,
1657       G_READ_REGISTER,
1658       G_WRITE_REGISTER,
1659 
1660       G_SADDO, G_SSUBO,
1661 
      // TODO: Implement
1663       G_FMINIMUM, G_FMAXIMUM}).lower();
1664 
1665   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1666       .lower();
1667 
1668   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1669         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1670         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1671     .unsupported();
1672 
1673   getLegacyLegalizerInfo().computeTables();
1674   verify(*ST.getInstrInfo());
1675 }
1676 
1677 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1678                                          MachineInstr &MI) const {
1679   MachineIRBuilder &B = Helper.MIRBuilder;
1680   MachineRegisterInfo &MRI = *B.getMRI();
1681 
1682   switch (MI.getOpcode()) {
1683   case TargetOpcode::G_ADDRSPACE_CAST:
1684     return legalizeAddrSpaceCast(MI, MRI, B);
1685   case TargetOpcode::G_FRINT:
1686     return legalizeFrint(MI, MRI, B);
1687   case TargetOpcode::G_FCEIL:
1688     return legalizeFceil(MI, MRI, B);
1689   case TargetOpcode::G_FREM:
1690     return legalizeFrem(MI, MRI, B);
1691   case TargetOpcode::G_INTRINSIC_TRUNC:
1692     return legalizeIntrinsicTrunc(MI, MRI, B);
1693   case TargetOpcode::G_SITOFP:
1694     return legalizeITOFP(MI, MRI, B, true);
1695   case TargetOpcode::G_UITOFP:
1696     return legalizeITOFP(MI, MRI, B, false);
1697   case TargetOpcode::G_FPTOSI:
1698     return legalizeFPTOI(MI, MRI, B, true);
1699   case TargetOpcode::G_FPTOUI:
1700     return legalizeFPTOI(MI, MRI, B, false);
1701   case TargetOpcode::G_FMINNUM:
1702   case TargetOpcode::G_FMAXNUM:
1703   case TargetOpcode::G_FMINNUM_IEEE:
1704   case TargetOpcode::G_FMAXNUM_IEEE:
1705     return legalizeMinNumMaxNum(Helper, MI);
1706   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1707     return legalizeExtractVectorElt(MI, MRI, B);
1708   case TargetOpcode::G_INSERT_VECTOR_ELT:
1709     return legalizeInsertVectorElt(MI, MRI, B);
1710   case TargetOpcode::G_SHUFFLE_VECTOR:
1711     return legalizeShuffleVector(MI, MRI, B);
1712   case TargetOpcode::G_FSIN:
1713   case TargetOpcode::G_FCOS:
1714     return legalizeSinCos(MI, MRI, B);
1715   case TargetOpcode::G_GLOBAL_VALUE:
1716     return legalizeGlobalValue(MI, MRI, B);
1717   case TargetOpcode::G_LOAD:
1718   case TargetOpcode::G_SEXTLOAD:
1719   case TargetOpcode::G_ZEXTLOAD:
1720     return legalizeLoad(Helper, MI);
1721   case TargetOpcode::G_FMAD:
1722     return legalizeFMad(MI, MRI, B);
1723   case TargetOpcode::G_FDIV:
1724     return legalizeFDIV(MI, MRI, B);
1725   case TargetOpcode::G_UDIV:
1726   case TargetOpcode::G_UREM:
1727   case TargetOpcode::G_UDIVREM:
1728     return legalizeUnsignedDIV_REM(MI, MRI, B);
1729   case TargetOpcode::G_SDIV:
1730   case TargetOpcode::G_SREM:
1731   case TargetOpcode::G_SDIVREM:
1732     return legalizeSignedDIV_REM(MI, MRI, B);
1733   case TargetOpcode::G_ATOMIC_CMPXCHG:
1734     return legalizeAtomicCmpXChg(MI, MRI, B);
1735   case TargetOpcode::G_FLOG:
1736     return legalizeFlog(MI, B, numbers::ln2f);
1737   case TargetOpcode::G_FLOG10:
1738     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1739   case TargetOpcode::G_FEXP:
1740     return legalizeFExp(MI, B);
1741   case TargetOpcode::G_FPOW:
1742     return legalizeFPow(MI, B);
1743   case TargetOpcode::G_FFLOOR:
1744     return legalizeFFloor(MI, MRI, B);
1745   case TargetOpcode::G_BUILD_VECTOR:
1746     return legalizeBuildVector(MI, MRI, B);
1747   case TargetOpcode::G_CTLZ:
1748   case TargetOpcode::G_CTTZ:
1749     return legalizeCTLZ_CTTZ(MI, MRI, B);
1750   default:
1751     return false;
1752   }
1753 
1754   llvm_unreachable("expected switch to return");
1755 }
1756 
1757 Register AMDGPULegalizerInfo::getSegmentAperture(
1758   unsigned AS,
1759   MachineRegisterInfo &MRI,
1760   MachineIRBuilder &B) const {
1761   MachineFunction &MF = B.getMF();
1762   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1763   const LLT S32 = LLT::scalar(32);
1764 
1765   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1766 
1767   if (ST.hasApertureRegs()) {
1768     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1769     // getreg.
1770     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1771         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1772         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1773     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1774         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1775         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1776     unsigned Encoding =
1777         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1778         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1779         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1780 
1781     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1782 
1783     B.buildInstr(AMDGPU::S_GETREG_B32)
1784       .addDef(GetReg)
1785       .addImm(Encoding);
1786     MRI.setType(GetReg, S32);
1787 
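    // The field read by s_getreg_b32 holds the upper bits of the 32-bit
    // aperture base, so shift it back into position (WidthM1 + 1 is the field
    // width in bits).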
1788     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1789     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1790   }
1791 
1792   Register QueuePtr = MRI.createGenericVirtualRegister(
1793     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1794 
1795   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1796     return Register();
1797 
1798   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1799   // private_segment_aperture_base_hi.
1800   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1801 
1802   // TODO: can we be smarter about machine pointer info?
1803   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1804   MachineMemOperand *MMO = MF.getMachineMemOperand(
1805       PtrInfo,
1806       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1807           MachineMemOperand::MOInvariant,
1808       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1809 
1810   Register LoadAddr;
1811 
1812   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1813   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1814 }
1815 
1816 /// Return true if the value is a known valid address, such that a null check is
1817 /// not necessary.
1818 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1819                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1820   MachineInstr *Def = MRI.getVRegDef(Val);
1821   switch (Def->getOpcode()) {
1822   case AMDGPU::G_FRAME_INDEX:
1823   case AMDGPU::G_GLOBAL_VALUE:
1824   case AMDGPU::G_BLOCK_ADDR:
1825     return true;
1826   case AMDGPU::G_CONSTANT: {
1827     const ConstantInt *CI = Def->getOperand(1).getCImm();
1828     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1829   }
1830   default:
1831     return false;
1832   }
1833 
1834   return false;
1835 }
1836 
1837 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1838   MachineInstr &MI, MachineRegisterInfo &MRI,
1839   MachineIRBuilder &B) const {
1840   MachineFunction &MF = B.getMF();
1841 
1842   const LLT S32 = LLT::scalar(32);
1843   Register Dst = MI.getOperand(0).getReg();
1844   Register Src = MI.getOperand(1).getReg();
1845 
1846   LLT DstTy = MRI.getType(Dst);
1847   LLT SrcTy = MRI.getType(Src);
1848   unsigned DestAS = DstTy.getAddressSpace();
1849   unsigned SrcAS = SrcTy.getAddressSpace();
1850 
1851   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1852   // vector element.
1853   assert(!DstTy.isVector());
1854 
1855   const AMDGPUTargetMachine &TM
1856     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1857 
1858   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1859     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1860     return true;
1861   }
1862 
1863   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864     // Truncate.
1865     B.buildExtract(Dst, Src, 0);
1866     MI.eraseFromParent();
1867     return true;
1868   }
1869 
1870   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1871     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1872     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1873 
    // FIXME: This is a bit ugly due to merging 2 pointers into another
    // pointer type. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1877     auto HighAddr = B.buildConstant(
1878       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1879     B.buildMerge(Dst, {Src, HighAddr});
1880     MI.eraseFromParent();
1881     return true;
1882   }
1883 
1884   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1885     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1886            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1887 
1888     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1889       // Extract low 32-bits of the pointer.
1890       B.buildExtract(Dst, Src, 0);
1891       MI.eraseFromParent();
1892       return true;
1893     }
1894 
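    // A null flat pointer must map to the segment's null value, so compare the
    // source against flat null and select the segment null constant when it
    // matches; otherwise the segment pointer is just the low 32 bits.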
1895     unsigned NullVal = TM.getNullPointerValue(DestAS);
1896 
1897     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1898     auto FlatNull = B.buildConstant(SrcTy, 0);
1899 
1900     // Extract low 32-bits of the pointer.
1901     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1902 
1903     auto CmpRes =
1904         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1905     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1906 
1907     MI.eraseFromParent();
1908     return true;
1909   }
1910 
1911   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1912     return false;
1913 
1914   if (!ST.hasFlatAddressSpace())
1915     return false;
1916 
1917   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1918   if (!ApertureReg.isValid())
1919     return false;
1920 
1921   // Coerce the type of the low half of the result so we can use merge_values.
1922   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1923 
1924   // TODO: Should we allow mismatched types but matching sizes in merges to
1925   // avoid the ptrtoint?
1926   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1927 
1928   if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1929     B.buildCopy(Dst, BuildPtr);
1930     MI.eraseFromParent();
1931     return true;
1932   }
1933 
1934   auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1935   auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1936 
1937   auto CmpRes =
1938       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1939 
1940   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1941 
1942   MI.eraseFromParent();
1943   return true;
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeFrint(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949   Register Src = MI.getOperand(1).getReg();
1950   LLT Ty = MRI.getType(Src);
1951   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1952 
1953   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1954   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
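  // C1 is 2^52: adding and subtracting copysign(2^52, x) rounds x to an
  // integer because the f64 mantissa cannot hold fractional bits at that
  // magnitude. C2 is just below 2^52; inputs with |x| > C2 are already
  // integral and are returned unchanged.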
1955 
1956   auto C1 = B.buildFConstant(Ty, C1Val);
1957   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1958 
1959   // TODO: Should this propagate fast-math-flags?
1960   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1961   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1962 
1963   auto C2 = B.buildFConstant(Ty, C2Val);
1964   auto Fabs = B.buildFAbs(Ty, Src);
1965 
1966   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1967   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1968   MI.eraseFromParent();
1969   return true;
1970 }
1971 
1972 bool AMDGPULegalizerInfo::legalizeFceil(
1973   MachineInstr &MI, MachineRegisterInfo &MRI,
1974   MachineIRBuilder &B) const {
1975 
1976   const LLT S1 = LLT::scalar(1);
1977   const LLT S64 = LLT::scalar(64);
1978 
1979   Register Src = MI.getOperand(1).getReg();
1980   assert(MRI.getType(Src) == S64);
1981 
1982   // result = trunc(src)
1983   // if (src > 0.0 && src != result)
1984   //   result += 1.0
1985 
1986   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1987 
1988   const auto Zero = B.buildFConstant(S64, 0.0);
1989   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1991   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1993   auto Add = B.buildSelect(S64, And, One, Zero);
1994 
1995   // TODO: Should this propagate fast-math-flags?
1996   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1997   MI.eraseFromParent();
1998   return true;
1999 }
2000 
2001 bool AMDGPULegalizerInfo::legalizeFrem(
2002   MachineInstr &MI, MachineRegisterInfo &MRI,
2003   MachineIRBuilder &B) const {
2004     Register DstReg = MI.getOperand(0).getReg();
2005     Register Src0Reg = MI.getOperand(1).getReg();
2006     Register Src1Reg = MI.getOperand(2).getReg();
2007     auto Flags = MI.getFlags();
2008     LLT Ty = MRI.getType(DstReg);
2009 
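    // rem = x - trunc(x / y) * y, computed as fma(-trunc(x / y), y, x).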
2010     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2011     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2012     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2013     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2014     MI.eraseFromParent();
2015     return true;
2016 }
2017 
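// Extract the unbiased exponent from the high 32 bits of an f64: take the 11
// exponent bits starting at bit 20 (52 - 32) with ubfe, then subtract the 1023
// bias.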
2018 static MachineInstrBuilder extractF64Exponent(Register Hi,
2019                                               MachineIRBuilder &B) {
2020   const unsigned FractBits = 52;
2021   const unsigned ExpBits = 11;
2022   LLT S32 = LLT::scalar(32);
2023 
2024   auto Const0 = B.buildConstant(S32, FractBits - 32);
2025   auto Const1 = B.buildConstant(S32, ExpBits);
2026 
2027   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2028     .addUse(Hi)
2029     .addUse(Const0.getReg(0))
2030     .addUse(Const1.getReg(0));
2031 
2032   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2033 }
2034 
2035 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2036   MachineInstr &MI, MachineRegisterInfo &MRI,
2037   MachineIRBuilder &B) const {
2038   const LLT S1 = LLT::scalar(1);
2039   const LLT S32 = LLT::scalar(32);
2040   const LLT S64 = LLT::scalar(64);
2041 
2042   Register Src = MI.getOperand(1).getReg();
2043   assert(MRI.getType(Src) == S64);
2044 
2045   // TODO: Should this use extract since the low half is unused?
2046   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2047   Register Hi = Unmerge.getReg(1);
2048 
2049   // Extract the upper half, since this is where we will find the sign and
2050   // exponent.
2051   auto Exp = extractF64Exponent(Hi, B);
2052 
2053   const unsigned FractBits = 52;
2054 
2055   // Extract the sign bit.
2056   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2057   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2058 
2059   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2060 
2061   const auto Zero32 = B.buildConstant(S32, 0);
2062 
2063   // Extend back to 64-bits.
2064   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2065 
2066   auto Shr = B.buildAShr(S64, FractMask, Exp);
2067   auto Not = B.buildNot(S64, Shr);
2068   auto Tmp0 = B.buildAnd(S64, Src, Not);
2069   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2070 
2071   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2072   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2073 
2074   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2075   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeITOFP(
2081   MachineInstr &MI, MachineRegisterInfo &MRI,
2082   MachineIRBuilder &B, bool Signed) const {
2083 
2084   Register Dst = MI.getOperand(0).getReg();
2085   Register Src = MI.getOperand(1).getReg();
2086 
2087   const LLT S64 = LLT::scalar(64);
2088   const LLT S32 = LLT::scalar(32);
2089 
2090   assert(MRI.getType(Src) == S64);
2091 
2092   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2093   auto ThirtyTwo = B.buildConstant(S32, 32);
2094 
2095   if (MRI.getType(Dst) == S64) {
2096     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2097                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2098 
2099     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2100     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2101                      .addUse(CvtHi.getReg(0))
2102                      .addUse(ThirtyTwo.getReg(0));
2103 
2104     // TODO: Should this propagate fast-math-flags?
2105     B.buildFAdd(Dst, LdExp, CvtLo);
2106     MI.eraseFromParent();
2107     return true;
2108   }
2109 
2110   assert(MRI.getType(Dst) == S32);
2111 
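  // For an s32 result, shift the 64-bit input left so its significant bits
  // land in the high word, OR a sticky bit in from the low word so rounding
  // sees any discarded bits, convert the high word, and rescale the result
  // with ldexp(value, 32 - shift).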
2112   auto One = B.buildConstant(S32, 1);
2113 
2114   MachineInstrBuilder ShAmt;
2115   if (Signed) {
2116     auto ThirtyOne = B.buildConstant(S32, 31);
2117     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2118     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2119     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2120     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2121                                /*HasSideEffects=*/false)
2122                   .addUse(Unmerge.getReg(1));
2123     auto LS2 = B.buildSub(S32, LS, One);
2124     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2125   } else
2126     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2127   auto Norm = B.buildShl(S64, Src, ShAmt);
2128   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2129   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2130   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2131   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2132   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2133   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2134                    /*HasSideEffects=*/false)
2135       .addUse(FVal.getReg(0))
2136       .addUse(Scale.getReg(0));
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 // TODO: Copied from DAG implementation. Verify logic and document how this
2142 // actually works.
2143 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2144                                         MachineRegisterInfo &MRI,
2145                                         MachineIRBuilder &B,
2146                                         bool Signed) const {
2147 
2148   Register Dst = MI.getOperand(0).getReg();
2149   Register Src = MI.getOperand(1).getReg();
2150 
2151   const LLT S64 = LLT::scalar(64);
2152   const LLT S32 = LLT::scalar(32);
2153 
2154   const LLT SrcLT = MRI.getType(Src);
2155   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2156 
2157   unsigned Flags = MI.getFlags();
2158 
2159   // The basic idea of converting a floating point number into a pair of 32-bit
2160   // integers is illustrated as follows:
2161   //
2162   //     tf := trunc(val);
2163   //    hif := floor(tf * 2^-32);
2164   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2165   //     hi := fptoi(hif);
2166   //     lo := fptoi(lof);
2167   //
2168   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2169   MachineInstrBuilder Sign;
2170   if (Signed && SrcLT == S32) {
2171     // However, a 32-bit floating point number has only 23 bits mantissa and
2172     // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
2174     // value after truncating and flip the result back based on the original
2175     // signedness.
2176     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2177     Trunc = B.buildFAbs(S32, Trunc, Flags);
2178   }
2179   MachineInstrBuilder K0, K1;
2180   if (SrcLT == S64) {
2181     K0 = B.buildFConstant(S64,
2182                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2183     K1 = B.buildFConstant(S64,
2184                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2185   } else {
2186     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2187     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2188   }
2189 
2190   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2191   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2192   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2193 
2194   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2195                                      : B.buildFPTOUI(S32, FloorMul);
2196   auto Lo = B.buildFPTOUI(S32, Fma);
2197 
2198   if (Signed && SrcLT == S32) {
2199     // Flip the result based on the signedness, which is either all 0s or 1s.
2200     Sign = B.buildMerge(S64, {Sign, Sign});
2201     // r := xor({lo, hi}, sign) - sign;
2202     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2203   } else
2204     B.buildMerge(Dst, {Lo, Hi});
2205   MI.eraseFromParent();
2206 
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2211                                                MachineInstr &MI) const {
2212   MachineFunction &MF = Helper.MIRBuilder.getMF();
2213   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2214 
2215   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2216                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2217 
2218   // With ieee_mode disabled, the instructions have the correct behavior
2219   // already for G_FMINNUM/G_FMAXNUM
2220   if (!MFI->getMode().IEEE)
2221     return !IsIEEEOp;
2222 
2223   if (IsIEEEOp)
2224     return true;
2225 
2226   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2227 }
2228 
2229 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2230   MachineInstr &MI, MachineRegisterInfo &MRI,
2231   MachineIRBuilder &B) const {
2232   // TODO: Should move some of this into LegalizerHelper.
2233 
2234   // TODO: Promote dynamic indexing of s16 to s32
2235 
2236   // FIXME: Artifact combiner probably should have replaced the truncated
2237   // constant before this, so we shouldn't need
2238   // getIConstantVRegValWithLookThrough.
2239   Optional<ValueAndVReg> MaybeIdxVal =
2240       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2241   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2242     return true;
2243   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2244 
2245   Register Dst = MI.getOperand(0).getReg();
2246   Register Vec = MI.getOperand(1).getReg();
2247 
2248   LLT VecTy = MRI.getType(Vec);
2249   LLT EltTy = VecTy.getElementType();
2250   assert(EltTy == MRI.getType(Dst));
2251 
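  // A known in-bounds index is resolved by unmerging the vector and copying
  // out the selected element; an out-of-bounds constant index yields undef.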
2252   if (IdxVal < VecTy.getNumElements()) {
2253     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2254     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2255   } else {
2256     B.buildUndef(Dst);
2257   }
2258 
2259   MI.eraseFromParent();
2260   return true;
2261 }
2262 
2263 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2264   MachineInstr &MI, MachineRegisterInfo &MRI,
2265   MachineIRBuilder &B) const {
2266   // TODO: Should move some of this into LegalizerHelper.
2267 
2268   // TODO: Promote dynamic indexing of s16 to s32
2269 
2270   // FIXME: Artifact combiner probably should have replaced the truncated
2271   // constant before this, so we shouldn't need
2272   // getIConstantVRegValWithLookThrough.
2273   Optional<ValueAndVReg> MaybeIdxVal =
2274       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2275   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2276     return true;
2277 
2278   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2279   Register Dst = MI.getOperand(0).getReg();
2280   Register Vec = MI.getOperand(1).getReg();
2281   Register Ins = MI.getOperand(2).getReg();
2282 
2283   LLT VecTy = MRI.getType(Vec);
2284   LLT EltTy = VecTy.getElementType();
2285   assert(EltTy == MRI.getType(Ins));
2286   (void)Ins;
2287 
2288   unsigned NumElts = VecTy.getNumElements();
2289   if (IdxVal < NumElts) {
2290     SmallVector<Register, 8> SrcRegs;
2291     for (unsigned i = 0; i < NumElts; ++i)
2292       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2293     B.buildUnmerge(SrcRegs, Vec);
2294 
2295     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2296     B.buildMerge(Dst, SrcRegs);
2297   } else {
2298     B.buildUndef(Dst);
2299   }
2300 
2301   MI.eraseFromParent();
2302   return true;
2303 }
2304 
2305 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2306   MachineInstr &MI, MachineRegisterInfo &MRI,
2307   MachineIRBuilder &B) const {
2308   const LLT V2S16 = LLT::fixed_vector(2, 16);
2309 
2310   Register Dst = MI.getOperand(0).getReg();
2311   Register Src0 = MI.getOperand(1).getReg();
2312   LLT DstTy = MRI.getType(Dst);
2313   LLT SrcTy = MRI.getType(Src0);
2314 
2315   if (SrcTy == V2S16 && DstTy == V2S16 &&
2316       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2317     return true;
2318 
2319   MachineIRBuilder HelperBuilder(MI);
2320   GISelObserverWrapper DummyObserver;
2321   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2322   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2323 }
2324 
2325 bool AMDGPULegalizerInfo::legalizeSinCos(
2326   MachineInstr &MI, MachineRegisterInfo &MRI,
2327   MachineIRBuilder &B) const {
2328 
2329   Register DstReg = MI.getOperand(0).getReg();
2330   Register SrcReg = MI.getOperand(1).getReg();
2331   LLT Ty = MRI.getType(DstReg);
2332   unsigned Flags = MI.getFlags();
2333 
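  // The hardware sin/cos expect the operand pre-scaled by 1/(2*pi); subtargets
  // with a reduced trig input range also wrap the scaled value into [0, 1)
  // with amdgcn.fract.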
2334   Register TrigVal;
2335   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2336   if (ST.hasTrigReducedRange()) {
2337     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2338     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2339       .addUse(MulVal.getReg(0))
2340       .setMIFlags(Flags).getReg(0);
2341   } else
2342     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2343 
2344   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2345     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2346   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2347     .addUse(TrigVal)
2348     .setMIFlags(Flags);
2349   MI.eraseFromParent();
2350   return true;
2351 }
2352 
2353 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2354                                                   MachineIRBuilder &B,
2355                                                   const GlobalValue *GV,
2356                                                   int64_t Offset,
2357                                                   unsigned GAFlags) const {
2358   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2359   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2360   // to the following code sequence:
2361   //
2362   // For constant address space:
2363   //   s_getpc_b64 s[0:1]
2364   //   s_add_u32 s0, s0, $symbol
2365   //   s_addc_u32 s1, s1, 0
2366   //
2367   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2368   //   a fixup or relocation is emitted to replace $symbol with a literal
2369   //   constant, which is a pc-relative offset from the encoding of the $symbol
2370   //   operand to the global variable.
2371   //
2372   // For global address space:
2373   //   s_getpc_b64 s[0:1]
2374   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2375   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2376   //
2377   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2378   //   fixups or relocations are emitted to replace $symbol@*@lo and
2379   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2380   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2381   //   operand to the global variable.
2382   //
2383   // What we want here is an offset from the value returned by s_getpc
2384   // (which is the address of the s_add_u32 instruction) to the global
2385   // variable, but since the encoding of $symbol starts 4 bytes after the start
2386   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2387   // small. This requires us to add 4 to the global variable offset in order to
2388   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2389   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2390   // instruction.
2391 
2392   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2393 
2394   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2395     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2396 
2397   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2398     .addDef(PCReg);
2399 
2400   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2401   if (GAFlags == SIInstrInfo::MO_NONE)
2402     MIB.addImm(0);
2403   else
2404     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2405 
2406   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2407 
2408   if (PtrTy.getSizeInBits() == 32)
2409     B.buildExtract(DstReg, PCReg, 0);
2410   return true;
}
2412 
2413 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2414   MachineInstr &MI, MachineRegisterInfo &MRI,
2415   MachineIRBuilder &B) const {
2416   Register DstReg = MI.getOperand(0).getReg();
2417   LLT Ty = MRI.getType(DstReg);
2418   unsigned AS = Ty.getAddressSpace();
2419 
2420   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2421   MachineFunction &MF = B.getMF();
2422   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2423 
2424   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2425     if (!MFI->isModuleEntryFunction() &&
2426         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2427       const Function &Fn = MF.getFunction();
2428       DiagnosticInfoUnsupported BadLDSDecl(
2429         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2430         DS_Warning);
2431       Fn.getContext().diagnose(BadLDSDecl);
2432 
2433       // We currently don't have a way to correctly allocate LDS objects that
2434       // aren't directly associated with a kernel. We do force inlining of
2435       // functions that use local objects. However, if these dead functions are
2436       // not eliminated, we don't want a compile time error. Just emit a warning
2437       // and a trap, since there should be no callable path here.
2438       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2439       B.buildUndef(DstReg);
2440       MI.eraseFromParent();
2441       return true;
2442     }
2443 
2444     // TODO: We could emit code to handle the initialization somewhere.
2445     // We ignore the initializer for now and legalize it to allow selection.
    // The initializer is diagnosed as an error during assembly emission
    // anyway.
2447     const SITargetLowering *TLI = ST.getTargetLowering();
2448     if (!TLI->shouldUseLDSConstAddress(GV)) {
2449       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2451     }
2452 
2453     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2454       Type *Ty = GV->getValueType();
2455       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2456       // zero-sized type in other languages to declare the dynamic shared
      // memory whose size is not known at compile time. They will be
      // allocated by the runtime and placed directly after the statically
      // allocated ones. They all share the same offset.
2460       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2461         // Adjust alignment for that dynamic shared memory array.
2462         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2463         LLT S32 = LLT::scalar(32);
2464         auto Sz =
2465             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2466         B.buildIntToPtr(DstReg, Sz);
2467         MI.eraseFromParent();
2468         return true;
2469       }
2470     }
2471 
2472     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2473                                                    *cast<GlobalVariable>(GV)));
2474     MI.eraseFromParent();
2475     return true;
2476   }
2477 
2478   const SITargetLowering *TLI = ST.getTargetLowering();
2479 
2480   if (TLI->shouldEmitFixup(GV)) {
2481     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2482     MI.eraseFromParent();
2483     return true;
2484   }
2485 
2486   if (TLI->shouldEmitPCReloc(GV)) {
2487     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2488     MI.eraseFromParent();
2489     return true;
2490   }
2491 
2492   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2493   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2494 
2495   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2496   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2497       MachinePointerInfo::getGOT(MF),
2498       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2499           MachineMemOperand::MOInvariant,
2500       LoadTy, Align(8));
2501 
2502   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2503 
2504   if (Ty.getSizeInBits() == 32) {
2505     // Truncate if this is a 32-bit constant address.
2506     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2507     B.buildExtract(DstReg, Load, 0);
2508   } else
2509     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2510 
2511   MI.eraseFromParent();
2512   return true;
2513 }
2514 
2515 static LLT widenToNextPowerOf2(LLT Ty) {
2516   if (Ty.isVector())
2517     return Ty.changeElementCount(
2518         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2519   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2520 }
2521 
2522 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2523                                        MachineInstr &MI) const {
2524   MachineIRBuilder &B = Helper.MIRBuilder;
2525   MachineRegisterInfo &MRI = *B.getMRI();
2526   GISelChangeObserver &Observer = Helper.Observer;
2527 
2528   Register PtrReg = MI.getOperand(1).getReg();
2529   LLT PtrTy = MRI.getType(PtrReg);
2530   unsigned AddrSpace = PtrTy.getAddressSpace();
2531 
2532   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2533     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2534     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2535     Observer.changingInstr(MI);
2536     MI.getOperand(1).setReg(Cast.getReg(0));
2537     Observer.changedInstr(MI);
2538     return true;
2539   }
2540 
2541   if (MI.getOpcode() != AMDGPU::G_LOAD)
2542     return false;
2543 
2544   Register ValReg = MI.getOperand(0).getReg();
2545   LLT ValTy = MRI.getType(ValReg);
2546 
2547   MachineMemOperand *MMO = *MI.memoperands_begin();
2548   const unsigned ValSize = ValTy.getSizeInBits();
2549   const LLT MemTy = MMO->getMemoryType();
2550   const Align MemAlign = MMO->getAlign();
2551   const unsigned MemSize = MemTy.getSizeInBits();
2552   const uint64_t AlignInBits = 8 * MemAlign.value();
2553 
2554   // Widen non-power-of-2 loads to the alignment if needed
2555   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2556     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2557 
2558     // This was already the correct extending load result type, so just adjust
2559     // the memory type.
2560     if (WideMemSize == ValSize) {
2561       MachineFunction &MF = B.getMF();
2562 
2563       MachineMemOperand *WideMMO =
2564           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2565       Observer.changingInstr(MI);
2566       MI.setMemRefs(MF, {WideMMO});
2567       Observer.changedInstr(MI);
2568       return true;
2569     }
2570 
    // Don't bother handling an edge case that should probably never be
    // produced.
2572     if (ValSize > WideMemSize)
2573       return false;
2574 
2575     LLT WideTy = widenToNextPowerOf2(ValTy);
2576 
2577     Register WideLoad;
2578     if (!WideTy.isVector()) {
2579       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2580       B.buildTrunc(ValReg, WideLoad).getReg(0);
2581     } else {
2582       // Extract the subvector.
2583 
2584       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2586         // (e.g. <3 x s32> -> <4 x s32>)
2587         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2588         B.buildExtract(ValReg, WideLoad, 0);
2589       } else {
2590         // For cases where the widened type isn't a nice register value, unmerge
2591         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2592         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2593         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2594       }
2595     }
2596 
2597     MI.eraseFromParent();
2598     return true;
2599   }
2600 
2601   return false;
2602 }
2603 
2604 bool AMDGPULegalizerInfo::legalizeFMad(
2605   MachineInstr &MI, MachineRegisterInfo &MRI,
2606   MachineIRBuilder &B) const {
2607   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2608   assert(Ty.isScalar());
2609 
2610   MachineFunction &MF = B.getMF();
2611   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2612 
2613   // TODO: Always legal with future ftz flag.
2614   // FIXME: Do we need just output?
2615   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2616     return true;
2617   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2618     return true;
2619 
2620   MachineIRBuilder HelperBuilder(MI);
2621   GISelObserverWrapper DummyObserver;
2622   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2623   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2627   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2628   Register DstReg = MI.getOperand(0).getReg();
2629   Register PtrReg = MI.getOperand(1).getReg();
2630   Register CmpVal = MI.getOperand(2).getReg();
2631   Register NewVal = MI.getOperand(3).getReg();
2632 
2633   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2634          "this should not have been custom lowered");
2635 
2636   LLT ValTy = MRI.getType(CmpVal);
2637   LLT VecTy = LLT::fixed_vector(2, ValTy);
2638 
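  // The target G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the
  // compare value packed into a two-element vector, with the new value first.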
2639   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2640 
2641   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2642     .addDef(DstReg)
2643     .addUse(PtrReg)
2644     .addUse(PackedVal)
2645     .setMemRefs(MI.memoperands());
2646 
2647   MI.eraseFromParent();
2648   return true;
2649 }
2650 
2651 bool AMDGPULegalizerInfo::legalizeFlog(
2652   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2653   Register Dst = MI.getOperand(0).getReg();
2654   Register Src = MI.getOperand(1).getReg();
2655   LLT Ty = B.getMRI()->getType(Dst);
2656   unsigned Flags = MI.getFlags();
2657 
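  // log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).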
2658   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2659   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2660 
2661   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2662   MI.eraseFromParent();
2663   return true;
2664 }
2665 
2666 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2667                                        MachineIRBuilder &B) const {
2668   Register Dst = MI.getOperand(0).getReg();
2669   Register Src = MI.getOperand(1).getReg();
2670   unsigned Flags = MI.getFlags();
2671   LLT Ty = B.getMRI()->getType(Dst);
2672 
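  // exp(x) = exp2(x * log2(e)).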
2673   auto K = B.buildFConstant(Ty, numbers::log2e);
2674   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2675   B.buildFExp2(Dst, Mul, Flags);
2676   MI.eraseFromParent();
2677   return true;
2678 }
2679 
2680 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2681                                        MachineIRBuilder &B) const {
2682   Register Dst = MI.getOperand(0).getReg();
2683   Register Src0 = MI.getOperand(1).getReg();
2684   Register Src1 = MI.getOperand(2).getReg();
2685   unsigned Flags = MI.getFlags();
2686   LLT Ty = B.getMRI()->getType(Dst);
2687   const LLT S16 = LLT::scalar(16);
2688   const LLT S32 = LLT::scalar(32);
2689 
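  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
  // amdgcn.fmul.legacy, presumably for its 0 * anything == 0 behavior in edge
  // cases such as x == 0 with y == 0, where 0 * -inf would otherwise be NaN.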
2690   if (Ty == S32) {
2691     auto Log = B.buildFLog2(S32, Src0, Flags);
2692     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2693       .addUse(Log.getReg(0))
2694       .addUse(Src1)
2695       .setMIFlags(Flags);
2696     B.buildFExp2(Dst, Mul, Flags);
2697   } else if (Ty == S16) {
2698     // There's no f16 fmul_legacy, so we need to convert for it.
2699     auto Log = B.buildFLog2(S16, Src0, Flags);
2700     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2701     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2702     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2703       .addUse(Ext0.getReg(0))
2704       .addUse(Ext1.getReg(0))
2705       .setMIFlags(Flags);
2706 
2707     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2708   } else
2709     return false;
2710 
2711   MI.eraseFromParent();
2712   return true;
2713 }
2714 
2715 // Find a source register, ignoring any possible source modifiers.
2716 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2717   Register ModSrc = OrigSrc;
2718   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2719     ModSrc = SrcFNeg->getOperand(1).getReg();
2720     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2721       ModSrc = SrcFAbs->getOperand(1).getReg();
2722   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2723     ModSrc = SrcFAbs->getOperand(1).getReg();
2724   return ModSrc;
2725 }
2726 
2727 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2728                                          MachineRegisterInfo &MRI,
2729                                          MachineIRBuilder &B) const {
2730 
2731   const LLT S1 = LLT::scalar(1);
2732   const LLT S64 = LLT::scalar(64);
2733   Register Dst = MI.getOperand(0).getReg();
2734   Register OrigSrc = MI.getOperand(1).getReg();
2735   unsigned Flags = MI.getFlags();
2736   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2737          "this should not have been custom lowered");
2738 
2739   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2740   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2741   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2742   // V_FRACT bug is:
2743   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2744   //
2745   // Convert floor(x) to (x - fract(x))
2746 
2747   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2748     .addUse(OrigSrc)
2749     .setMIFlags(Flags);
2750 
2751   // Give source modifier matching some assistance before obscuring a foldable
2752   // pattern.
2753 
2754   // TODO: We can avoid the neg on the fract? The input sign to fract
2755   // shouldn't matter?
2756   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2757 
2758   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2759 
2760   Register Min = MRI.createGenericVirtualRegister(S64);
2761 
2762   // We don't need to concern ourselves with the snan handling difference, so
2763   // use the one which will directly select.
2764   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2765   if (MFI->getMode().IEEE)
2766     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2767   else
2768     B.buildFMinNum(Min, Fract, Const, Flags);
2769 
2770   Register CorrectedFract = Min;
2771   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2772     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2773     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2774   }
2775 
2776   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2777   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2778 
2779   MI.eraseFromParent();
2780   return true;
2781 }
2782 
2783 // Turn an illegal packed v2s16 build vector into bit operations.
2784 // TODO: This should probably be a bitcast action in LegalizerHelper.
2785 bool AMDGPULegalizerInfo::legalizeBuildVector(
2786   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2787   Register Dst = MI.getOperand(0).getReg();
2788   const LLT S32 = LLT::scalar(32);
2789   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2790 
2791   Register Src0 = MI.getOperand(1).getReg();
2792   Register Src1 = MI.getOperand(2).getReg();
2793   assert(MRI.getType(Src0) == LLT::scalar(16));
2794 
2795   auto Merge = B.buildMerge(S32, {Src0, Src1});
2796   B.buildBitcast(Dst, Merge);
2797 
2798   MI.eraseFromParent();
2799   return true;
2800 }
2801 
2802 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2803 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2804 // case with a single min instruction instead of a compare+select.
2805 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2806                                             MachineRegisterInfo &MRI,
2807                                             MachineIRBuilder &B) const {
2808   Register Dst = MI.getOperand(0).getReg();
2809   Register Src = MI.getOperand(1).getReg();
2810   LLT DstTy = MRI.getType(Dst);
2811   LLT SrcTy = MRI.getType(Src);
2812 
2813   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2814                         ? AMDGPU::G_AMDGPU_FFBH_U32
2815                         : AMDGPU::G_AMDGPU_FFBL_B32;
2816   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2817   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2818 
2819   MI.eraseFromParent();
2820   return true;
2821 }
2822 
2823 // Check that this is a G_XOR x, -1
2824 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2825   if (MI.getOpcode() != TargetOpcode::G_XOR)
2826     return false;
2827   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2828   return ConstVal && *ConstVal == -1;
2829 }
2830 
2831 // Return the use branch instruction, otherwise null if the usage is invalid.
2832 static MachineInstr *
2833 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2834                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2835   Register CondDef = MI.getOperand(0).getReg();
2836   if (!MRI.hasOneNonDBGUse(CondDef))
2837     return nullptr;
2838 
2839   MachineBasicBlock *Parent = MI.getParent();
2840   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2841 
2842   if (isNot(MRI, *UseMI)) {
2843     Register NegatedCond = UseMI->getOperand(0).getReg();
2844     if (!MRI.hasOneNonDBGUse(NegatedCond))
2845       return nullptr;
2846 
2847     // We're deleting the def of this value, so we need to remove it.
2848     eraseInstr(*UseMI, MRI);
2849 
2850     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2851     Negated = true;
2852   }
2853 
2854   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2855     return nullptr;
2856 
2857   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2858   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2859   if (Next == Parent->end()) {
2860     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2861     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2862       return nullptr;
2863     UncondBrTarget = &*NextMBB;
2864   } else {
2865     if (Next->getOpcode() != AMDGPU::G_BR)
2866       return nullptr;
2867     Br = &*Next;
2868     UncondBrTarget = Br->getOperand(0).getMBB();
2869   }
2870 
2871   return UseMI;
2872 }
2873 
2874 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2875                                          const ArgDescriptor *Arg,
2876                                          const TargetRegisterClass *ArgRC,
2877                                          LLT ArgTy) const {
2878   MCRegister SrcReg = Arg->getRegister();
2879   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2880   assert(DstReg.isVirtual() && "Virtual register expected");
2881 
2882   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2883                                              *ArgRC, B.getDebugLoc(), ArgTy);
2884   if (Arg->isMasked()) {
2885     // TODO: Should we try to emit this once in the entry block?
2886     const LLT S32 = LLT::scalar(32);
2887     const unsigned Mask = Arg->getMask();
2888     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2889 
2890     Register AndMaskSrc = LiveIn;
2891 
2892     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2893     // 0.
2894     if (Shift != 0) {
2895       auto ShiftAmt = B.buildConstant(S32, Shift);
2896       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2897     }
2898 
2899     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2900   } else {
2901     B.buildCopy(DstReg, LiveIn);
2902   }
2903 
2904   return true;
2905 }
2906 
2907 bool AMDGPULegalizerInfo::loadInputValue(
2908     Register DstReg, MachineIRBuilder &B,
2909     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2910   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2911   const ArgDescriptor *Arg;
2912   const TargetRegisterClass *ArgRC;
2913   LLT ArgTy;
2914   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2915 
2916   if (!Arg) {
2917     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2918       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2919       // case the pointer argument may be missing and we use null.
2920       B.buildConstant(DstReg, 0);
2921       return true;
2922     }
2923 
2924     // It's undefined behavior if a function marked with the amdgpu-no-*
2925     // attributes uses the corresponding intrinsic.
2926     B.buildUndef(DstReg);
2927     return true;
2928   }
2929 
2930   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2931     return false; // TODO: Handle these
2932   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2933 }
2934 
2935 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2936     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2937     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2938   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2939     return false;
2940 
2941   MI.eraseFromParent();
2942   return true;
2943 }
2944 
2945 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2946                                        MachineRegisterInfo &MRI,
2947                                        MachineIRBuilder &B) const {
2948   Register Dst = MI.getOperand(0).getReg();
2949   LLT DstTy = MRI.getType(Dst);
2950   LLT S16 = LLT::scalar(16);
2951   LLT S32 = LLT::scalar(32);
2952   LLT S64 = LLT::scalar(64);
2953 
2954   if (DstTy == S16)
2955     return legalizeFDIV16(MI, MRI, B);
2956   if (DstTy == S32)
2957     return legalizeFDIV32(MI, MRI, B);
2958   if (DstTy == S64)
2959     return legalizeFDIV64(MI, MRI, B);
2960 
2961   return false;
2962 }
2963 
2964 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2965                                                         Register DstDivReg,
2966                                                         Register DstRemReg,
2967                                                         Register X,
2968                                                         Register Y) const {
2969   const LLT S1 = LLT::scalar(1);
2970   const LLT S32 = LLT::scalar(32);
2971 
2972   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2973   // algorithm used here.
2974 
2975   // Initial estimate of inv(y).
2976   auto FloatY = B.buildUITOFP(S32, Y);
2977   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2978   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2979   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2980   auto Z = B.buildFPTOUI(S32, ScaledY);
2981 
2982   // One round of UNR (unsigned Newton-Raphson) refinement.
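       // Z approximates 2**32 / Y; refine the estimate with Z += umulh(Z, -Y * Z).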
2983   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2984   auto NegYZ = B.buildMul(S32, NegY, Z);
2985   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2986 
2987   // Quotient/remainder estimate.
2988   auto Q = B.buildUMulH(S32, X, Z);
2989   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2990 
2991   // First quotient/remainder refinement.
2992   auto One = B.buildConstant(S32, 1);
2993   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2994   if (DstDivReg)
2995     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2996   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2997 
2998   // Second quotient/remainder refinement.
2999   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3000   if (DstDivReg)
3001     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3002 
3003   if (DstRemReg)
3004     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3005 }
3006 
3007 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3008 //
3009 // Return lo, hi of result
3010 //
3011 // %cvt.lo = G_UITOFP Val.lo
3012 // %cvt.hi = G_UITOFP Val.hi
3013 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3014 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3015 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3016 // %mul2 = G_FMUL %mul1, 2**(-32)
3017 // %trunc = G_INTRINSIC_TRUNC %mul2
3018 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3019 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3020 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3021                                                        Register Val) {
3022   const LLT S32 = LLT::scalar(32);
3023   auto Unmerge = B.buildUnmerge(S32, Val);
3024 
3025   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3026   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3027 
3028   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3029                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3030 
3031   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3032   auto Mul1 =
3033       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3034 
3035   // 2**(-32)
3036   auto Mul2 =
3037       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3038   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3039 
3040   // -(2**32)
3041   auto Mad2 = B.buildFMAD(S32, Trunc,
3042                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3043 
3044   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3045   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3046 
3047   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3048 }
3049 
3050 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3051                                                         Register DstDivReg,
3052                                                         Register DstRemReg,
3053                                                         Register Numer,
3054                                                         Register Denom) const {
3055   const LLT S32 = LLT::scalar(32);
3056   const LLT S64 = LLT::scalar(64);
3057   const LLT S1 = LLT::scalar(1);
3058   Register RcpLo, RcpHi;
3059 
3060   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3061 
3062   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
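       // Rcp is a fixed-point estimate of 2**64 / Denom. The two refinement rounds
       // below (producing Add1 and Add2) are Newton-Raphson steps on that estimate.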
3063 
3064   auto Zero64 = B.buildConstant(S64, 0);
3065   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3066 
3067   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3068   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3069 
3070   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3071   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3072   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3073 
3074   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3075   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3076   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3077 
3078   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3079   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3080   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3081   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3082   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3083 
3084   auto Zero32 = B.buildConstant(S32, 0);
3085   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3086   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3087   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3088 
3089   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3090   Register NumerLo = UnmergeNumer.getReg(0);
3091   Register NumerHi = UnmergeNumer.getReg(1);
3092 
3093   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3094   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3095   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3096   Register Mul3_Lo = UnmergeMul3.getReg(0);
3097   Register Mul3_Hi = UnmergeMul3.getReg(1);
3098   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3099   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3100   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3101   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3102 
3103   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3104   Register DenomLo = UnmergeDenom.getReg(0);
3105   Register DenomHi = UnmergeDenom.getReg(1);
3106 
3107   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3108   auto C1 = B.buildSExt(S32, CmpHi);
3109 
3110   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3111   auto C2 = B.buildSExt(S32, CmpLo);
3112 
3113   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3114   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3115 
3116   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3117   // Currently the control flow is unconditional, and the 4 selects after the
3118   // potential endif substitute for the PHIs.
3119 
3120   // if C3 != 0 ...
3121   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3122   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3123   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3124   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3125 
3126   auto One64 = B.buildConstant(S64, 1);
3127   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3128 
3129   auto C4 =
3130       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3131   auto C5 =
3132       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3133   auto C6 = B.buildSelect(
3134       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3135 
3136   // if (C6 != 0)
3137   auto Add4 = B.buildAdd(S64, Add3, One64);
3138   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3139 
3140   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3141   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3142   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3143 
3144   // endif C6
3145   // endif C3
3146 
3147   if (DstDivReg) {
3148     auto Sel1 = B.buildSelect(
3149         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3150     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3151                   Sel1, MulHi3);
3152   }
3153 
3154   if (DstRemReg) {
3155     auto Sel2 = B.buildSelect(
3156         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3157     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3158                   Sel2, Sub1);
3159   }
3160 }
3161 
3162 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3163                                                   MachineRegisterInfo &MRI,
3164                                                   MachineIRBuilder &B) const {
3165   Register DstDivReg, DstRemReg;
3166   switch (MI.getOpcode()) {
3167   default:
3168     llvm_unreachable("Unexpected opcode!");
3169   case AMDGPU::G_UDIV: {
3170     DstDivReg = MI.getOperand(0).getReg();
3171     break;
3172   }
3173   case AMDGPU::G_UREM: {
3174     DstRemReg = MI.getOperand(0).getReg();
3175     break;
3176   }
3177   case AMDGPU::G_UDIVREM: {
3178     DstDivReg = MI.getOperand(0).getReg();
3179     DstRemReg = MI.getOperand(1).getReg();
3180     break;
3181   }
3182   }
3183 
3184   const LLT S64 = LLT::scalar(64);
3185   const LLT S32 = LLT::scalar(32);
3186   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3187   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3188   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3189   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3190 
3191   if (Ty == S32)
3192     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3193   else if (Ty == S64)
3194     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3195   else
3196     return false;
3197 
3198   MI.eraseFromParent();
3199   return true;
3200 }
3201 
3202 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3203                                                 MachineRegisterInfo &MRI,
3204                                                 MachineIRBuilder &B) const {
3205   const LLT S64 = LLT::scalar(64);
3206   const LLT S32 = LLT::scalar(32);
3207 
3208   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3209   if (Ty != S32 && Ty != S64)
3210     return false;
3211 
3212   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3213   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3214   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3215 
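       // Take absolute values branch-free: with s = x >> (bitwidth - 1),
       // (x + s) ^ s == |x|. Do the unsigned division on the absolute values and
       // restore the signs afterwards.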
3216   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3217   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3218   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3219 
3220   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3221   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3222 
3223   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3224   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3225 
3226   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3227   switch (MI.getOpcode()) {
3228   default:
3229     llvm_unreachable("Unexpected opcode!");
3230   case AMDGPU::G_SDIV: {
3231     DstDivReg = MI.getOperand(0).getReg();
3232     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3233     break;
3234   }
3235   case AMDGPU::G_SREM: {
3236     DstRemReg = MI.getOperand(0).getReg();
3237     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3238     break;
3239   }
3240   case AMDGPU::G_SDIVREM: {
3241     DstDivReg = MI.getOperand(0).getReg();
3242     DstRemReg = MI.getOperand(1).getReg();
3243     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3244     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3245     break;
3246   }
3247   }
3248 
3249   if (Ty == S32)
3250     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3251   else
3252     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3253 
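       // Apply the result signs: (v ^ s) - s negates v when the sign mask s is all
       // ones and is a no-op when s is zero.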
3254   if (DstDivReg) {
3255     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3256     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3257     B.buildSub(DstDivReg, SignXor, Sign);
3258   }
3259 
3260   if (DstRemReg) {
3261     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3262     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3263     B.buildSub(DstRemReg, SignXor, Sign);
3264   }
3265 
3266   MI.eraseFromParent();
3267   return true;
3268 }
3269 
3270 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3271                                                  MachineRegisterInfo &MRI,
3272                                                  MachineIRBuilder &B) const {
3273   Register Res = MI.getOperand(0).getReg();
3274   Register LHS = MI.getOperand(1).getReg();
3275   Register RHS = MI.getOperand(2).getReg();
3276   uint16_t Flags = MI.getFlags();
3277   LLT ResTy = MRI.getType(Res);
3278 
3279   const MachineFunction &MF = B.getMF();
3280   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3281                             MI.getFlag(MachineInstr::FmAfn);
3282 
3283   if (!AllowInaccurateRcp)
3284     return false;
3285 
3286   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3287     // 1 / x -> RCP(x)
3288     if (CLHS->isExactlyValue(1.0)) {
3289       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3290         .addUse(RHS)
3291         .setMIFlags(Flags);
3292 
3293       MI.eraseFromParent();
3294       return true;
3295     }
3296 
3297     // -1 / x -> RCP( FNEG(x) )
3298     if (CLHS->isExactlyValue(-1.0)) {
3299       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3300       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3301         .addUse(FNeg.getReg(0))
3302         .setMIFlags(Flags);
3303 
3304       MI.eraseFromParent();
3305       return true;
3306     }
3307   }
3308 
3309   // x / y -> x * (1.0 / y)
3310   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3311     .addUse(RHS)
3312     .setMIFlags(Flags);
3313   B.buildFMul(Res, LHS, RCP, Flags);
3314 
3315   MI.eraseFromParent();
3316   return true;
3317 }
3318 
3319 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3320                                                    MachineRegisterInfo &MRI,
3321                                                    MachineIRBuilder &B) const {
3322   Register Res = MI.getOperand(0).getReg();
3323   Register X = MI.getOperand(1).getReg();
3324   Register Y = MI.getOperand(2).getReg();
3325   uint16_t Flags = MI.getFlags();
3326   LLT ResTy = MRI.getType(Res);
3327 
3328   const MachineFunction &MF = B.getMF();
3329   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3330                             MI.getFlag(MachineInstr::FmAfn);
3331 
3332   if (!AllowInaccurateRcp)
3333     return false;
3334 
3335   auto NegY = B.buildFNeg(ResTy, Y);
3336   auto One = B.buildFConstant(ResTy, 1.0);
3337 
3338   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3339     .addUse(Y)
3340     .setMIFlags(Flags);
3341 
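       // Two Newton-Raphson iterations refine the rcp estimate; a final fma then
       // corrects the quotient x * r using the residual x - y * (x * r).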
3342   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3343   R = B.buildFMA(ResTy, Tmp0, R, R);
3344 
3345   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3346   R = B.buildFMA(ResTy, Tmp1, R, R);
3347 
3348   auto Ret = B.buildFMul(ResTy, X, R);
3349   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3350 
3351   B.buildFMA(Res, Tmp2, R, Ret);
3352   MI.eraseFromParent();
3353   return true;
3354 }
3355 
3356 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3357                                          MachineRegisterInfo &MRI,
3358                                          MachineIRBuilder &B) const {
3359   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3360     return true;
3361 
3362   Register Res = MI.getOperand(0).getReg();
3363   Register LHS = MI.getOperand(1).getReg();
3364   Register RHS = MI.getOperand(2).getReg();
3365 
3366   uint16_t Flags = MI.getFlags();
3367 
3368   LLT S16 = LLT::scalar(16);
3369   LLT S32 = LLT::scalar(32);
3370 
3371   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3372   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3373 
3374   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3375     .addUse(RHSExt.getReg(0))
3376     .setMIFlags(Flags);
3377 
3378   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3379   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3380 
3381   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3382     .addUse(RDst.getReg(0))
3383     .addUse(RHS)
3384     .addUse(LHS)
3385     .setMIFlags(Flags);
3386 
3387   MI.eraseFromParent();
3388   return true;
3389 }
3390 
3391 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3392 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3393 static void toggleSPDenormMode(bool Enable,
3394                                MachineIRBuilder &B,
3395                                const GCNSubtarget &ST,
3396                                AMDGPU::SIModeRegisterDefaults Mode) {
3397   // Set SP denorm mode to this value.
3398   unsigned SPDenormMode =
3399     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3400 
3401   if (ST.hasDenormModeInst()) {
3402     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3403     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3404 
3405     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3406     B.buildInstr(AMDGPU::S_DENORM_MODE)
3407       .addImm(NewDenormModeValue);
3408 
3409   } else {
3410     // Select FP32 bit field in mode register.
3411     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3412                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3413                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
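         // hwreg(HW_REG_MODE, 4, 2): bits [5:4] of the MODE register hold the FP32
         // denormal controls (WIDTH_M1 is the field width minus one).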
3414 
3415     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3416       .addImm(SPDenormMode)
3417       .addImm(SPDenormModeBitField);
3418   }
3419 }
3420 
3421 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3422                                          MachineRegisterInfo &MRI,
3423                                          MachineIRBuilder &B) const {
3424   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3425     return true;
3426 
3427   Register Res = MI.getOperand(0).getReg();
3428   Register LHS = MI.getOperand(1).getReg();
3429   Register RHS = MI.getOperand(2).getReg();
3430   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3431   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3432 
3433   uint16_t Flags = MI.getFlags();
3434 
3435   LLT S32 = LLT::scalar(32);
3436   LLT S1 = LLT::scalar(1);
3437 
3438   auto One = B.buildFConstant(S32, 1.0f);
3439 
3440   auto DenominatorScaled =
3441     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3442       .addUse(LHS)
3443       .addUse(RHS)
3444       .addImm(0)
3445       .setMIFlags(Flags);
3446   auto NumeratorScaled =
3447     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3448       .addUse(LHS)
3449       .addUse(RHS)
3450       .addImm(1)
3451       .setMIFlags(Flags);
3452 
3453   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3454     .addUse(DenominatorScaled.getReg(0))
3455     .setMIFlags(Flags);
3456   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3457 
3458   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3459   // aren't modeled as reading it.
3460   if (!Mode.allFP32Denormals())
3461     toggleSPDenormMode(true, B, ST, Mode);
3462 
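       // One Newton-Raphson step refines the reciprocal (Fma0/Fma1); Mul is the
       // quotient estimate, and Fma2..Fma4 compute residual corrections that
       // div_fmas/div_fixup below combine and rescale into the final result.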
3463   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3464   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3465   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3466   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3467   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3468   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3469 
3470   if (!Mode.allFP32Denormals())
3471     toggleSPDenormMode(false, B, ST, Mode);
3472 
3473   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3474     .addUse(Fma4.getReg(0))
3475     .addUse(Fma1.getReg(0))
3476     .addUse(Fma3.getReg(0))
3477     .addUse(NumeratorScaled.getReg(1))
3478     .setMIFlags(Flags);
3479 
3480   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3481     .addUse(Fmas.getReg(0))
3482     .addUse(RHS)
3483     .addUse(LHS)
3484     .setMIFlags(Flags);
3485 
3486   MI.eraseFromParent();
3487   return true;
3488 }
3489 
3490 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3491                                          MachineRegisterInfo &MRI,
3492                                          MachineIRBuilder &B) const {
3493   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3494     return true;
3495 
3496   Register Res = MI.getOperand(0).getReg();
3497   Register LHS = MI.getOperand(1).getReg();
3498   Register RHS = MI.getOperand(2).getReg();
3499 
3500   uint16_t Flags = MI.getFlags();
3501 
3502   LLT S64 = LLT::scalar(64);
3503   LLT S1 = LLT::scalar(1);
3504 
3505   auto One = B.buildFConstant(S64, 1.0);
3506 
3507   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3508     .addUse(LHS)
3509     .addUse(RHS)
3510     .addImm(0)
3511     .setMIFlags(Flags);
3512 
3513   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3514 
3515   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3516     .addUse(DivScale0.getReg(0))
3517     .setMIFlags(Flags);
3518 
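       // Refine the reciprocal of the scaled denominator with Newton-Raphson steps,
       // form the scaled quotient, and let div_fmas/div_fixup apply the final
       // correction and undo the scaling.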
3519   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3520   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3521   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3522 
3523   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3524     .addUse(LHS)
3525     .addUse(RHS)
3526     .addImm(1)
3527     .setMIFlags(Flags);
3528 
3529   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3530   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3531   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3532 
3533   Register Scale;
3534   if (!ST.hasUsableDivScaleConditionOutput()) {
3535     // Workaround a hardware bug on SI where the condition output from div_scale
3536     // is not usable.
3537 
3538     LLT S32 = LLT::scalar(32);
3539 
3540     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3541     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3542     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3543     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3544 
3545     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3546                               Scale1Unmerge.getReg(1));
3547     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3548                               Scale0Unmerge.getReg(1));
3549     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3550   } else {
3551     Scale = DivScale1.getReg(1);
3552   }
3553 
3554   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3555     .addUse(Fma4.getReg(0))
3556     .addUse(Fma3.getReg(0))
3557     .addUse(Mul.getReg(0))
3558     .addUse(Scale)
3559     .setMIFlags(Flags);
3560 
3561   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3562     .addUse(Fmas.getReg(0))
3563     .addUse(RHS)
3564     .addUse(LHS)
3565     .setMIFlags(Flags);
3566 
3567   MI.eraseFromParent();
3568   return true;
3569 }
3570 
3571 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3572                                                  MachineRegisterInfo &MRI,
3573                                                  MachineIRBuilder &B) const {
3574   Register Res = MI.getOperand(0).getReg();
3575   Register LHS = MI.getOperand(2).getReg();
3576   Register RHS = MI.getOperand(3).getReg();
3577   uint16_t Flags = MI.getFlags();
3578 
3579   LLT S32 = LLT::scalar(32);
3580   LLT S1 = LLT::scalar(1);
3581 
3582   auto Abs = B.buildFAbs(S32, RHS, Flags);
3583   const APFloat C0Val(1.0f);
3584 
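       // If |RHS| is huge (> ~2**96), pre-scale the denominator by 2**-32 so the
       // reciprocal does not flush to zero, and multiply the result by the same
       // scale factor to compensate.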
3585   auto C0 = B.buildConstant(S32, 0x6f800000);
3586   auto C1 = B.buildConstant(S32, 0x2f800000);
3587   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3588 
3589   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3590   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3591 
3592   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3593 
3594   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3595     .addUse(Mul0.getReg(0))
3596     .setMIFlags(Flags);
3597 
3598   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3599 
3600   B.buildFMul(Res, Sel, Mul1, Flags);
3601 
3602   MI.eraseFromParent();
3603   return true;
3604 }
3605 
3606 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3607 // FIXME: Why do we handle this one but not other removed instructions?
3608 //
3609 // Reciprocal square root.  The clamp prevents infinite results, clamping
3610 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3611 // +-max_float.
3612 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3613                                                     MachineRegisterInfo &MRI,
3614                                                     MachineIRBuilder &B) const {
3615   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3616     return true;
3617 
3618   Register Dst = MI.getOperand(0).getReg();
3619   Register Src = MI.getOperand(2).getReg();
3620   auto Flags = MI.getFlags();
3621 
3622   LLT Ty = MRI.getType(Dst);
3623 
3624   const fltSemantics *FltSemantics;
3625   if (Ty == LLT::scalar(32))
3626     FltSemantics = &APFloat::IEEEsingle();
3627   else if (Ty == LLT::scalar(64))
3628     FltSemantics = &APFloat::IEEEdouble();
3629   else
3630     return false;
3631 
3632   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3633     .addUse(Src)
3634     .setMIFlags(Flags);
3635 
3636   // We don't need to concern ourselves with the snan handling difference, since
3637   // the rsq already quieted it (or not); use the one which will directly select.
3638   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3639   const bool UseIEEE = MFI->getMode().IEEE;
3640 
3641   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3642   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3643                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3644 
3645   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3646 
3647   if (UseIEEE)
3648     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3649   else
3650     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3651   MI.eraseFromParent();
3652   return true;
3653 }
3654 
3655 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3656   switch (IID) {
3657   case Intrinsic::amdgcn_ds_fadd:
3658     return AMDGPU::G_ATOMICRMW_FADD;
3659   case Intrinsic::amdgcn_ds_fmin:
3660     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3661   case Intrinsic::amdgcn_ds_fmax:
3662     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3663   default:
3664     llvm_unreachable("not a DS FP intrinsic");
3665   }
3666 }
3667 
3668 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3669                                                       MachineInstr &MI,
3670                                                       Intrinsic::ID IID) const {
3671   GISelChangeObserver &Observer = Helper.Observer;
3672   Observer.changingInstr(MI);
3673 
3674   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3675 
3676   // The remaining operands were used to set fields in the MemOperand on
3677   // construction.
3678   for (int I = 6; I > 3; --I)
3679     MI.RemoveOperand(I);
3680 
3681   MI.RemoveOperand(1); // Remove the intrinsic ID.
3682   Observer.changedInstr(MI);
3683   return true;
3684 }
3685 
3686 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3687                                             MachineRegisterInfo &MRI,
3688                                             MachineIRBuilder &B) const {
3689   uint64_t Offset =
3690     ST.getTargetLowering()->getImplicitParameterOffset(
3691       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3692   LLT DstTy = MRI.getType(DstReg);
3693   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3694 
3695   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3696   if (!loadInputValue(KernargPtrReg, B,
3697                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3698     return false;
3699 
3700   // FIXME: This should be nuw
3701   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3702   return true;
3703 }
3704 
3705 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3706                                                  MachineRegisterInfo &MRI,
3707                                                  MachineIRBuilder &B) const {
3708   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3709   if (!MFI->isEntryFunction()) {
3710     return legalizePreloadedArgIntrin(MI, MRI, B,
3711                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3712   }
3713 
3714   Register DstReg = MI.getOperand(0).getReg();
3715   if (!getImplicitArgPtr(DstReg, MRI, B))
3716     return false;
3717 
3718   MI.eraseFromParent();
3719   return true;
3720 }
3721 
3722 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3723                                               MachineRegisterInfo &MRI,
3724                                               MachineIRBuilder &B,
3725                                               unsigned AddrSpace) const {
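       // A flat pointer is in the given segment iff the high 32 bits of its address
       // match that segment's aperture base.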
3726   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3727   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3728   Register Hi32 = Unmerge.getReg(1);
3729 
3730   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3731   MI.eraseFromParent();
3732   return true;
3733 }
3734 
3735 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3736 // offset (the offset that is included in bounds checking and swizzling, to be
3737 // split between the instruction's voffset and immoffset fields) and soffset
3738 // (the offset that is excluded from bounds checking and swizzling, to go in
3739 // the instruction's soffset field).  This function takes the first kind of
3740 // offset and figures out how to split it between voffset and immoffset.
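     // For example, a constant offset of 8212 is split into ImmOffset = 20 and a
     // voffset contribution of 8192 (a multiple of 4096, so it CSEs well).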
3741 std::pair<Register, unsigned>
3742 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3743                                         Register OrigOffset) const {
3744   const unsigned MaxImm = 4095;
3745   Register BaseReg;
3746   unsigned ImmOffset;
3747   const LLT S32 = LLT::scalar(32);
3748   MachineRegisterInfo &MRI = *B.getMRI();
3749 
3750   std::tie(BaseReg, ImmOffset) =
3751       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3752 
3753   // If BaseReg is a pointer, convert it to int.
3754   if (MRI.getType(BaseReg).isPointer())
3755     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3756 
3757   // If the immediate value is too big for the immoffset field, keep only its
3758   // low 12 bits in the immoffset field so that the value that is copied/added
3759   // for the voffset field is a multiple of 4096, and it stands more chance
3760   // of being CSEd with the copy/add for another similar load/store.
3761   // However, do not do that rounding down to a multiple of 4096 if that is a
3762   // negative number, as it appears to be illegal to have a negative offset
3763   // in the vgpr, even if adding the immediate offset makes it positive.
3764   unsigned Overflow = ImmOffset & ~MaxImm;
3765   ImmOffset -= Overflow;
3766   if ((int32_t)Overflow < 0) {
3767     Overflow += ImmOffset;
3768     ImmOffset = 0;
3769   }
3770 
3771   if (Overflow != 0) {
3772     if (!BaseReg) {
3773       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3774     } else {
3775       auto OverflowVal = B.buildConstant(S32, Overflow);
3776       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3777     }
3778   }
3779 
3780   if (!BaseReg)
3781     BaseReg = B.buildConstant(S32, 0).getReg(0);
3782 
3783   return std::make_pair(BaseReg, ImmOffset);
3784 }
3785 
3786 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3787 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3788                                           Register VOffset, Register SOffset,
3789                                           unsigned ImmOffset, Register VIndex,
3790                                           MachineRegisterInfo &MRI) const {
3791   Optional<ValueAndVReg> MaybeVOffsetVal =
3792       getIConstantVRegValWithLookThrough(VOffset, MRI);
3793   Optional<ValueAndVReg> MaybeSOffsetVal =
3794       getIConstantVRegValWithLookThrough(SOffset, MRI);
3795   Optional<ValueAndVReg> MaybeVIndexVal =
3796       getIConstantVRegValWithLookThrough(VIndex, MRI);
3797   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3798   // update the MMO with that offset. The stride is unknown so we can only do
3799   // this if VIndex is constant 0.
3800   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3801       MaybeVIndexVal->Value == 0) {
3802     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3803                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3804     MMO->setOffset(TotalOffset);
3805   } else {
3806     // We don't have a constant combined offset to use in the MMO. Give up.
3807     MMO->setValue((Value *)nullptr);
3808   }
3809 }
3810 
3811 /// Handle register layout difference for f16 images for some subtargets.
3812 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3813                                              MachineRegisterInfo &MRI,
3814                                              Register Reg,
3815                                              bool ImageStore) const {
3816   const LLT S16 = LLT::scalar(16);
3817   const LLT S32 = LLT::scalar(32);
3818   LLT StoreVT = MRI.getType(Reg);
3819   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3820 
3821   if (ST.hasUnpackedD16VMem()) {
3822     auto Unmerge = B.buildUnmerge(S16, Reg);
3823 
3824     SmallVector<Register, 4> WideRegs;
3825     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3826       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3827 
3828     int NumElts = StoreVT.getNumElements();
3829 
3830     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3831         .getReg(0);
3832   }
3833 
3834   if (ImageStore && ST.hasImageStoreD16Bug()) {
3835     if (StoreVT.getNumElements() == 2) {
3836       SmallVector<Register, 4> PackedRegs;
3837       Reg = B.buildBitcast(S32, Reg).getReg(0);
3838       PackedRegs.push_back(Reg);
3839       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3840       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3841           .getReg(0);
3842     }
3843 
3844     if (StoreVT.getNumElements() == 3) {
3845       SmallVector<Register, 4> PackedRegs;
3846       auto Unmerge = B.buildUnmerge(S16, Reg);
3847       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3848         PackedRegs.push_back(Unmerge.getReg(I));
3849       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3850       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3851       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3852     }
3853 
3854     if (StoreVT.getNumElements() == 4) {
3855       SmallVector<Register, 4> PackedRegs;
3856       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3857       auto Unmerge = B.buildUnmerge(S32, Reg);
3858       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3859         PackedRegs.push_back(Unmerge.getReg(I));
3860       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3861       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3862           .getReg(0);
3863     }
3864 
3865     llvm_unreachable("invalid data type");
3866   }
3867 
3868   if (StoreVT == LLT::fixed_vector(3, S16)) {
3869     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3870               .getReg(0);
3871   }
3872   return Reg;
3873 }
3874 
3875 Register AMDGPULegalizerInfo::fixStoreSourceType(
3876   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3877   MachineRegisterInfo *MRI = B.getMRI();
3878   LLT Ty = MRI->getType(VData);
3879 
3880   const LLT S16 = LLT::scalar(16);
3881 
3882   // Fixup illegal register types for i8/i16 stores.
3883   if (Ty == LLT::scalar(8) || Ty == S16) {
3884     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3885     return AnyExt;
3886   }
3887 
3888   if (Ty.isVector()) {
3889     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3890       if (IsFormat)
3891         return handleD16VData(B, *MRI, VData);
3892     }
3893   }
3894 
3895   return VData;
3896 }
3897 
3898 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3899                                               MachineRegisterInfo &MRI,
3900                                               MachineIRBuilder &B,
3901                                               bool IsTyped,
3902                                               bool IsFormat) const {
3903   Register VData = MI.getOperand(1).getReg();
3904   LLT Ty = MRI.getType(VData);
3905   LLT EltTy = Ty.getScalarType();
3906   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3907   const LLT S32 = LLT::scalar(32);
3908 
3909   VData = fixStoreSourceType(B, VData, IsFormat);
3910   Register RSrc = MI.getOperand(2).getReg();
3911 
3912   MachineMemOperand *MMO = *MI.memoperands_begin();
3913   const int MemSize = MMO->getSize();
3914 
3915   unsigned ImmOffset;
3916 
3917   // The typed intrinsics add an immediate after the registers.
3918   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3919 
3920   // The struct intrinsic variants add one additional operand over raw.
3921   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3922   Register VIndex;
3923   int OpOffset = 0;
3924   if (HasVIndex) {
3925     VIndex = MI.getOperand(3).getReg();
3926     OpOffset = 1;
3927   } else {
3928     VIndex = B.buildConstant(S32, 0).getReg(0);
3929   }
3930 
3931   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3932   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3933 
3934   unsigned Format = 0;
3935   if (IsTyped) {
3936     Format = MI.getOperand(5 + OpOffset).getImm();
3937     ++OpOffset;
3938   }
3939 
3940   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3941 
3942   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3943   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3944 
3945   unsigned Opc;
3946   if (IsTyped) {
3947     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3948                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3949   } else if (IsFormat) {
3950     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3951                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3952   } else {
3953     switch (MemSize) {
3954     case 1:
3955       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3956       break;
3957     case 2:
3958       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3959       break;
3960     default:
3961       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3962       break;
3963     }
3964   }
3965 
3966   auto MIB = B.buildInstr(Opc)
3967     .addUse(VData)              // vdata
3968     .addUse(RSrc)               // rsrc
3969     .addUse(VIndex)             // vindex
3970     .addUse(VOffset)            // voffset
3971     .addUse(SOffset)            // soffset
3972     .addImm(ImmOffset);         // offset(imm)
3973 
3974   if (IsTyped)
3975     MIB.addImm(Format);
3976 
3977   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3978      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3979      .addMemOperand(MMO);
3980 
3981   MI.eraseFromParent();
3982   return true;
3983 }
3984 
3985 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3986                                              MachineRegisterInfo &MRI,
3987                                              MachineIRBuilder &B,
3988                                              bool IsFormat,
3989                                              bool IsTyped) const {
3990   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3991   MachineMemOperand *MMO = *MI.memoperands_begin();
3992   const LLT MemTy = MMO->getMemoryType();
3993   const LLT S32 = LLT::scalar(32);
3994 
3995   Register Dst = MI.getOperand(0).getReg();
3996   Register RSrc = MI.getOperand(2).getReg();
3997 
3998   // The typed intrinsics add an immediate after the registers.
3999   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4000 
4001   // The struct intrinsic variants add one additional operand over raw.
4002   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4003   Register VIndex;
4004   int OpOffset = 0;
4005   if (HasVIndex) {
4006     VIndex = MI.getOperand(3).getReg();
4007     OpOffset = 1;
4008   } else {
4009     VIndex = B.buildConstant(S32, 0).getReg(0);
4010   }
4011 
4012   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4013   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4014 
4015   unsigned Format = 0;
4016   if (IsTyped) {
4017     Format = MI.getOperand(5 + OpOffset).getImm();
4018     ++OpOffset;
4019   }
4020 
4021   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4022   unsigned ImmOffset;
4023 
4024   LLT Ty = MRI.getType(Dst);
4025   LLT EltTy = Ty.getScalarType();
4026   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4027   const bool Unpacked = ST.hasUnpackedD16VMem();
4028 
4029   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4030   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4031 
4032   unsigned Opc;
4033 
4034   if (IsTyped) {
4035     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4036                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4037   } else if (IsFormat) {
4038     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4039                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4040   } else {
4041     switch (MemTy.getSizeInBits()) {
4042     case 8:
4043       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4044       break;
4045     case 16:
4046       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4047       break;
4048     default:
4049       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4050       break;
4051     }
4052   }
4053 
4054   Register LoadDstReg;
4055 
4056   bool IsExtLoad =
4057       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4058   LLT UnpackedTy = Ty.changeElementSize(32);
4059 
4060   if (IsExtLoad)
4061     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4062   else if (Unpacked && IsD16 && Ty.isVector())
4063     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4064   else
4065     LoadDstReg = Dst;
4066 
4067   auto MIB = B.buildInstr(Opc)
4068     .addDef(LoadDstReg)         // vdata
4069     .addUse(RSrc)               // rsrc
4070     .addUse(VIndex)             // vindex
4071     .addUse(VOffset)            // voffset
4072     .addUse(SOffset)            // soffset
4073     .addImm(ImmOffset);         // offset(imm)
4074 
4075   if (IsTyped)
4076     MIB.addImm(Format);
4077 
4078   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4079      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4080      .addMemOperand(MMO);
4081 
4082   if (LoadDstReg != Dst) {
4083     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4084 
4085     // The extending load used a widened result register; truncate it back.
4086     if (IsExtLoad)
4087       B.buildTrunc(Dst, LoadDstReg);
4088     else {
4089       // Repack to original 16-bit vector result
4090       // FIXME: G_TRUNC should work, but legalization currently fails
4091       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4092       SmallVector<Register, 4> Repack;
4093       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4094         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4095       B.buildMerge(Dst, Repack);
4096     }
4097   }
4098 
4099   MI.eraseFromParent();
4100   return true;
4101 }
4102 
4103 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4104                                                MachineIRBuilder &B,
4105                                                bool IsInc) const {
4106   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4107                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4108   B.buildInstr(Opc)
4109     .addDef(MI.getOperand(0).getReg())
4110     .addUse(MI.getOperand(2).getReg())
4111     .addUse(MI.getOperand(3).getReg())
4112     .cloneMemRefs(MI);
4113   MI.eraseFromParent();
4114   return true;
4115 }
4116 
4117 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4118   switch (IntrID) {
4119   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4120   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4121     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4122   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4123   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4124     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4125   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4126   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4127     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4128   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4129   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4130     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4131   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4132   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4133     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4134   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4135   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4136     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4137   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4138   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4139     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4140   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4141   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4142     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4143   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4144   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4145     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4146   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4147   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4148     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4149   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4150   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4151     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4152   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4153   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4154     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4155   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4156   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4157     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4158   case Intrinsic::amdgcn_buffer_atomic_fadd:
4159   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4160   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4161     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4162   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4163   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4164     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4165   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4166   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4167     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4168   default:
4169     llvm_unreachable("unhandled atomic opcode");
4170   }
4171 }
4172 
4173 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4174                                                MachineIRBuilder &B,
4175                                                Intrinsic::ID IID) const {
4176   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4177                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4178   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4179 
4180   Register Dst;
4181 
4182   int OpOffset = 0;
4183   if (HasReturn) {
4184     // A few FP atomics do not support return values.
4185     Dst = MI.getOperand(0).getReg();
4186   } else {
4187     OpOffset = -1;
4188   }
4189 
4190   Register VData = MI.getOperand(2 + OpOffset).getReg();
4191   Register CmpVal;
4192 
4193   if (IsCmpSwap) {
4194     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4195     ++OpOffset;
4196   }
4197 
4198   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4199   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4200 
4201   // The struct intrinsic variants add one additional operand over raw.
4202   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4203   Register VIndex;
4204   if (HasVIndex) {
4205     VIndex = MI.getOperand(4 + OpOffset).getReg();
4206     ++OpOffset;
4207   } else {
4208     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4209   }
4210 
4211   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4212   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4213   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4214 
4215   MachineMemOperand *MMO = *MI.memoperands_begin();
4216 
4217   unsigned ImmOffset;
4218   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4219   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4220 
4221   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4222 
4223   if (HasReturn)
4224     MIB.addDef(Dst);
4225 
4226   MIB.addUse(VData); // vdata
4227 
4228   if (IsCmpSwap)
4229     MIB.addReg(CmpVal);
4230 
4231   MIB.addUse(RSrc)               // rsrc
4232      .addUse(VIndex)             // vindex
4233      .addUse(VOffset)            // voffset
4234      .addUse(SOffset)            // soffset
4235      .addImm(ImmOffset)          // offset(imm)
4236      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4237      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4238      .addMemOperand(MMO);
4239 
4240   MI.eraseFromParent();
4241   return true;
4242 }
4243 
4244 /// Turn the s16 typed address operands of \p MI into dword sized vectors with
4245 /// s16 typed elements, appending them to \p PackedAddrs.
4246 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4247                                       SmallVectorImpl<Register> &PackedAddrs,
4248                                       unsigned ArgOffset,
4249                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4250                                       bool IsA16, bool IsG16) {
4251   const LLT S16 = LLT::scalar(16);
4252   const LLT V2S16 = LLT::fixed_vector(2, 16);
4253   auto EndIdx = Intr->VAddrEnd;
4254 
4255   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4256     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4257     if (!SrcOp.isReg())
4258       continue; // _L to _LZ may have eliminated this.
4259 
4260     Register AddrReg = SrcOp.getReg();
4261 
4262     if ((I < Intr->GradientStart) ||
4263         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4264         (I >= Intr->CoordStart && !IsA16)) {
4265       // Handle any gradient or coordinate operands that should not be packed
4266       if ((I < Intr->GradientStart) && IsA16 &&
4267           (B.getMRI()->getType(AddrReg) == S16)) {
4268         // Special handling of bias when A16 is on. Bias is of type half but
4269         // occupies a full 32-bit slot.
4270         PackedAddrs.push_back(
4271             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4272                 .getReg(0));
4273       } else {
4274         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4275         PackedAddrs.push_back(AddrReg);
4276       }
4277     } else {
4278       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4279       // derivatives dx/dh and dx/dv are packed with undef.
4280       if (((I + 1) >= EndIdx) ||
4281           ((Intr->NumGradients / 2) % 2 == 1 &&
4282            (I == static_cast<unsigned>(Intr->GradientStart +
4283                                        (Intr->NumGradients / 2) - 1) ||
4284             I == static_cast<unsigned>(Intr->GradientStart +
4285                                        Intr->NumGradients - 1))) ||
4286           // Check for _L to _LZ optimization
4287           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4288         PackedAddrs.push_back(
4289             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4290                 .getReg(0));
4291       } else {
4292         PackedAddrs.push_back(
4293             B.buildBuildVector(
4294                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4295                 .getReg(0));
4296         ++I;
4297       }
4298     }
4299   }
4300 }
4301 
4302 /// Convert from separate vaddr components to a single vector address register,
4303 /// and replace the remaining operands with $noreg.
4304 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4305                                      int DimIdx, int NumVAddrs) {
4306   const LLT S32 = LLT::scalar(32);
4307 
4308   SmallVector<Register, 8> AddrRegs;
4309   for (int I = 0; I != NumVAddrs; ++I) {
4310     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4311     if (SrcOp.isReg()) {
4312       AddrRegs.push_back(SrcOp.getReg());
4313       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4314     }
4315   }
4316 
4317   int NumAddrRegs = AddrRegs.size();
4318   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4320     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4321       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4322       auto Undef = B.buildUndef(S32);
4323       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4324       NumAddrRegs = RoundedNumRegs;
4325     }
4326 
4327     auto VAddr =
4328         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4329     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4330   }
4331 
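  // The original scalar address operands are no longer needed; clear them to
  // $noreg so only the packed vector address (or the single register) remains.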
4332   for (int I = 1; I != NumVAddrs; ++I) {
4333     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4334     if (SrcOp.isReg())
4335       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4336   }
4337 }
4338 
4339 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4340 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
4345 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
4352 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4353     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4354     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4355 
4356   const unsigned NumDefs = MI.getNumExplicitDefs();
4357   const unsigned ArgOffset = NumDefs + 1;
4358   bool IsTFE = NumDefs == 2;
4359   // We are only processing the operands of d16 image operations on subtargets
4360   // that use the unpacked register layout, or need to repack the TFE result.
4361 
4362   // TODO: Do we need to guard against already legalized intrinsics?
4363   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4364       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4365 
4366   MachineRegisterInfo *MRI = B.getMRI();
4367   const LLT S32 = LLT::scalar(32);
4368   const LLT S16 = LLT::scalar(16);
4369   const LLT V2S16 = LLT::fixed_vector(2, 16);
4370 
4371   unsigned DMask = 0;
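  // For stores the data operand follows the intrinsic ID; for loads the result
  // register determines the data type.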
4372   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4373   LLT Ty = MRI->getType(VData);
4374 
  // Check for 16-bit addresses and pack them if present.
4376   LLT GradTy =
4377       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4378   LLT AddrTy =
4379       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4380   const bool IsG16 = GradTy == S16;
4381   const bool IsA16 = AddrTy == S16;
4382   const bool IsD16 = Ty.getScalarType() == S16;
4383 
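  // Count how many data components the dmask actually enables. Gather4 always
  // produces four components regardless of the mask.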
4384   int DMaskLanes = 0;
4385   if (!BaseOpcode->Atomic) {
4386     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4387     if (BaseOpcode->Gather4) {
4388       DMaskLanes = 4;
4389     } else if (DMask != 0) {
4390       DMaskLanes = countPopulation(DMask);
4391     } else if (!IsTFE && !BaseOpcode->Store) {
4392       // If dmask is 0, this is a no-op load. This can be eliminated.
4393       B.buildUndef(MI.getOperand(0));
4394       MI.eraseFromParent();
4395       return true;
4396     }
4397   }
4398 
4399   Observer.changingInstr(MI);
4400   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4401 
4402   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4403                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4404   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4405                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4406   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4407 
4408   // Track that we legalized this
4409   MI.setDesc(B.getTII().get(NewOpcode));
4410 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4413   if (IsTFE && DMask == 0) {
4414     DMask = 0x1;
4415     DMaskLanes = 1;
4416     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4417   }
4418 
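  // Image atomics take scalar data; the X2 (cmpswap-style) variants pack both
  // data values into a single vector register and drop the second operand.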
4419   if (BaseOpcode->Atomic) {
4420     Register VData0 = MI.getOperand(2).getReg();
4421     LLT Ty = MRI->getType(VData0);
4422 
4423     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4424     if (Ty.isVector())
4425       return false;
4426 
4427     if (BaseOpcode->AtomicX2) {
4428       Register VData1 = MI.getOperand(3).getReg();
4429       // The two values are packed in one register.
4430       LLT PackedTy = LLT::fixed_vector(2, Ty);
4431       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4432       MI.getOperand(2).setReg(Concat.getReg(0));
4433       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4434     }
4435   }
4436 
4437   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4438 
4439   // Optimize _L to _LZ when _L is zero
4440   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4441           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4442     const ConstantFP *ConstantLod;
4443 
4444     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4445                  m_GFCst(ConstantLod))) {
4446       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4447         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4448         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4449             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4450                                                      Intr->Dim);
4451 
4452         // The starting indexes should remain in the same place.
4453         --CorrectedNumVAddrs;
4454 
4455         MI.getOperand(NumDefs).setIntrinsicID(
4456             static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4457         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4458         Intr = NewImageDimIntr;
4459       }
4460     }
4461   }
4462 
  // Optimize _mip away when 'lod' is zero.
4464   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4465     int64_t ConstantLod;
4466     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4467                  m_ICst(ConstantLod))) {
4468       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4471         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4472         --CorrectedNumVAddrs;
4473       }
4474     }
4475   }
4476 
4477   // Rewrite the addressing register layout before doing anything else.
4478   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both the gradients and the addresses must be 16-bit.
4481     return false;
4482   }
4483 
4484   if (IsA16 && !ST.hasA16()) {
4485     // A16 not supported
4486     return false;
4487   }
4488 
4489   if (IsA16 || IsG16) {
4490     if (Intr->NumVAddrs > 1) {
4491       SmallVector<Register, 4> PackedRegs;
4492 
4493       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4494                                 IsG16);
4495 
4496       // See also below in the non-a16 branch
4497       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4498                           PackedRegs.size() <= ST.getNSAMaxSize();
4499 
4500       if (!UseNSA && PackedRegs.size() > 1) {
4501         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4502         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4503         PackedRegs[0] = Concat.getReg(0);
4504         PackedRegs.resize(1);
4505       }
4506 
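      // Swap the packed dword registers in for the original address operands,
      // and mark any leftover address operands as unused with $noreg.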
4507       const unsigned NumPacked = PackedRegs.size();
4508       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4509         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4510         if (!SrcOp.isReg()) {
4511           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4512           continue;
4513         }
4514 
4515         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4516 
4517         if (I - Intr->VAddrStart < NumPacked)
4518           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4519         else
4520           SrcOp.setReg(AMDGPU::NoRegister);
4521       }
4522     }
4523   } else {
4524     // If the register allocator cannot place the address registers contiguously
4525     // without introducing moves, then using the non-sequential address encoding
4526     // is always preferable, since it saves VALU instructions and is usually a
4527     // wash in terms of code size or even better.
4528     //
4529     // However, we currently have no way of hinting to the register allocator
4530     // that MIMG addresses should be placed contiguously when it is possible to
4531     // do so, so force non-NSA for the common 2-address case as a heuristic.
4532     //
4533     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4534     // allocation when possible.
4535     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4536                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4537 
4538     if (!UseNSA && Intr->NumVAddrs > 1)
4539       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4540                                Intr->NumVAddrs);
4541   }
4542 
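  // Record the A16/G16 state in an extra immediate flag operand so later
  // passes can recover it.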
4543   int Flags = 0;
4544   if (IsA16)
4545     Flags |= 1;
4546   if (IsG16)
4547     Flags |= 2;
4548   MI.addOperand(MachineOperand::CreateImm(Flags));
4549 
4550   if (BaseOpcode->Store) { // No TFE for stores?
4551     // TODO: Handle dmask trim
4552     if (!Ty.isVector() || !IsD16)
4553       return true;
4554 
4555     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4556     if (RepackedReg != VData) {
4557       MI.getOperand(1).setReg(RepackedReg);
4558     }
4559 
4560     return true;
4561   }
4562 
4563   Register DstReg = MI.getOperand(0).getReg();
4564   const LLT EltTy = Ty.getScalarType();
4565   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4566 
4567   // Confirm that the return type is large enough for the dmask specified
4568   if (NumElts < DMaskLanes)
4569     return false;
4570 
4571   if (NumElts > 4 || DMaskLanes > 4)
4572     return false;
4573 
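  // Only the components selected by the dmask are actually loaded, so shrink
  // the type before computing the register layout.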
4574   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4575   const LLT AdjustedTy =
4576       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4577 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4581   LLT RoundedTy;
4582 
  // S32 vector to cover all data, plus the TFE result element.
4584   LLT TFETy;
4585 
4586   // Register type to use for each loaded component. Will be S32 or V2S16.
4587   LLT RegTy;
4588 
4589   if (IsD16 && ST.hasUnpackedD16VMem()) {
4590     RoundedTy =
4591         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4592     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4593     RegTy = S32;
4594   } else {
4595     unsigned EltSize = EltTy.getSizeInBits();
4596     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4597     unsigned RoundedSize = 32 * RoundedElts;
4598     RoundedTy = LLT::scalarOrVector(
4599         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4600     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4601     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4602   }
4603 
4604   // The return type does not need adjustment.
4605   // TODO: Should we change s16 case to s32 or <2 x s16>?
4606   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4607     return true;
4608 
4609   Register Dst1Reg;
4610 
4611   // Insert after the instruction.
4612   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4613 
4614   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4615   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4616   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4617   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4618 
4619   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4620 
4621   MI.getOperand(0).setReg(NewResultReg);
4622 
4623   // In the IR, TFE is supposed to be used with a 2 element struct return
4624   // type. The instruction really returns these two values in one contiguous
4625   // register, with one additional dword beyond the loaded data. Rewrite the
4626   // return type to use a single register result.
4627 
4628   if (IsTFE) {
4629     Dst1Reg = MI.getOperand(1).getReg();
4630     if (MRI->getType(Dst1Reg) != S32)
4631       return false;
4632 
4633     // TODO: Make sure the TFE operand bit is set.
4634     MI.RemoveOperand(1);
4635 
4636     // Handle the easy case that requires no repack instructions.
4637     if (Ty == S32) {
4638       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4639       return true;
4640     }
4641   }
4642 
4643   // Now figure out how to copy the new result register back into the old
4644   // result.
4645   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4646 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4648 
4649   if (ResultNumRegs == 1) {
4650     assert(!IsTFE);
4651     ResultRegs[0] = NewResultReg;
4652   } else {
4653     // We have to repack into a new vector of some kind.
4654     for (int I = 0; I != NumDataRegs; ++I)
4655       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4656     B.buildUnmerge(ResultRegs, NewResultReg);
4657 
4658     // Drop the final TFE element to get the data part. The TFE result is
4659     // directly written to the right place already.
4660     if (IsTFE)
4661       ResultRegs.resize(NumDataRegs);
4662   }
4663 
4664   // For an s16 scalar result, we form an s32 result with a truncate regardless
4665   // of packed vs. unpacked.
4666   if (IsD16 && !Ty.isVector()) {
4667     B.buildTrunc(DstReg, ResultRegs[0]);
4668     return true;
4669   }
4670 
4671   // Avoid a build/concat_vector of 1 entry.
4672   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4673     B.buildBitcast(DstReg, ResultRegs[0]);
4674     return true;
4675   }
4676 
4677   assert(Ty.isVector());
4678 
4679   if (IsD16) {
4680     // For packed D16 results with TFE enabled, all the data components are
4681     // S32. Cast back to the expected type.
4682     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4685     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4686       for (Register &Reg : ResultRegs)
4687         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4688     } else if (ST.hasUnpackedD16VMem()) {
4689       for (Register &Reg : ResultRegs)
4690         Reg = B.buildTrunc(S16, Reg).getReg(0);
4691     }
4692   }
4693 
4694   auto padWithUndef = [&](LLT Ty, int NumElts) {
4695     if (NumElts == 0)
4696       return;
4697     Register Undef = B.buildUndef(Ty).getReg(0);
4698     for (int I = 0; I != NumElts; ++I)
4699       ResultRegs.push_back(Undef);
4700   };
4701 
4702   // Pad out any elements eliminated due to the dmask.
4703   LLT ResTy = MRI->getType(ResultRegs[0]);
4704   if (!ResTy.isVector()) {
4705     padWithUndef(ResTy, NumElts - ResultRegs.size());
4706     B.buildBuildVector(DstReg, ResultRegs);
4707     return true;
4708   }
4709 
4710   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
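  // Number of 32-bit registers needed to cover the original result type.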
4711   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4712 
4713   // Deal with the one annoying legal case.
4714   const LLT V3S16 = LLT::fixed_vector(3, 16);
4715   if (Ty == V3S16) {
4716     if (IsTFE) {
4717       if (ResultRegs.size() == 1) {
4718         NewResultReg = ResultRegs[0];
4719       } else if (ResultRegs.size() == 2) {
4720         LLT V4S16 = LLT::fixed_vector(4, 16);
4721         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4722       } else {
4723         return false;
4724       }
4725     }
4726 
4727     if (MRI->getType(DstReg).getNumElements() <
4728         MRI->getType(NewResultReg).getNumElements()) {
4729       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4730     } else {
4731       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4732     }
4733     return true;
4734   }
4735 
4736   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4737   B.buildConcatVectors(DstReg, ResultRegs);
4738   return true;
4739 }
4740 
4741 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4742   LegalizerHelper &Helper, MachineInstr &MI) const {
4743   MachineIRBuilder &B = Helper.MIRBuilder;
4744   GISelChangeObserver &Observer = Helper.Observer;
4745 
4746   Register Dst = MI.getOperand(0).getReg();
4747   LLT Ty = B.getMRI()->getType(Dst);
4748   unsigned Size = Ty.getSizeInBits();
4749   MachineFunction &MF = B.getMF();
4750 
4751   Observer.changingInstr(MI);
4752 
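  // If the result type is better handled as an equivalently sized bitcast
  // type, rewrite the destination through a bitcast first.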
4753   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4754     Ty = getBitcastRegisterType(Ty);
4755     Helper.bitcastDst(MI, Ty, 0);
4756     Dst = MI.getOperand(0).getReg();
4757     B.setInsertPt(B.getMBB(), MI);
4758   }
4759 
4760   // FIXME: We don't really need this intermediate instruction. The intrinsic
4761   // should be fixed to have a memory operand. Since it's readnone, we're not
4762   // allowed to add one.
4763   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4764   MI.RemoveOperand(1); // Remove intrinsic ID
4765 
4766   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4767   // TODO: Should this use datalayout alignment?
4768   const unsigned MemSize = (Size + 7) / 8;
4769   const Align MemAlign(4);
4770   MachineMemOperand *MMO = MF.getMachineMemOperand(
4771       MachinePointerInfo(),
4772       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4773           MachineMemOperand::MOInvariant,
4774       MemSize, MemAlign);
4775   MI.addMemOperand(MF, MMO);
4776 
4777   // There are no 96-bit result scalar loads, but widening to 128-bit should
4778   // always be legal. We may need to restore this to a 96-bit result if it turns
4779   // out this needs to be converted to a vector load during RegBankSelect.
4780   if (!isPowerOf2_32(Size)) {
4781     if (Ty.isVector())
4782       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4783     else
4784       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4785   }
4786 
4787   Observer.changedInstr(MI);
4788   return true;
4789 }
4790 
4791 // TODO: Move to selection
4792 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4793                                                 MachineRegisterInfo &MRI,
4794                                                 MachineIRBuilder &B) const {
4795   if (!ST.isTrapHandlerEnabled() ||
4796       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4797     return legalizeTrapEndpgm(MI, MRI, B);
4798 
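  // Pre-V4 HSA ABIs pass the queue pointer to the trap handler; V4 can use the
  // simpler trap lowering when a doorbell ID is available.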
4799   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4800     switch (*HsaAbiVer) {
4801     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4802     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4803       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4804     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4805       return ST.supportsGetDoorbellID() ?
4806           legalizeTrapHsa(MI, MRI, B) :
4807           legalizeTrapHsaQueuePtr(MI, MRI, B);
4808     }
4809   }
4810 
4811   llvm_unreachable("Unknown trap handler");
4812 }
4813 
4814 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4815     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4816   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4817   MI.eraseFromParent();
4818   return true;
4819 }
4820 
4821 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4822     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as input, and emit the trap.
4824   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4825   Register LiveIn =
4826     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4827   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4828     return false;
4829 
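  // The trap handler ABI expects the queue pointer in SGPR0_SGPR1.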
4830   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4831   B.buildCopy(SGPR01, LiveIn);
4832   B.buildInstr(AMDGPU::S_TRAP)
4833       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4834       .addReg(SGPR01, RegState::Implicit);
4835 
4836   MI.eraseFromParent();
4837   return true;
4838 }
4839 
4840 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4841     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4842   B.buildInstr(AMDGPU::S_TRAP)
4843       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4844   MI.eraseFromParent();
4845   return true;
4846 }
4847 
4848 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4849     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning.
4852   if (!ST.isTrapHandlerEnabled() ||
4853       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4854     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4855                                      "debugtrap handler not supported",
4856                                      MI.getDebugLoc(), DS_Warning);
4857     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4858     Ctx.diagnose(NoTrap);
4859   } else {
4860     // Insert debug-trap instruction
4861     B.buildInstr(AMDGPU::S_TRAP)
4862         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4863   }
4864 
4865   MI.eraseFromParent();
4866   return true;
4867 }
4868 
4869 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4870                                                MachineIRBuilder &B) const {
4871   MachineRegisterInfo &MRI = *B.getMRI();
4872   const LLT S16 = LLT::scalar(16);
4873   const LLT S32 = LLT::scalar(32);
4874 
4875   Register DstReg = MI.getOperand(0).getReg();
4876   Register NodePtr = MI.getOperand(2).getReg();
4877   Register RayExtent = MI.getOperand(3).getReg();
4878   Register RayOrigin = MI.getOperand(4).getReg();
4879   Register RayDir = MI.getOperand(5).getReg();
4880   Register RayInvDir = MI.getOperand(6).getReg();
4881   Register TDescr = MI.getOperand(7).getReg();
4882 
4883   if (!ST.hasGFX10_AEncoding()) {
4884     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4885                                         "intrinsic not supported on subtarget",
4886                                         MI.getDebugLoc());
4887     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4888     return false;
4889   }
4890 
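  // The opcode depends on whether the node pointer is 32 or 64 bits and on
  // whether the ray directions are halves (a16); both change the number of
  // address dwords.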
4891   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4892   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4893   const unsigned NumVDataDwords = 4;
4894   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4895   const bool UseNSA =
4896       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4897   const unsigned BaseOpcodes[2][2] = {
4898       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4899       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4900        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4901   int Opcode;
4902   if (UseNSA) {
4903     Opcode =
4904         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4905                               NumVDataDwords, NumVAddrDwords);
4906   } else {
4907     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4908                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4909                                    PowerOf2Ceil(NumVAddrDwords));
4910   }
4911   assert(Opcode != -1);
4912 
4913   SmallVector<Register, 12> Ops;
4914   if (Is64) {
4915     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4916     Ops.push_back(Unmerge.getReg(0));
4917     Ops.push_back(Unmerge.getReg(1));
4918   } else {
4919     Ops.push_back(NodePtr);
4920   }
4921   Ops.push_back(RayExtent);
4922 
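  // Split a <3 x s32> vector operand into its three scalar dword lanes.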
4923   auto packLanes = [&Ops, &S32, &B](Register Src) {
4924     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4925     Ops.push_back(Unmerge.getReg(0));
4926     Ops.push_back(Unmerge.getReg(1));
4927     Ops.push_back(Unmerge.getReg(2));
4928   };
4929 
4930   packLanes(RayOrigin);
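  // With a16, the three ray direction and three inverse direction halves are
  // repacked into three dwords.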
4931   if (IsA16) {
4932     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4933     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4934     Register R1 = MRI.createGenericVirtualRegister(S32);
4935     Register R2 = MRI.createGenericVirtualRegister(S32);
4936     Register R3 = MRI.createGenericVirtualRegister(S32);
4937     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4938     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4939     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4940     Ops.push_back(R1);
4941     Ops.push_back(R2);
4942     Ops.push_back(R3);
4943   } else {
4944     packLanes(RayDir);
4945     packLanes(RayInvDir);
4946   }
4947 
4948   if (!UseNSA) {
4949     // Build a single vector containing all the operands so far prepared.
4950     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4951     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4952     Ops.clear();
4953     Ops.push_back(MergedOps);
4954   }
4955 
4956   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4957     .addDef(DstReg)
4958     .addImm(Opcode);
4959 
4960   for (Register R : Ops) {
4961     MIB.addUse(R);
4962   }
4963 
4964   MIB.addUse(TDescr)
4965      .addImm(IsA16 ? 1 : 0)
4966      .cloneMemRefs(MI);
4967 
4968   MI.eraseFromParent();
4969   return true;
4970 }
4971 
4972 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4973   B.buildConstant(MI.getOperand(0).getReg(), C);
4974   MI.eraseFromParent();
4975   return true;
4976 }
4977 
4978 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4979                                             MachineInstr &MI) const {
4980   MachineIRBuilder &B = Helper.MIRBuilder;
4981   MachineRegisterInfo &MRI = *B.getMRI();
4982 
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4984   auto IntrID = MI.getIntrinsicID();
4985   switch (IntrID) {
4986   case Intrinsic::amdgcn_if:
4987   case Intrinsic::amdgcn_else: {
4988     MachineInstr *Br = nullptr;
4989     MachineBasicBlock *UncondBrTarget = nullptr;
4990     bool Negated = false;
4991     if (MachineInstr *BrCond =
4992             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4993       const SIRegisterInfo *TRI
4994         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4995 
4996       Register Def = MI.getOperand(1).getReg();
4997       Register Use = MI.getOperand(3).getReg();
4998 
4999       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5000 
5001       if (Negated)
5002         std::swap(CondBrTarget, UncondBrTarget);
5003 
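      // Rebuild the branch at the position of the original G_BRCOND using the
      // wave-level control flow pseudo.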
5004       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5005       if (IntrID == Intrinsic::amdgcn_if) {
5006         B.buildInstr(AMDGPU::SI_IF)
5007           .addDef(Def)
5008           .addUse(Use)
5009           .addMBB(UncondBrTarget);
5010       } else {
5011         B.buildInstr(AMDGPU::SI_ELSE)
5012             .addDef(Def)
5013             .addUse(Use)
5014             .addMBB(UncondBrTarget);
5015       }
5016 
5017       if (Br) {
5018         Br->getOperand(0).setMBB(CondBrTarget);
5019       } else {
5020         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5021         // since we're swapping branch targets it needs to be reinserted.
5022         // FIXME: IRTranslator should probably not do this
5023         B.buildBr(*CondBrTarget);
5024       }
5025 
5026       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5027       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5028       MI.eraseFromParent();
5029       BrCond->eraseFromParent();
5030       return true;
5031     }
5032 
5033     return false;
5034   }
5035   case Intrinsic::amdgcn_loop: {
5036     MachineInstr *Br = nullptr;
5037     MachineBasicBlock *UncondBrTarget = nullptr;
5038     bool Negated = false;
5039     if (MachineInstr *BrCond =
5040             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5041       const SIRegisterInfo *TRI
5042         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5043 
5044       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5045       Register Reg = MI.getOperand(2).getReg();
5046 
5047       if (Negated)
5048         std::swap(CondBrTarget, UncondBrTarget);
5049 
5050       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5051       B.buildInstr(AMDGPU::SI_LOOP)
5052         .addUse(Reg)
5053         .addMBB(UncondBrTarget);
5054 
5055       if (Br)
5056         Br->getOperand(0).setMBB(CondBrTarget);
5057       else
5058         B.buildBr(*CondBrTarget);
5059 
5060       MI.eraseFromParent();
5061       BrCond->eraseFromParent();
5062       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5063       return true;
5064     }
5065 
5066     return false;
5067   }
5068   case Intrinsic::amdgcn_kernarg_segment_ptr:
5069     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5070       // This only makes sense to call in a kernel, so just lower to null.
5071       B.buildConstant(MI.getOperand(0).getReg(), 0);
5072       MI.eraseFromParent();
5073       return true;
5074     }
5075 
5076     return legalizePreloadedArgIntrin(
5077       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5078   case Intrinsic::amdgcn_implicitarg_ptr:
5079     return legalizeImplicitArgPtr(MI, MRI, B);
5080   case Intrinsic::amdgcn_workitem_id_x:
5081     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5082       return replaceWithConstant(B, MI, 0);
5083     return legalizePreloadedArgIntrin(MI, MRI, B,
5084                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5085   case Intrinsic::amdgcn_workitem_id_y:
5086     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5087       return replaceWithConstant(B, MI, 0);
5088 
5089     return legalizePreloadedArgIntrin(MI, MRI, B,
5090                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5091   case Intrinsic::amdgcn_workitem_id_z:
5092     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5093       return replaceWithConstant(B, MI, 0);
5094 
5095     return legalizePreloadedArgIntrin(MI, MRI, B,
5096                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5097   case Intrinsic::amdgcn_workgroup_id_x:
5098     return legalizePreloadedArgIntrin(MI, MRI, B,
5099                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5100   case Intrinsic::amdgcn_workgroup_id_y:
5101     return legalizePreloadedArgIntrin(MI, MRI, B,
5102                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5103   case Intrinsic::amdgcn_workgroup_id_z:
5104     return legalizePreloadedArgIntrin(MI, MRI, B,
5105                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5106   case Intrinsic::amdgcn_dispatch_ptr:
5107     return legalizePreloadedArgIntrin(MI, MRI, B,
5108                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5109   case Intrinsic::amdgcn_queue_ptr:
5110     return legalizePreloadedArgIntrin(MI, MRI, B,
5111                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5112   case Intrinsic::amdgcn_implicit_buffer_ptr:
5113     return legalizePreloadedArgIntrin(
5114       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5115   case Intrinsic::amdgcn_dispatch_id:
5116     return legalizePreloadedArgIntrin(MI, MRI, B,
5117                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5118   case Intrinsic::amdgcn_fdiv_fast:
5119     return legalizeFDIVFastIntrin(MI, MRI, B);
5120   case Intrinsic::amdgcn_is_shared:
5121     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5122   case Intrinsic::amdgcn_is_private:
5123     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5124   case Intrinsic::amdgcn_wavefrontsize: {
5125     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5126     MI.eraseFromParent();
5127     return true;
5128   }
5129   case Intrinsic::amdgcn_s_buffer_load:
5130     return legalizeSBufferLoad(Helper, MI);
5131   case Intrinsic::amdgcn_raw_buffer_store:
5132   case Intrinsic::amdgcn_struct_buffer_store:
5133     return legalizeBufferStore(MI, MRI, B, false, false);
5134   case Intrinsic::amdgcn_raw_buffer_store_format:
5135   case Intrinsic::amdgcn_struct_buffer_store_format:
5136     return legalizeBufferStore(MI, MRI, B, false, true);
5137   case Intrinsic::amdgcn_raw_tbuffer_store:
5138   case Intrinsic::amdgcn_struct_tbuffer_store:
5139     return legalizeBufferStore(MI, MRI, B, true, true);
5140   case Intrinsic::amdgcn_raw_buffer_load:
5141   case Intrinsic::amdgcn_struct_buffer_load:
5142     return legalizeBufferLoad(MI, MRI, B, false, false);
5143   case Intrinsic::amdgcn_raw_buffer_load_format:
5144   case Intrinsic::amdgcn_struct_buffer_load_format:
5145     return legalizeBufferLoad(MI, MRI, B, true, false);
5146   case Intrinsic::amdgcn_raw_tbuffer_load:
5147   case Intrinsic::amdgcn_struct_tbuffer_load:
5148     return legalizeBufferLoad(MI, MRI, B, true, true);
5149   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5150   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5151   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5152   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5153   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5154   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5155   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5156   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5157   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5158   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5159   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5160   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5161   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5162   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5163   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5164   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5165   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5166   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5167   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5168   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5169   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5170   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5171   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5172   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5173   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5174   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5175   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5176   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5177   case Intrinsic::amdgcn_buffer_atomic_fadd:
5178   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5179   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5180   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5181   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5182     return legalizeBufferAtomic(MI, B, IntrID);
5183   case Intrinsic::amdgcn_atomic_inc:
5184     return legalizeAtomicIncDec(MI, B, true);
5185   case Intrinsic::amdgcn_atomic_dec:
5186     return legalizeAtomicIncDec(MI, B, false);
5187   case Intrinsic::trap:
5188     return legalizeTrapIntrinsic(MI, MRI, B);
5189   case Intrinsic::debugtrap:
5190     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5191   case Intrinsic::amdgcn_rsq_clamp:
5192     return legalizeRsqClampIntrinsic(MI, MRI, B);
5193   case Intrinsic::amdgcn_ds_fadd:
5194   case Intrinsic::amdgcn_ds_fmin:
5195   case Intrinsic::amdgcn_ds_fmax:
5196     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5197   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5198     return legalizeBVHIntrinsic(MI, B);
5199   default: {
5200     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5201             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5202       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5203     return true;
5204   }
5205   }
5206 
5207   return true;
5208 }
5209