1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
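// For example, <5 x s32> becomes <8 x s32>.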
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
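// For example, s48 becomes s64.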
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
// Reduce the number of vector elements so each resulting piece is at most 64
// bits wide, e.g. <4 x s32> -> <2 x s32>.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
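// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).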
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
  // <6 x s16> -> <3 x s32>
  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
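// For example, a truncating store of an s64 value to 16 bits of memory has
// its value operand narrowed to s32 first.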
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
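// For example, a load of <8 x s16> is legalized as a <4 x s32> load followed
// by a bitcast.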
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
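/// For example, an s96 global load with 16-byte alignment may be widened to a
/// 128-bit load on subtargets without dwordx3 loads.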
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
843       .customFor({S16, S32})
844       .scalarize(0)
845       .lower();
846 
847   // Lower roundeven into G_FRINT
848   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
849     .scalarize(0)
850     .lower();
851 
852   if (ST.has16BitInsts()) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S16, S32, S64})
855       .clampScalar(0, S16, S64)
856       .scalarize(0);
857   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32, S64})
860       .clampScalar(0, S32, S64)
861       .scalarize(0);
862   } else {
863     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
864       .legalFor({S32})
865       .customFor({S64})
866       .clampScalar(0, S32, S64)
867       .scalarize(0);
868   }
869 
870   getActionDefinitionsBuilder(G_PTR_ADD)
871     .legalIf(all(isPointer(0), sameSize(0, 1)))
872     .scalarize(0)
873     .scalarSameSizeAs(1, 0);
874 
875   getActionDefinitionsBuilder(G_PTRMASK)
876     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
877     .scalarSameSizeAs(1, 0)
878     .scalarize(0);
879 
880   auto &CmpBuilder =
881     getActionDefinitionsBuilder(G_ICMP)
882     // The compare output type differs based on the register bank of the output,
883     // so make both s1 and s32 legal.
884     //
885     // Scalar compares producing output in scc will be promoted to s32, as that
886     // is the allocatable register type that will be needed for the copy from
887     // scc. This will be promoted during RegBankSelect, and we assume something
888     // before that won't try to use s32 result types.
889     //
890     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
891     // bank.
892     .legalForCartesianProduct(
893       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
894     .legalForCartesianProduct(
895       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
896   if (ST.has16BitInsts()) {
897     CmpBuilder.legalFor({{S1, S16}});
898   }
899 
900   CmpBuilder
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0)
904     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
905 
906   getActionDefinitionsBuilder(G_FCMP)
907     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
908     .widenScalarToNextPow2(1)
909     .clampScalar(1, S32, S64)
910     .scalarize(0);
911 
912   // FIXME: fpow has a selection pattern that should move to custom lowering.
913   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
914   if (ST.has16BitInsts())
915     Exp2Ops.legalFor({S32, S16});
916   else
917     Exp2Ops.legalFor({S32});
918   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
919   Exp2Ops.scalarize(0);
920 
921   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
922   if (ST.has16BitInsts())
923     ExpOps.customFor({{S32}, {S16}});
924   else
925     ExpOps.customFor({S32});
926   ExpOps.clampScalar(0, MinScalarFPTy, S32)
927         .scalarize(0);
928 
929   getActionDefinitionsBuilder(G_FPOWI)
930     .clampScalar(0, MinScalarFPTy, S32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder(G_CTPOP)
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .widenScalarToNextPow2(1, 32)
938     .clampScalar(1, S32, S64)
939     .scalarize(0)
940     .widenScalarToNextPow2(0, 32);
941 
942 
943   // The hardware instructions return a different result on 0 than the generic
944   // instructions expect. The hardware produces -1, but these produce the
945   // bitwidth.
946   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
947     .scalarize(0)
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .widenScalarToNextPow2(0, 32)
951     .widenScalarToNextPow2(1, 32)
952     .custom();
953 
954   // The 64-bit versions produce 32-bit results, but only on the SALU.
955   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
956     .legalFor({{S32, S32}, {S32, S64}})
957     .clampScalar(0, S32, S32)
958     .clampScalar(1, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0, 32)
961     .widenScalarToNextPow2(1, 32);
962 
963   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
964   // RegBankSelect.
965   getActionDefinitionsBuilder(G_BITREVERSE)
966     .legalFor({S32, S64})
967     .clampScalar(0, S32, S64)
968     .scalarize(0)
969     .widenScalarToNextPow2(0);
970 
971   if (ST.has16BitInsts()) {
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S16, S32, V2S16})
974       .clampMaxNumElementsStrict(0, S16, 2)
975       // FIXME: Fixing non-power-of-2 before clamp is workaround for
976       // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .clampScalar(0, S16, S32)
979       .scalarize(0);
980 
981     if (ST.hasVOP3PInsts()) {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16, V2S16})
984         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985         .clampMaxNumElements(0, S16, 2)
986         .minScalar(0, S16)
987         .widenScalarToNextPow2(0)
988         .scalarize(0)
989         .lower();
990     } else {
991       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
992         .legalFor({S32, S16})
993         .widenScalarToNextPow2(0)
994         .minScalar(0, S16)
995         .scalarize(0)
996         .lower();
997     }
998   } else {
999     // TODO: Should have same legality without v_perm_b32
1000     getActionDefinitionsBuilder(G_BSWAP)
1001       .legalFor({S32})
1002       .lowerIf(scalarNarrowerThan(0, 32))
1003       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1004       // narrowScalar limitation.
1005       .widenScalarToNextPow2(0)
1006       .maxScalar(0, S32)
1007       .scalarize(0)
1008       .lower();
1009 
1010     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011       .legalFor({S32})
1012       .minScalar(0, S32)
1013       .widenScalarToNextPow2(0)
1014       .scalarize(0)
1015       .lower();
1016   }
1017 
1018   getActionDefinitionsBuilder(G_INTTOPTR)
1019     // List the common cases
1020     .legalForCartesianProduct(AddrSpaces64, {S64})
1021     .legalForCartesianProduct(AddrSpaces32, {S32})
1022     .scalarize(0)
1023     // Accept any address space as long as the size matches
1024     .legalIf(sameSize(0, 1))
1025     .widenScalarIf(smallerThan(1, 0),
1026       [](const LegalityQuery &Query) {
1027         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1028       })
1029     .narrowScalarIf(largerThan(1, 0),
1030       [](const LegalityQuery &Query) {
1031         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1032       });
1033 
1034   getActionDefinitionsBuilder(G_PTRTOINT)
1035     // List the common cases
1036     .legalForCartesianProduct(AddrSpaces64, {S64})
1037     .legalForCartesianProduct(AddrSpaces32, {S32})
1038     .scalarize(0)
1039     // Accept any address space as long as the size matches
1040     .legalIf(sameSize(0, 1))
1041     .widenScalarIf(smallerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       })
1045     .narrowScalarIf(
1046       largerThan(0, 1),
1047       [](const LegalityQuery &Query) {
1048         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1049       });
1050 
1051   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1052     .scalarize(0)
1053     .custom();
1054 
1055   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1056                                     bool IsLoad) -> bool {
1057     const LLT DstTy = Query.Types[0];
1058 
1059     // Split vector extloads.
1060     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1061 
1062     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1063       return true;
1064 
1065     const LLT PtrTy = Query.Types[1];
1066     unsigned AS = PtrTy.getAddressSpace();
1067     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1068       return true;
1069 
1070     // Catch weird sized loads that don't evenly divide into the access sizes
1071     // TODO: May be able to widen depending on alignment etc.
1072     unsigned NumRegs = (MemSize + 31) / 32;
1073     if (NumRegs == 3) {
1074       if (!ST.hasDwordx3LoadStores())
1075         return true;
1076     } else {
1077       // If the alignment allows, these should have been widened.
1078       if (!isPowerOf2_32(NumRegs))
1079         return true;
1080     }
1081 
1082     return false;
1083   };
1084 
1085   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1086   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1087   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
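  // A required alignment of 0 in the memory descriptors below means the rule
  // applies regardless of the access's alignment.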
1088 
1089   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1090   // LDS
1091   // TODO: Unsupported flat for SI.
1092 
1093   for (unsigned Op : {G_LOAD, G_STORE}) {
1094     const bool IsStore = Op == G_STORE;
1095 
1096     auto &Actions = getActionDefinitionsBuilder(Op);
1097     // Explicitly list some common cases.
1098     // TODO: Does this help compile time at all?
1099     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1100                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1101                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1102                                       {S64, GlobalPtr, S64, GlobalAlign32},
1103                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1104                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1105                                       {S32, GlobalPtr, S8, GlobalAlign8},
1106                                       {S32, GlobalPtr, S16, GlobalAlign16},
1107 
1108                                       {S32, LocalPtr, S32, 32},
1109                                       {S64, LocalPtr, S64, 32},
1110                                       {V2S32, LocalPtr, V2S32, 32},
1111                                       {S32, LocalPtr, S8, 8},
1112                                       {S32, LocalPtr, S16, 16},
1113                                       {V2S16, LocalPtr, S32, 32},
1114 
1115                                       {S32, PrivatePtr, S32, 32},
1116                                       {S32, PrivatePtr, S8, 8},
1117                                       {S32, PrivatePtr, S16, 16},
1118                                       {V2S16, PrivatePtr, S32, 32},
1119 
1120                                       {S32, ConstantPtr, S32, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1122                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32}});
1125     Actions.legalIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return isLoadStoreLegal(ST, Query);
1128       });
1129 
1130     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1131     // 64-bits.
1132     //
1133     // TODO: Should generalize bitcast action into coerce, which will also cover
1134     // inserting addrspacecasts.
1135     Actions.customIf(typeIs(1, Constant32Ptr));
1136 
1137     // Turn any illegal element vectors into something easier to deal
1138     // with. These will ultimately produce 32-bit scalar shifts to extract the
1139     // parts anyway.
1140     //
1141     // For odd 16-bit element vectors, prefer to split those into pieces with
1142     // 16-bit vector parts.
1143     Actions.bitcastIf(
1144       [=](const LegalityQuery &Query) -> bool {
1145         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1146                                           Query.MMODescrs[0].MemoryTy);
1147       }, bitcastToRegisterType(0));
1148 
1149     if (!IsStore) {
1150       // Widen suitably aligned loads by loading extra bytes. The standard
1151       // legalization actions can't properly express widening memory operands.
1152       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1153         return shouldWidenLoad(ST, Query, G_LOAD);
1154       });
1155     }
1156 
1157     // FIXME: load/store narrowing should be moved to lower action
1158     Actions
1159         .narrowScalarIf(
1160             [=](const LegalityQuery &Query) -> bool {
1161               return !Query.Types[0].isVector() &&
1162                      needToSplitMemOp(Query, Op == G_LOAD);
1163             },
1164             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1165               const LLT DstTy = Query.Types[0];
1166               const LLT PtrTy = Query.Types[1];
1167 
1168               const unsigned DstSize = DstTy.getSizeInBits();
1169               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1170 
1171               // Split extloads.
1172               if (DstSize > MemSize)
1173                 return std::make_pair(0, LLT::scalar(MemSize));
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1204               if (MemSize > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                       0, LLT::scalarOrVector(
1211                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1212                 }
1213 
1214                 unsigned NumPieces = MemSize / MaxSize;
1215 
1216                 // FIXME: Refine when odd breakdowns handled
1217                 // The scalars will need to be re-legalized.
1218                 if (NumPieces == 1 || NumPieces >= NumElts ||
1219                     NumElts % NumPieces != 0)
1220                   return std::make_pair(0, EltTy);
1221 
1222                 return std::make_pair(
1223                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1224               }
1225 
1226               // FIXME: We could probably handle weird extending loads better.
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                     0, LLT::scalarOrVector(
1239                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1240               }
1241 
1242               // May need relegalization for the scalars.
1243               return std::make_pair(0, EltTy);
1244             })
1245     .minScalar(0, S32)
1246     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1247     .widenScalarToNextPow2(0)
1248     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1249     .lower();
1250   }
1251 
1252   // FIXME: Unaligned accesses not lowered.
1253   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1254                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1255                                                   {S32, GlobalPtr, S16, 2 * 8},
1256                                                   {S32, LocalPtr, S8, 8},
1257                                                   {S32, LocalPtr, S16, 16},
1258                                                   {S32, PrivatePtr, S8, 8},
1259                                                   {S32, PrivatePtr, S16, 16},
1260                                                   {S32, ConstantPtr, S8, 8},
1261                                                   {S32, ConstantPtr, S16, 2 * 8}})
1262                        .legalIf(
1263                          [=](const LegalityQuery &Query) -> bool {
1264                            return isLoadStoreLegal(ST, Query);
1265                          });
1266 
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1270   }
1271 
1272   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1273   // 64-bits.
1274   //
1275   // TODO: Should generalize bitcast action into coerce, which will also cover
1276   // inserting addrspacecasts.
1277   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1278 
1279   ExtLoads.clampScalar(0, S32, S32)
1280           .widenScalarToNextPow2(0)
1281           .lower();
1282 
1283   auto &Atomics = getActionDefinitionsBuilder(
1284     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287      G_ATOMICRMW_UMIN})
1288     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289                {S64, GlobalPtr}, {S64, LocalPtr},
1290                {S32, RegionPtr}, {S64, RegionPtr}});
1291   if (ST.hasFlatAddressSpace()) {
1292     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293   }
1294 
1295   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296   if (ST.hasLDSFPAtomicAdd()) {
1297     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298     if (ST.hasGFX90AInsts())
1299       Atomic.legalFor({{S64, LocalPtr}});
1300   }
1301   if (ST.hasAtomicFaddInsts())
1302     Atomic.legalFor({{S32, GlobalPtr}});
1303 
1304   if (ST.hasGFX90AInsts()) {
1305     // These are legal with some caveats, and should have undergone expansion in
1306     // the IR in most situations
1307     // TODO: Move atomic expansion into legalizer
1308     // TODO: Also supports <2 x f16>
1309     Atomic.legalFor({
1310         {S32, GlobalPtr},
1311         {S64, GlobalPtr},
1312         {S64, FlatPtr}
1313       });
1314   }
1315 
1316   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1317   // demarshalling
1318   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1319     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1320                 {S32, FlatPtr}, {S64, FlatPtr}})
1321     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1322                {S32, RegionPtr}, {S64, RegionPtr}});
1323   // TODO: Pointer types, any 32-bit or 64-bit vector
1324 
1325   // Condition should be s32 for scalar, s1 for vector.
1326   getActionDefinitionsBuilder(G_SELECT)
1327       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1328                                  LocalPtr, FlatPtr, PrivatePtr,
1329                                  LLT::fixed_vector(2, LocalPtr),
1330                                  LLT::fixed_vector(2, PrivatePtr)},
1331                                 {S1, S32})
1332       .clampScalar(0, S16, S64)
1333       .scalarize(1)
1334       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1335       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1336       .clampMaxNumElements(0, S32, 2)
1337       .clampMaxNumElements(0, LocalPtr, 2)
1338       .clampMaxNumElements(0, PrivatePtr, 2)
1339       .scalarize(0)
1340       .widenScalarToNextPow2(0)
1341       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1342 
1343   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1344   // be more flexible with the shift amount type.
1345   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1346     .legalFor({{S32, S32}, {S64, S32}});
1347   if (ST.has16BitInsts()) {
1348     if (ST.hasVOP3PInsts()) {
1349       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1350             .clampMaxNumElements(0, S16, 2);
1351     } else
1352       Shifts.legalFor({{S16, S16}});
1353 
1354     // TODO: Support 16-bit shift amounts for all types
1355     Shifts.widenScalarIf(
1356       [=](const LegalityQuery &Query) {
1357         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1358         // 32-bit amount.
1359         const LLT ValTy = Query.Types[0];
1360         const LLT AmountTy = Query.Types[1];
1361         return ValTy.getSizeInBits() <= 16 &&
1362                AmountTy.getSizeInBits() < 16;
1363       }, changeTo(1, S16));
1364     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1365     Shifts.clampScalar(1, S32, S32);
1366     Shifts.widenScalarToNextPow2(0, 16);
1367     Shifts.clampScalar(0, S16, S64);
1368 
1369     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1370       .minScalar(0, S16)
1371       .scalarize(0)
1372       .lower();
1373   } else {
1374     // Make sure we legalize the shift amount type first, as the general
1375     // expansion for the shifted type will produce much worse code if it hasn't
1376     // been truncated already.
1377     Shifts.clampScalar(1, S32, S32);
1378     Shifts.widenScalarToNextPow2(0, 32);
1379     Shifts.clampScalar(0, S32, S64);
1380 
1381     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1382       .minScalar(0, S32)
1383       .scalarize(0)
1384       .lower();
1385   }
1386   Shifts.scalarize(0);
1387 
1388   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1389     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1390     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1391     unsigned IdxTypeIdx = 2;
1392 
1393     getActionDefinitionsBuilder(Op)
1394       .customIf([=](const LegalityQuery &Query) {
1395           const LLT EltTy = Query.Types[EltTypeIdx];
1396           const LLT VecTy = Query.Types[VecTypeIdx];
1397           const LLT IdxTy = Query.Types[IdxTypeIdx];
1398           const unsigned EltSize = EltTy.getSizeInBits();
1399           return (EltSize == 32 || EltSize == 64) &&
1400                   VecTy.getSizeInBits() % 32 == 0 &&
1401                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1402                   IdxTy.getSizeInBits() == 32;
1403         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1406       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1407       .bitcastIf(
1408         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1409         [=](const LegalityQuery &Query) {
1410           // For > 64-bit element types, try to turn this into a 64-bit
1411           // element vector since we may be able to do better indexing
1412           // if this is scalar. If not, fall back to 32.
1413           const LLT EltTy = Query.Types[EltTypeIdx];
1414           const LLT VecTy = Query.Types[VecTypeIdx];
1415           const unsigned DstEltSize = EltTy.getSizeInBits();
1416           const unsigned VecSize = VecTy.getSizeInBits();
1417 
1418           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1419           return std::make_pair(
1420               VecTypeIdx,
1421               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1422         })
1423       .clampScalar(EltTypeIdx, S32, S64)
1424       .clampScalar(VecTypeIdx, S32, S64)
1425       .clampScalar(IdxTypeIdx, S32, S32)
1426       .clampMaxNumElements(VecTypeIdx, S32, 32)
1427       // TODO: Clamp elements for 64-bit vectors?
1428       // It should only be necessary with variable indexes.
1429       // As a last resort, lower to the stack
1430       .lower();
1431   }
1432 
1433   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1434     .unsupportedIf([=](const LegalityQuery &Query) {
1435         const LLT &EltTy = Query.Types[1].getElementType();
1436         return Query.Types[0] != EltTy;
1437       });
1438 
1439   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1440     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1441     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1442 
1443     // FIXME: Doesn't handle extract of illegal sizes.
1444     getActionDefinitionsBuilder(Op)
1445       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1446       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1448           // TODO: verify immediate offset here since lower only works with
1449           // whole elements.
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           return BigTy.isVector();
1452         })
1453       // FIXME: Multiples of 16 should not be legal.
1454       .legalIf([=](const LegalityQuery &Query) {
1455           const LLT BigTy = Query.Types[BigTyIdx];
1456           const LLT LitTy = Query.Types[LitTyIdx];
1457           return (BigTy.getSizeInBits() % 32 == 0) &&
1458                  (LitTy.getSizeInBits() % 16 == 0);
1459         })
1460       .widenScalarIf(
1461         [=](const LegalityQuery &Query) {
1462           const LLT BigTy = Query.Types[BigTyIdx];
1463           return (BigTy.getScalarSizeInBits() < 16);
1464         },
1465         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1466       .widenScalarIf(
1467         [=](const LegalityQuery &Query) {
1468           const LLT LitTy = Query.Types[LitTyIdx];
1469           return (LitTy.getScalarSizeInBits() < 16);
1470         },
1471         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1472       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1473       .widenScalarToNextPow2(BigTyIdx, 32);
1474 
1475   }
1476 
1477   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1478     .legalForCartesianProduct(AllS32Vectors, {S32})
1479     .legalForCartesianProduct(AllS64Vectors, {S64})
1480     .clampNumElements(0, V16S32, V32S32)
1481     .clampNumElements(0, V2S64, V16S64)
1482     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1483 
1484   if (ST.hasScalarPackInsts()) {
1485     BuildVector
1486       // FIXME: Should probably widen s1 vectors straight to s32
1487       .minScalarOrElt(0, S16)
1488       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1489       .minScalar(1, S32);
1490 
1491     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1492       .legalFor({V2S16, S32})
1493       .lower();
1494     BuildVector.minScalarOrElt(0, S32);
1495   } else {
1496     BuildVector.customFor({V2S16, S16});
1497     BuildVector.minScalarOrElt(0, S32);
1498 
1499     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1500       .customFor({V2S16, S32})
1501       .lower();
1502   }
1503 
1504   BuildVector.legalIf(isRegisterType(0));
1505 
1506   // FIXME: Clamp maximum size
1507   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1508     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1509     .clampMaxNumElements(0, S32, 32)
1510     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1511     .clampMaxNumElements(0, S16, 64);
1512 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1515   if (ST.hasVOP3PInsts()) {
1516     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1517       .customFor({V2S16, V2S16})
1518       .lower();
1519   } else
1520     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1521 
1522   // Merge/Unmerge
1523   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1524     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1525     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1526 
1527     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1528       const LLT Ty = Query.Types[TypeIdx];
1529       if (Ty.isVector()) {
1530         const LLT &EltTy = Ty.getElementType();
1531         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1532           return true;
1533         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1534           return true;
1535       }
1536       return false;
1537     };
1538 
1539     auto &Builder = getActionDefinitionsBuilder(Op)
1540       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1541       .lowerFor({{S16, V2S16}})
1542       .lowerIf([=](const LegalityQuery &Query) {
1543           const LLT BigTy = Query.Types[BigTyIdx];
1544           return BigTy.getSizeInBits() == 32;
1545         })
1546       // Try to widen to s16 first for small types.
1547       // TODO: Only do this on targets with legal s16 shifts
1548       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1549       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1550       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1551       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1552                            elementTypeIs(1, S16)),
1553                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1557       .clampScalar(LitTyIdx, S32, S512)
1558       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1559       // Break up vectors with weird elements into scalars
1560       .fewerElementsIf(
1561         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1562         scalarize(0))
1563       .fewerElementsIf(
1564         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1565         scalarize(1))
1566       .clampScalar(BigTyIdx, S32, MaxScalar);
1567 
1568     if (Op == G_MERGE_VALUES) {
1569       Builder.widenScalarIf(
1570         // TODO: Use 16-bit shifts if legal for 8-bit values?
1571         [=](const LegalityQuery &Query) {
1572           const LLT Ty = Query.Types[LitTyIdx];
1573           return Ty.getSizeInBits() < 32;
1574         },
1575         changeTo(LitTyIdx, S32));
1576     }
1577 
1578     Builder.widenScalarIf(
1579       [=](const LegalityQuery &Query) {
1580         const LLT Ty = Query.Types[BigTyIdx];
1581         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1582           Ty.getSizeInBits() % 16 != 0;
1583       },
1584       [=](const LegalityQuery &Query) {
        // Pick the next power of 2 or, for sizes over 128, a multiple of 64,
        // whichever is smaller.
1587         const LLT &Ty = Query.Types[BigTyIdx];
1588         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1589         if (NewSizeInBits >= 256) {
1590           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1591           if (RoundedTo < NewSizeInBits)
1592             NewSizeInBits = RoundedTo;
1593         }
1594         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1595       })
1596       // Any vectors left are the wrong size. Scalarize them.
1597       .scalarize(0)
1598       .scalarize(1);
1599   }
1600 
1601   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1602   // RegBankSelect.
1603   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1604     .legalFor({{S32}, {S64}});
1605 
1606   if (ST.hasVOP3PInsts()) {
1607     SextInReg.lowerFor({{V2S16}})
1608       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1609       // get more vector shift opportunities, since we'll get those when
1610       // expanded.
1611       .clampMaxNumElementsStrict(0, S16, 2);
1612   } else if (ST.has16BitInsts()) {
1613     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1614   } else {
1615     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1617     SextInReg.lowerFor({{S32}, {S64}});
1618   }
1619 
1620   SextInReg
1621     .scalarize(0)
1622     .clampScalar(0, S32, S64)
1623     .lower();
1624 
1625   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1626     .scalarize(0)
1627     .lower();
1628 
  // TODO: Only try to form v2s16 with legal packed instructions.
1630   getActionDefinitionsBuilder(G_FSHR)
1631     .legalFor({{S32, S32}})
1632     .lowerFor({{V2S16, V2S16}})
1633     .clampMaxNumElementsStrict(0, S16, 2)
1634     .scalarize(0)
1635     .lower();
1636 
1637   if (ST.hasVOP3PInsts()) {
1638     getActionDefinitionsBuilder(G_FSHL)
1639       .lowerFor({{V2S16, V2S16}})
1640       .clampMaxNumElementsStrict(0, S16, 2)
1641       .scalarize(0)
1642       .lower();
1643   } else {
1644     getActionDefinitionsBuilder(G_FSHL)
1645       .scalarize(0)
1646       .lower();
1647   }
1648 
1649   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650     .legalFor({S64});
1651 
1652   getActionDefinitionsBuilder(G_FENCE)
1653     .alwaysLegal();
1654 
1655   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656       .scalarize(0)
1657       .minScalar(0, S32)
1658       .lower();
1659 
1660   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661       .legalFor({{S32, S32}, {S64, S32}})
1662       .clampScalar(1, S32, S32)
1663       .clampScalar(0, S32, S64)
1664       .widenScalarToNextPow2(0)
1665       .scalarize(0);
1666 
1667   getActionDefinitionsBuilder({
1668       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669       G_FCOPYSIGN,
1670 
1671       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672       G_ATOMICRMW_NAND,
1673       G_ATOMICRMW_FSUB,
1674       G_READ_REGISTER,
1675       G_WRITE_REGISTER,
1676 
1677       G_SADDO, G_SSUBO,
1678 
      // TODO: Implement
1680       G_FMINIMUM, G_FMAXIMUM}).lower();
1681 
1682   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1683       .lower();
1684 
1685   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1686         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1687         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1688     .unsupported();
1689 
1690   getLegacyLegalizerInfo().computeTables();
1691   verify(*ST.getInstrInfo());
1692 }
1693 
1694 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1695                                          MachineInstr &MI) const {
1696   MachineIRBuilder &B = Helper.MIRBuilder;
1697   MachineRegisterInfo &MRI = *B.getMRI();
1698 
1699   switch (MI.getOpcode()) {
1700   case TargetOpcode::G_ADDRSPACE_CAST:
1701     return legalizeAddrSpaceCast(MI, MRI, B);
1702   case TargetOpcode::G_FRINT:
1703     return legalizeFrint(MI, MRI, B);
1704   case TargetOpcode::G_FCEIL:
1705     return legalizeFceil(MI, MRI, B);
1706   case TargetOpcode::G_FREM:
1707     return legalizeFrem(MI, MRI, B);
1708   case TargetOpcode::G_INTRINSIC_TRUNC:
1709     return legalizeIntrinsicTrunc(MI, MRI, B);
1710   case TargetOpcode::G_SITOFP:
1711     return legalizeITOFP(MI, MRI, B, true);
1712   case TargetOpcode::G_UITOFP:
1713     return legalizeITOFP(MI, MRI, B, false);
1714   case TargetOpcode::G_FPTOSI:
1715     return legalizeFPTOI(MI, MRI, B, true);
1716   case TargetOpcode::G_FPTOUI:
1717     return legalizeFPTOI(MI, MRI, B, false);
1718   case TargetOpcode::G_FMINNUM:
1719   case TargetOpcode::G_FMAXNUM:
1720   case TargetOpcode::G_FMINNUM_IEEE:
1721   case TargetOpcode::G_FMAXNUM_IEEE:
1722     return legalizeMinNumMaxNum(Helper, MI);
1723   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1724     return legalizeExtractVectorElt(MI, MRI, B);
1725   case TargetOpcode::G_INSERT_VECTOR_ELT:
1726     return legalizeInsertVectorElt(MI, MRI, B);
1727   case TargetOpcode::G_SHUFFLE_VECTOR:
1728     return legalizeShuffleVector(MI, MRI, B);
1729   case TargetOpcode::G_FSIN:
1730   case TargetOpcode::G_FCOS:
1731     return legalizeSinCos(MI, MRI, B);
1732   case TargetOpcode::G_GLOBAL_VALUE:
1733     return legalizeGlobalValue(MI, MRI, B);
1734   case TargetOpcode::G_LOAD:
1735   case TargetOpcode::G_SEXTLOAD:
1736   case TargetOpcode::G_ZEXTLOAD:
1737     return legalizeLoad(Helper, MI);
1738   case TargetOpcode::G_FMAD:
1739     return legalizeFMad(MI, MRI, B);
1740   case TargetOpcode::G_FDIV:
1741     return legalizeFDIV(MI, MRI, B);
1742   case TargetOpcode::G_UDIV:
1743   case TargetOpcode::G_UREM:
1744   case TargetOpcode::G_UDIVREM:
1745     return legalizeUnsignedDIV_REM(MI, MRI, B);
1746   case TargetOpcode::G_SDIV:
1747   case TargetOpcode::G_SREM:
1748   case TargetOpcode::G_SDIVREM:
1749     return legalizeSignedDIV_REM(MI, MRI, B);
1750   case TargetOpcode::G_ATOMIC_CMPXCHG:
1751     return legalizeAtomicCmpXChg(MI, MRI, B);
1752   case TargetOpcode::G_FLOG:
1753     return legalizeFlog(MI, B, numbers::ln2f);
1754   case TargetOpcode::G_FLOG10:
1755     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1756   case TargetOpcode::G_FEXP:
1757     return legalizeFExp(MI, B);
1758   case TargetOpcode::G_FPOW:
1759     return legalizeFPow(MI, B);
1760   case TargetOpcode::G_FFLOOR:
1761     return legalizeFFloor(MI, MRI, B);
1762   case TargetOpcode::G_BUILD_VECTOR:
1763     return legalizeBuildVector(MI, MRI, B);
1764   case TargetOpcode::G_CTLZ:
1765   case TargetOpcode::G_CTTZ:
1766     return legalizeCTLZ_CTTZ(MI, MRI, B);
1767   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1768     return legalizeFPTruncRound(MI, B);
1769   default:
1770     return false;
1771   }
1772 
1773   llvm_unreachable("expected switch to return");
1774 }
1775 
1776 Register AMDGPULegalizerInfo::getSegmentAperture(
1777   unsigned AS,
1778   MachineRegisterInfo &MRI,
1779   MachineIRBuilder &B) const {
1780   MachineFunction &MF = B.getMF();
1781   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1782   const LLT S32 = LLT::scalar(32);
1783 
1784   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1785 
1786   if (ST.hasApertureRegs()) {
1787     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1788     // getreg.
1789     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1790         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1791         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1792     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1793         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1794         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1795     unsigned Encoding =
1796         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1797         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1798         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1799 
1800     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1801 
1802     B.buildInstr(AMDGPU::S_GETREG_B32)
1803       .addDef(GetReg)
1804       .addImm(Encoding);
1805     MRI.setType(GetReg, S32);
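
    // The MEM_BASES hardware register field only holds the top 16 bits of the
    // aperture, so shift the GETREG result left by the field width to
    // reconstruct the 32-bit aperture (the high half of the 64-bit segment
    // base address).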
1806 
1807     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1808     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1809   }
1810 
1811   Register QueuePtr = MRI.createGenericVirtualRegister(
1812     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1813 
1814   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1815     return Register();
1816 
1817   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1818   // private_segment_aperture_base_hi.
1819   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1820 
1821   // TODO: can we be smarter about machine pointer info?
1822   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1823   MachineMemOperand *MMO = MF.getMachineMemOperand(
1824       PtrInfo,
1825       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1826           MachineMemOperand::MOInvariant,
1827       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1828 
1829   Register LoadAddr;
1830 
1831   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1832   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1833 }
1834 
1835 /// Return true if the value is a known valid address, such that a null check is
1836 /// not necessary.
1837 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1838                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1839   MachineInstr *Def = MRI.getVRegDef(Val);
1840   switch (Def->getOpcode()) {
1841   case AMDGPU::G_FRAME_INDEX:
1842   case AMDGPU::G_GLOBAL_VALUE:
1843   case AMDGPU::G_BLOCK_ADDR:
1844     return true;
1845   case AMDGPU::G_CONSTANT: {
1846     const ConstantInt *CI = Def->getOperand(1).getCImm();
1847     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1848   }
1849   default:
1850     return false;
1851   }
1852 
1853   return false;
1854 }
1855 
1856 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1857   MachineInstr &MI, MachineRegisterInfo &MRI,
1858   MachineIRBuilder &B) const {
1859   MachineFunction &MF = B.getMF();
1860 
1861   const LLT S32 = LLT::scalar(32);
1862   Register Dst = MI.getOperand(0).getReg();
1863   Register Src = MI.getOperand(1).getReg();
1864 
1865   LLT DstTy = MRI.getType(Dst);
1866   LLT SrcTy = MRI.getType(Src);
1867   unsigned DestAS = DstTy.getAddressSpace();
1868   unsigned SrcAS = SrcTy.getAddressSpace();
1869 
1870   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1871   // vector element.
1872   assert(!DstTy.isVector());
1873 
1874   const AMDGPUTargetMachine &TM
1875     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1876 
1877   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1878     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1879     return true;
1880   }
1881 
1882   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1883       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1884        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1885     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1886       // Extract low 32-bits of the pointer.
1887       B.buildExtract(Dst, Src, 0);
1888       MI.eraseFromParent();
1889       return true;
1890     }
1891 
1892     unsigned NullVal = TM.getNullPointerValue(DestAS);
1893 
1894     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1895     auto FlatNull = B.buildConstant(SrcTy, 0);
1896 
1897     // Extract low 32-bits of the pointer.
1898     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1899 
1900     auto CmpRes =
1901         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1902     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1903 
1904     MI.eraseFromParent();
1905     return true;
1906   }
1907 
1908   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1909       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1910        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1911     if (!ST.hasFlatAddressSpace())
1912       return false;
1913 
1914     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1915     if (!ApertureReg.isValid())
1916       return false;
1917 
1918     // Coerce the type of the low half of the result so we can use merge_values.
1919     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1920 
1921     // TODO: Should we allow mismatched types but matching sizes in merges to
1922     // avoid the ptrtoint?
1923     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1924 
1925     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1926       B.buildCopy(Dst, BuildPtr);
1927       MI.eraseFromParent();
1928       return true;
1929     }
1930 
1931     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1932     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1933 
1934     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1935                               SegmentNull.getReg(0));
1936 
1937     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1938 
1939     MI.eraseFromParent();
1940     return true;
1941   }
1942 
1943   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1944       SrcTy.getSizeInBits() == 64) {
1945     // Truncate.
1946     B.buildExtract(Dst, Src, 0);
1947     MI.eraseFromParent();
1948     return true;
1949   }
1950 
1951   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1952       DstTy.getSizeInBits() == 64) {
1953     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1954     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1955 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1959     auto HighAddr = B.buildConstant(
1960         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1961     B.buildMerge(Dst, {Src, HighAddr});
1962     MI.eraseFromParent();
1963     return true;
1964   }
1965 
1966   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1967       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
1968 
1969   LLVMContext &Ctx = MF.getFunction().getContext();
1970   Ctx.diagnose(InvalidAddrSpaceCast);
1971   B.buildUndef(Dst);
1972   MI.eraseFromParent();
1973   return true;
1974 }
1975 
1976 bool AMDGPULegalizerInfo::legalizeFrint(
1977   MachineInstr &MI, MachineRegisterInfo &MRI,
1978   MachineIRBuilder &B) const {
1979   Register Src = MI.getOperand(1).getReg();
1980   LLT Ty = MRI.getType(Src);
1981   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
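
  // Round to the nearest integer by adding and subtracting 2^52 with the
  // source's sign: at that magnitude an f64 has no fraction bits, so the add
  // and subtract round away the fractional part. Inputs whose magnitude is
  // already near 2^52 or above are integral and pass through the final select.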
1982 
1983   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1984   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1985 
1986   auto C1 = B.buildFConstant(Ty, C1Val);
1987   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1988 
1989   // TODO: Should this propagate fast-math-flags?
1990   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1991   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1992 
1993   auto C2 = B.buildFConstant(Ty, C2Val);
1994   auto Fabs = B.buildFAbs(Ty, Src);
1995 
1996   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1997   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1998   MI.eraseFromParent();
1999   return true;
2000 }
2001 
2002 bool AMDGPULegalizerInfo::legalizeFceil(
2003   MachineInstr &MI, MachineRegisterInfo &MRI,
2004   MachineIRBuilder &B) const {
2005 
2006   const LLT S1 = LLT::scalar(1);
2007   const LLT S64 = LLT::scalar(64);
2008 
2009   Register Src = MI.getOperand(1).getReg();
2010   assert(MRI.getType(Src) == S64);
2011 
2012   // result = trunc(src)
2013   // if (src > 0.0 && src != result)
2014   //   result += 1.0
2015 
2016   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2017 
2018   const auto Zero = B.buildFConstant(S64, 0.0);
2019   const auto One = B.buildFConstant(S64, 1.0);
2020   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2021   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2022   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2023   auto Add = B.buildSelect(S64, And, One, Zero);
2024 
2025   // TODO: Should this propagate fast-math-flags?
2026   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2027   MI.eraseFromParent();
2028   return true;
2029 }
2030 
2031 bool AMDGPULegalizerInfo::legalizeFrem(
2032   MachineInstr &MI, MachineRegisterInfo &MRI,
2033   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
2046 }
2047 
2048 static MachineInstrBuilder extractF64Exponent(Register Hi,
2049                                               MachineIRBuilder &B) {
2050   const unsigned FractBits = 52;
2051   const unsigned ExpBits = 11;
2052   LLT S32 = LLT::scalar(32);
2053 
2054   auto Const0 = B.buildConstant(S32, FractBits - 32);
2055   auto Const1 = B.buildConstant(S32, ExpBits);
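
  // amdgcn.ubfe extracts the 11-bit exponent field, which starts at bit 20 of
  // the high dword (bit 52 of the f64); the final subtraction removes the
  // exponent bias of 1023.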
2056 
2057   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2058     .addUse(Hi)
2059     .addUse(Const0.getReg(0))
2060     .addUse(Const1.getReg(0));
2061 
2062   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2063 }
2064 
2065 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2066   MachineInstr &MI, MachineRegisterInfo &MRI,
2067   MachineIRBuilder &B) const {
2068   const LLT S1 = LLT::scalar(1);
2069   const LLT S32 = LLT::scalar(32);
2070   const LLT S64 = LLT::scalar(64);
2071 
2072   Register Src = MI.getOperand(1).getReg();
2073   assert(MRI.getType(Src) == S64);
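
  // The unbiased exponent determines how many of the 52 fraction bits are
  // actually fractional; those bits are masked off. An exponent < 0 means
  // |x| < 1, so the result is just the sign bit (+/-0.0), while an exponent
  // > 51 means the value is already an integer and is returned unchanged.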
2074 
2075   // TODO: Should this use extract since the low half is unused?
2076   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2077   Register Hi = Unmerge.getReg(1);
2078 
2079   // Extract the upper half, since this is where we will find the sign and
2080   // exponent.
2081   auto Exp = extractF64Exponent(Hi, B);
2082 
2083   const unsigned FractBits = 52;
2084 
2085   // Extract the sign bit.
2086   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2087   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2088 
2089   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2090 
2091   const auto Zero32 = B.buildConstant(S32, 0);
2092 
2093   // Extend back to 64-bits.
2094   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2095 
2096   auto Shr = B.buildAShr(S64, FractMask, Exp);
2097   auto Not = B.buildNot(S64, Shr);
2098   auto Tmp0 = B.buildAnd(S64, Src, Not);
2099   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2100 
2101   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2102   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2103 
2104   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2105   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 bool AMDGPULegalizerInfo::legalizeITOFP(
2111   MachineInstr &MI, MachineRegisterInfo &MRI,
2112   MachineIRBuilder &B, bool Signed) const {
2113 
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Src = MI.getOperand(1).getReg();
2116 
2117   const LLT S64 = LLT::scalar(64);
2118   const LLT S32 = LLT::scalar(32);
2119 
2120   assert(MRI.getType(Src) == S64);
2121 
2122   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2123   auto ThirtyTwo = B.buildConstant(S32, 32);
2124 
2125   if (MRI.getType(Dst) == S64) {
2126     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2127                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2128 
2129     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2130     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2131                      .addUse(CvtHi.getReg(0))
2132                      .addUse(ThirtyTwo.getReg(0));
2133 
2134     // TODO: Should this propagate fast-math-flags?
2135     B.buildFAdd(Dst, LdExp, CvtLo);
2136     MI.eraseFromParent();
2137     return true;
2138   }
2139 
2140   assert(MRI.getType(Dst) == S32);
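
  // To produce an f32, shift the 64-bit source left so its significant bits
  // land in the high word, convert that word, and rescale the result by
  // (32 - shift) with ldexp. Any nonzero bits remaining in the low word are
  // OR'd into bit 0 of the high word as a sticky bit so rounding stays
  // correct. For the signed case, the shift is based on the number of
  // redundant sign bits instead.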
2141 
2142   auto One = B.buildConstant(S32, 1);
2143 
2144   MachineInstrBuilder ShAmt;
2145   if (Signed) {
2146     auto ThirtyOne = B.buildConstant(S32, 31);
2147     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2148     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2149     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2150     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2151                                /*HasSideEffects=*/false)
2152                   .addUse(Unmerge.getReg(1));
2153     auto LS2 = B.buildSub(S32, LS, One);
2154     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2155   } else
2156     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2157   auto Norm = B.buildShl(S64, Src, ShAmt);
2158   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2159   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2160   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2161   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2162   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2163   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2164                    /*HasSideEffects=*/false)
2165       .addUse(FVal.getReg(0))
2166       .addUse(Scale.getReg(0));
2167   MI.eraseFromParent();
2168   return true;
2169 }
2170 
2171 // TODO: Copied from DAG implementation. Verify logic and document how this
2172 // actually works.
2173 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2174                                         MachineRegisterInfo &MRI,
2175                                         MachineIRBuilder &B,
2176                                         bool Signed) const {
2177 
2178   Register Dst = MI.getOperand(0).getReg();
2179   Register Src = MI.getOperand(1).getReg();
2180 
2181   const LLT S64 = LLT::scalar(64);
2182   const LLT S32 = LLT::scalar(32);
2183 
2184   const LLT SrcLT = MRI.getType(Src);
2185   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2186 
2187   unsigned Flags = MI.getFlags();
2188 
2189   // The basic idea of converting a floating point number into a pair of 32-bit
2190   // integers is illustrated as follows:
2191   //
2192   //     tf := trunc(val);
2193   //    hif := floor(tf * 2^-32);
2194   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2195   //     hi := fptoi(hif);
2196   //     lo := fptoi(lof);
2197   //
2198   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2199   MachineInstrBuilder Sign;
2200   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2206     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2207     Trunc = B.buildFAbs(S32, Trunc, Flags);
2208   }
2209   MachineInstrBuilder K0, K1;
2210   if (SrcLT == S64) {
2211     K0 = B.buildFConstant(S64,
2212                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2213     K1 = B.buildFConstant(S64,
2214                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2215   } else {
2216     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2217     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2218   }
2219 
2220   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2221   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2222   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2223 
2224   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2225                                      : B.buildFPTOUI(S32, FloorMul);
2226   auto Lo = B.buildFPTOUI(S32, Fma);
2227 
2228   if (Signed && SrcLT == S32) {
    // Flip the result based on the sign, which is either all 0s or all 1s.
2230     Sign = B.buildMerge(S64, {Sign, Sign});
2231     // r := xor({lo, hi}, sign) - sign;
2232     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2233   } else
2234     B.buildMerge(Dst, {Lo, Hi});
2235   MI.eraseFromParent();
2236 
2237   return true;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2241                                                MachineInstr &MI) const {
2242   MachineFunction &MF = Helper.MIRBuilder.getMF();
2243   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2244 
2245   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2246                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2247 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
2250   if (!MFI->getMode().IEEE)
2251     return !IsIEEEOp;
2252 
2253   if (IsIEEEOp)
2254     return true;
2255 
2256   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2257 }
2258 
2259 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2260   MachineInstr &MI, MachineRegisterInfo &MRI,
2261   MachineIRBuilder &B) const {
2262   // TODO: Should move some of this into LegalizerHelper.
2263 
2264   // TODO: Promote dynamic indexing of s16 to s32
2265 
2266   // FIXME: Artifact combiner probably should have replaced the truncated
2267   // constant before this, so we shouldn't need
2268   // getIConstantVRegValWithLookThrough.
2269   Optional<ValueAndVReg> MaybeIdxVal =
2270       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2271   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2272     return true;
2273   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2274 
2275   Register Dst = MI.getOperand(0).getReg();
2276   Register Vec = MI.getOperand(1).getReg();
2277 
2278   LLT VecTy = MRI.getType(Vec);
2279   LLT EltTy = VecTy.getElementType();
2280   assert(EltTy == MRI.getType(Dst));
2281 
2282   if (IdxVal < VecTy.getNumElements()) {
2283     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2284     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2285   } else {
2286     B.buildUndef(Dst);
2287   }
2288 
2289   MI.eraseFromParent();
2290   return true;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296   // TODO: Should move some of this into LegalizerHelper.
2297 
2298   // TODO: Promote dynamic indexing of s16 to s32
2299 
2300   // FIXME: Artifact combiner probably should have replaced the truncated
2301   // constant before this, so we shouldn't need
2302   // getIConstantVRegValWithLookThrough.
2303   Optional<ValueAndVReg> MaybeIdxVal =
2304       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2305   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2306     return true;
2307 
2308   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2309   Register Dst = MI.getOperand(0).getReg();
2310   Register Vec = MI.getOperand(1).getReg();
2311   Register Ins = MI.getOperand(2).getReg();
2312 
2313   LLT VecTy = MRI.getType(Vec);
2314   LLT EltTy = VecTy.getElementType();
2315   assert(EltTy == MRI.getType(Ins));
2316   (void)Ins;
2317 
2318   unsigned NumElts = VecTy.getNumElements();
2319   if (IdxVal < NumElts) {
2320     SmallVector<Register, 8> SrcRegs;
2321     for (unsigned i = 0; i < NumElts; ++i)
2322       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2323     B.buildUnmerge(SrcRegs, Vec);
2324 
2325     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2326     B.buildMerge(Dst, SrcRegs);
2327   } else {
2328     B.buildUndef(Dst);
2329   }
2330 
2331   MI.eraseFromParent();
2332   return true;
2333 }
2334 
2335 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2336   MachineInstr &MI, MachineRegisterInfo &MRI,
2337   MachineIRBuilder &B) const {
2338   const LLT V2S16 = LLT::fixed_vector(2, 16);
2339 
2340   Register Dst = MI.getOperand(0).getReg();
2341   Register Src0 = MI.getOperand(1).getReg();
2342   LLT DstTy = MRI.getType(Dst);
2343   LLT SrcTy = MRI.getType(Src0);
2344 
2345   if (SrcTy == V2S16 && DstTy == V2S16 &&
2346       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2347     return true;
2348 
2349   MachineIRBuilder HelperBuilder(MI);
2350   GISelObserverWrapper DummyObserver;
2351   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2352   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeSinCos(
2356   MachineInstr &MI, MachineRegisterInfo &MRI,
2357   MachineIRBuilder &B) const {
2358 
2359   Register DstReg = MI.getOperand(0).getReg();
2360   Register SrcReg = MI.getOperand(1).getReg();
2361   LLT Ty = MRI.getType(DstReg);
2362   unsigned Flags = MI.getFlags();
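
  // The amdgcn sin/cos intrinsics take their operand pre-scaled by 1/(2*pi),
  // i.e. in units of turns. Subtargets with a reduced trig input range also
  // need the scaled operand wrapped into [0, 1) with fract first.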
2363 
2364   Register TrigVal;
2365   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2366   if (ST.hasTrigReducedRange()) {
2367     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2368     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2369       .addUse(MulVal.getReg(0))
2370       .setMIFlags(Flags).getReg(0);
2371   } else
2372     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2373 
2374   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2375     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2376   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2377     .addUse(TrigVal)
2378     .setMIFlags(Flags);
2379   MI.eraseFromParent();
2380   return true;
2381 }
2382 
2383 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2384                                                   MachineIRBuilder &B,
2385                                                   const GlobalValue *GV,
2386                                                   int64_t Offset,
2387                                                   unsigned GAFlags) const {
2388   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2389   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2390   // to the following code sequence:
2391   //
2392   // For constant address space:
2393   //   s_getpc_b64 s[0:1]
2394   //   s_add_u32 s0, s0, $symbol
2395   //   s_addc_u32 s1, s1, 0
2396   //
2397   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2398   //   a fixup or relocation is emitted to replace $symbol with a literal
2399   //   constant, which is a pc-relative offset from the encoding of the $symbol
2400   //   operand to the global variable.
2401   //
2402   // For global address space:
2403   //   s_getpc_b64 s[0:1]
2404   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2405   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2406   //
2407   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2408   //   fixups or relocations are emitted to replace $symbol@*@lo and
2409   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2410   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2411   //   operand to the global variable.
2412   //
2413   // What we want here is an offset from the value returned by s_getpc
2414   // (which is the address of the s_add_u32 instruction) to the global
2415   // variable, but since the encoding of $symbol starts 4 bytes after the start
2416   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2417   // small. This requires us to add 4 to the global variable offset in order to
2418   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2419   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2420   // instruction.
2421 
2422   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2423 
2424   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2425     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2426 
2427   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2428     .addDef(PCReg);
2429 
2430   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2431   if (GAFlags == SIInstrInfo::MO_NONE)
2432     MIB.addImm(0);
2433   else
2434     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2435 
2436   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2437 
2438   if (PtrTy.getSizeInBits() == 32)
2439     B.buildExtract(DstReg, PCReg, 0);
2440   return true;
2441  }
2442 
2443 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2444   MachineInstr &MI, MachineRegisterInfo &MRI,
2445   MachineIRBuilder &B) const {
2446   Register DstReg = MI.getOperand(0).getReg();
2447   LLT Ty = MRI.getType(DstReg);
2448   unsigned AS = Ty.getAddressSpace();
2449 
2450   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2451   MachineFunction &MF = B.getMF();
2452   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2453 
2454   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2455     if (!MFI->isModuleEntryFunction() &&
2456         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2457       const Function &Fn = MF.getFunction();
2458       DiagnosticInfoUnsupported BadLDSDecl(
2459         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2460         DS_Warning);
2461       Fn.getContext().diagnose(BadLDSDecl);
2462 
2463       // We currently don't have a way to correctly allocate LDS objects that
2464       // aren't directly associated with a kernel. We do force inlining of
2465       // functions that use local objects. However, if these dead functions are
2466       // not eliminated, we don't want a compile time error. Just emit a warning
2467       // and a trap, since there should be no callable path here.
2468       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2469       B.buildUndef(DstReg);
2470       MI.eraseFromParent();
2471       return true;
2472     }
2473 
2474     // TODO: We could emit code to handle the initialization somewhere.
2475     // We ignore the initializer for now and legalize it to allow selection.
2476     // The initializer will anyway get errored out during assembly emission.
2477     const SITargetLowering *TLI = ST.getTargetLowering();
2478     if (!TLI->shouldUseLDSConstAddress(GV)) {
2479       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2481     }
2482 
2483     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2484       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated by
      // the runtime and placed directly after the statically allocated ones;
      // they all share the same offset.
2490       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2491         // Adjust alignment for that dynamic shared memory array.
2492         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2493         LLT S32 = LLT::scalar(32);
2494         auto Sz =
2495             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2496         B.buildIntToPtr(DstReg, Sz);
2497         MI.eraseFromParent();
2498         return true;
2499       }
2500     }
2501 
2502     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2503                                                    *cast<GlobalVariable>(GV)));
2504     MI.eraseFromParent();
2505     return true;
2506   }
2507 
2508   const SITargetLowering *TLI = ST.getTargetLowering();
2509 
2510   if (TLI->shouldEmitFixup(GV)) {
2511     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2512     MI.eraseFromParent();
2513     return true;
2514   }
2515 
2516   if (TLI->shouldEmitPCReloc(GV)) {
2517     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2518     MI.eraseFromParent();
2519     return true;
2520   }
2521 
2522   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2523   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2524 
2525   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2526   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2527       MachinePointerInfo::getGOT(MF),
2528       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2529           MachineMemOperand::MOInvariant,
2530       LoadTy, Align(8));
2531 
2532   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2533 
2534   if (Ty.getSizeInBits() == 32) {
2535     // Truncate if this is a 32-bit constant address.
2536     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2537     B.buildExtract(DstReg, Load, 0);
2538   } else
2539     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2540 
2541   MI.eraseFromParent();
2542   return true;
2543 }
2544 
2545 static LLT widenToNextPowerOf2(LLT Ty) {
2546   if (Ty.isVector())
2547     return Ty.changeElementCount(
2548         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2549   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2553                                        MachineInstr &MI) const {
2554   MachineIRBuilder &B = Helper.MIRBuilder;
2555   MachineRegisterInfo &MRI = *B.getMRI();
2556   GISelChangeObserver &Observer = Helper.Observer;
2557 
2558   Register PtrReg = MI.getOperand(1).getReg();
2559   LLT PtrTy = MRI.getType(PtrReg);
2560   unsigned AddrSpace = PtrTy.getAddressSpace();
2561 
2562   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2563     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2564     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2565     Observer.changingInstr(MI);
2566     MI.getOperand(1).setReg(Cast.getReg(0));
2567     Observer.changedInstr(MI);
2568     return true;
2569   }
2570 
2571   if (MI.getOpcode() != AMDGPU::G_LOAD)
2572     return false;
2573 
2574   Register ValReg = MI.getOperand(0).getReg();
2575   LLT ValTy = MRI.getType(ValReg);
2576 
2577   MachineMemOperand *MMO = *MI.memoperands_begin();
2578   const unsigned ValSize = ValTy.getSizeInBits();
2579   const LLT MemTy = MMO->getMemoryType();
2580   const Align MemAlign = MMO->getAlign();
2581   const unsigned MemSize = MemTy.getSizeInBits();
2582   const uint64_t AlignInBits = 8 * MemAlign.value();
2583 
2584   // Widen non-power-of-2 loads to the alignment if needed
2585   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2586     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2587 
2588     // This was already the correct extending load result type, so just adjust
2589     // the memory type.
2590     if (WideMemSize == ValSize) {
2591       MachineFunction &MF = B.getMF();
2592 
2593       MachineMemOperand *WideMMO =
2594           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2595       Observer.changingInstr(MI);
2596       MI.setMemRefs(MF, {WideMMO});
2597       Observer.changedInstr(MI);
2598       return true;
2599     }
2600 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2602     if (ValSize > WideMemSize)
2603       return false;
2604 
2605     LLT WideTy = widenToNextPowerOf2(ValTy);
2606 
2607     Register WideLoad;
2608     if (!WideTy.isVector()) {
2609       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2610       B.buildTrunc(ValReg, WideLoad).getReg(0);
2611     } else {
2612       // Extract the subvector.
2613 
2614       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2616         // (e.g. <3 x s32> -> <4 x s32>)
2617         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2618         B.buildExtract(ValReg, WideLoad, 0);
2619       } else {
2620         // For cases where the widened type isn't a nice register value, unmerge
2621         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2622         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2623         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2624       }
2625     }
2626 
2627     MI.eraseFromParent();
2628     return true;
2629   }
2630 
2631   return false;
2632 }
2633 
2634 bool AMDGPULegalizerInfo::legalizeFMad(
2635   MachineInstr &MI, MachineRegisterInfo &MRI,
2636   MachineIRBuilder &B) const {
2637   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2638   assert(Ty.isScalar());
2639 
2640   MachineFunction &MF = B.getMF();
2641   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2642 
2643   // TODO: Always legal with future ftz flag.
2644   // FIXME: Do we need just output?
2645   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2646     return true;
2647   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2648     return true;
2649 
2650   MachineIRBuilder HelperBuilder(MI);
2651   GISelObserverWrapper DummyObserver;
2652   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2653   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2654 }
2655 
2656 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2657   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2658   Register DstReg = MI.getOperand(0).getReg();
2659   Register PtrReg = MI.getOperand(1).getReg();
2660   Register CmpVal = MI.getOperand(2).getReg();
2661   Register NewVal = MI.getOperand(3).getReg();
2662 
2663   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2664          "this should not have been custom lowered");
2665 
2666   LLT ValTy = MRI.getType(CmpVal);
2667   LLT VecTy = LLT::fixed_vector(2, ValTy);
2668 
2669   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2670 
2671   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2672     .addDef(DstReg)
2673     .addUse(PtrReg)
2674     .addUse(PackedVal)
2675     .setMemRefs(MI.memoperands());
2676 
2677   MI.eraseFromParent();
2678   return true;
2679 }
2680 
2681 bool AMDGPULegalizerInfo::legalizeFlog(
2682   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2683   Register Dst = MI.getOperand(0).getReg();
2684   Register Src = MI.getOperand(1).getReg();
2685   LLT Ty = B.getMRI()->getType(Dst);
2686   unsigned Flags = MI.getFlags();
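
  // log_b(x) == log2(x) * (1 / log2(b)). The callers pass ln(2) for the
  // natural log and ln(2)/ln(10) for log10, since 1/log2(b) == ln(2)/ln(b).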
2687 
2688   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2689   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2690 
2691   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2692   MI.eraseFromParent();
2693   return true;
2694 }
2695 
2696 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2697                                        MachineIRBuilder &B) const {
2698   Register Dst = MI.getOperand(0).getReg();
2699   Register Src = MI.getOperand(1).getReg();
2700   unsigned Flags = MI.getFlags();
2701   LLT Ty = B.getMRI()->getType(Dst);
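
  // exp(x) == exp2(x * log2(e)).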
2702 
2703   auto K = B.buildFConstant(Ty, numbers::log2e);
2704   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2705   B.buildFExp2(Dst, Mul, Flags);
2706   MI.eraseFromParent();
2707   return true;
2708 }
2709 
2710 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2711                                        MachineIRBuilder &B) const {
2712   Register Dst = MI.getOperand(0).getReg();
2713   Register Src0 = MI.getOperand(1).getReg();
2714   Register Src1 = MI.getOperand(2).getReg();
2715   unsigned Flags = MI.getFlags();
2716   LLT Ty = B.getMRI()->getType(Dst);
2717   const LLT S16 = LLT::scalar(16);
2718   const LLT S32 = LLT::scalar(32);
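
  // pow(x, y) == exp2(y * log2(x)); the multiply is done with the legacy
  // semantics where 0 * anything == 0.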
2719 
2720   if (Ty == S32) {
2721     auto Log = B.buildFLog2(S32, Src0, Flags);
2722     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2723       .addUse(Log.getReg(0))
2724       .addUse(Src1)
2725       .setMIFlags(Flags);
2726     B.buildFExp2(Dst, Mul, Flags);
2727   } else if (Ty == S16) {
2728     // There's no f16 fmul_legacy, so we need to convert for it.
2729     auto Log = B.buildFLog2(S16, Src0, Flags);
2730     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2731     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2732     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2733       .addUse(Ext0.getReg(0))
2734       .addUse(Ext1.getReg(0))
2735       .setMIFlags(Flags);
2736 
2737     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2738   } else
2739     return false;
2740 
2741   MI.eraseFromParent();
2742   return true;
2743 }
2744 
2745 // Find a source register, ignoring any possible source modifiers.
2746 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2747   Register ModSrc = OrigSrc;
2748   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2749     ModSrc = SrcFNeg->getOperand(1).getReg();
2750     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2751       ModSrc = SrcFAbs->getOperand(1).getReg();
2752   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2753     ModSrc = SrcFAbs->getOperand(1).getReg();
2754   return ModSrc;
2755 }
2756 
2757 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2758                                          MachineRegisterInfo &MRI,
2759                                          MachineIRBuilder &B) const {
2760 
2761   const LLT S1 = LLT::scalar(1);
2762   const LLT S64 = LLT::scalar(64);
2763   Register Dst = MI.getOperand(0).getReg();
2764   Register OrigSrc = MI.getOperand(1).getReg();
2765   unsigned Flags = MI.getFlags();
2766   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2767          "this should not have been custom lowered");
2768 
2769   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2770   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2771   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2772   // V_FRACT bug is:
2773   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2774   //
2775   // Convert floor(x) to (x - fract(x))
2776 
2777   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2778     .addUse(OrigSrc)
2779     .setMIFlags(Flags);
2780 
2781   // Give source modifier matching some assistance before obscuring a foldable
2782   // pattern.
2783 
2784   // TODO: We can avoid the neg on the fract? The input sign to fract
2785   // shouldn't matter?
2786   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2787 
2788   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2789 
2790   Register Min = MRI.createGenericVirtualRegister(S64);
2791 
2792   // We don't need to concern ourselves with the snan handling difference, so
2793   // use the one which will directly select.
2794   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2795   if (MFI->getMode().IEEE)
2796     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2797   else
2798     B.buildFMinNum(Min, Fract, Const, Flags);
2799 
2800   Register CorrectedFract = Min;
2801   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2802     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2803     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2804   }
2805 
2806   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2807   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2808 
2809   MI.eraseFromParent();
2810   return true;
2811 }
2812 
2813 // Turn an illegal packed v2s16 build vector into bit operations.
2814 // TODO: This should probably be a bitcast action in LegalizerHelper.
2815 bool AMDGPULegalizerInfo::legalizeBuildVector(
2816   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2817   Register Dst = MI.getOperand(0).getReg();
2818   const LLT S32 = LLT::scalar(32);
2819   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2820 
2821   Register Src0 = MI.getOperand(1).getReg();
2822   Register Src1 = MI.getOperand(2).getReg();
2823   assert(MRI.getType(Src0) == LLT::scalar(16));
2824 
2825   auto Merge = B.buildMerge(S32, {Src0, Src1});
2826   B.buildBitcast(Dst, Merge);
2827 
2828   MI.eraseFromParent();
2829   return true;
2830 }
2831 
2832 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2833 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2834 // case with a single min instruction instead of a compare+select.
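// For a zero input the hardware ffbh/ffbl result is all ones; the unsigned
// min against the source bit width clamps it to that width, which is the
// expected ctlz/cttz result for zero.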
2835 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2836                                             MachineRegisterInfo &MRI,
2837                                             MachineIRBuilder &B) const {
2838   Register Dst = MI.getOperand(0).getReg();
2839   Register Src = MI.getOperand(1).getReg();
2840   LLT DstTy = MRI.getType(Dst);
2841   LLT SrcTy = MRI.getType(Src);
2842 
2843   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2844                         ? AMDGPU::G_AMDGPU_FFBH_U32
2845                         : AMDGPU::G_AMDGPU_FFBL_B32;
2846   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2847   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2848 
2849   MI.eraseFromParent();
2850   return true;
2851 }
2852 
2853 // Check that this is a G_XOR x, -1
2854 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2855   if (MI.getOpcode() != TargetOpcode::G_XOR)
2856     return false;
2857   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2858   return ConstVal && *ConstVal == -1;
2859 }
2860 
2861 // Return the use branch instruction, or null if the usage is invalid.
2862 static MachineInstr *
2863 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2864                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2865   Register CondDef = MI.getOperand(0).getReg();
2866   if (!MRI.hasOneNonDBGUse(CondDef))
2867     return nullptr;
2868 
2869   MachineBasicBlock *Parent = MI.getParent();
2870   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2871 
2872   if (isNot(MRI, *UseMI)) {
2873     Register NegatedCond = UseMI->getOperand(0).getReg();
2874     if (!MRI.hasOneNonDBGUse(NegatedCond))
2875       return nullptr;
2876 
2877     // We're deleting the def of this value, so we need to remove it.
2878     eraseInstr(*UseMI, MRI);
2879 
2880     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2881     Negated = true;
2882   }
2883 
2884   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2885     return nullptr;
2886 
2887   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2888   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2889   if (Next == Parent->end()) {
2890     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2891     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2892       return nullptr;
2893     UncondBrTarget = &*NextMBB;
2894   } else {
2895     if (Next->getOpcode() != AMDGPU::G_BR)
2896       return nullptr;
2897     Br = &*Next;
2898     UncondBrTarget = Br->getOperand(0).getMBB();
2899   }
2900 
2901   return UseMI;
2902 }
2903 
2904 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2905                                          const ArgDescriptor *Arg,
2906                                          const TargetRegisterClass *ArgRC,
2907                                          LLT ArgTy) const {
2908   MCRegister SrcReg = Arg->getRegister();
2909   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2910   assert(DstReg.isVirtual() && "Virtual register expected");
2911 
2912   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2913                                              *ArgRC, B.getDebugLoc(), ArgTy);
2914   if (Arg->isMasked()) {
2915     // TODO: Should we try to emit this once in the entry block?
2916     const LLT S32 = LLT::scalar(32);
2917     const unsigned Mask = Arg->getMask();
2918     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2919 
2920     Register AndMaskSrc = LiveIn;
2921 
2922     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2923     // 0.
2924     if (Shift != 0) {
2925       auto ShiftAmt = B.buildConstant(S32, Shift);
2926       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2927     }
2928 
2929     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2930   } else {
2931     B.buildCopy(DstReg, LiveIn);
2932   }
2933 
2934   return true;
2935 }
2936 
2937 bool AMDGPULegalizerInfo::loadInputValue(
2938     Register DstReg, MachineIRBuilder &B,
2939     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2940   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2941   const ArgDescriptor *Arg;
2942   const TargetRegisterClass *ArgRC;
2943   LLT ArgTy;
2944   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2945 
2946   if (!Arg) {
2947     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2948       // The intrinsic may appear when we have a 0 sized kernarg segment, in
2949       // which case the pointer argument may be missing and we use null.
2950       B.buildConstant(DstReg, 0);
2951       return true;
2952     }
2953 
2954     // It's undefined behavior if a function marked with the amdgpu-no-*
2955     // attributes uses the corresponding intrinsic.
2956     B.buildUndef(DstReg);
2957     return true;
2958   }
2959 
2960   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2961     return false; // TODO: Handle these
2962   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2963 }
2964 
2965 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2966     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2967     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2968   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2969     return false;
2970 
2971   MI.eraseFromParent();
2972   return true;
2973 }
2974 
2975 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2976                                        MachineRegisterInfo &MRI,
2977                                        MachineIRBuilder &B) const {
2978   Register Dst = MI.getOperand(0).getReg();
2979   LLT DstTy = MRI.getType(Dst);
2980   LLT S16 = LLT::scalar(16);
2981   LLT S32 = LLT::scalar(32);
2982   LLT S64 = LLT::scalar(64);
2983 
2984   if (DstTy == S16)
2985     return legalizeFDIV16(MI, MRI, B);
2986   if (DstTy == S32)
2987     return legalizeFDIV32(MI, MRI, B);
2988   if (DstTy == S64)
2989     return legalizeFDIV64(MI, MRI, B);
2990 
2991   return false;
2992 }
2993 
2994 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2995                                                         Register DstDivReg,
2996                                                         Register DstRemReg,
2997                                                         Register X,
2998                                                         Register Y) const {
2999   const LLT S1 = LLT::scalar(1);
3000   const LLT S32 = LLT::scalar(32);
3001 
3002   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3003   // algorithm used here.
3004 
3005   // Initial estimate of inv(y).
3006   auto FloatY = B.buildUITOFP(S32, Y);
3007   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
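       // 0x4f7ffffe is just below 2**32 as a float; scaling the reciprocal estimate
       // by it and converting with fptoui gives a 32-bit fixed-point reciprocal.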
3008   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3009   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3010   auto Z = B.buildFPTOUI(S32, ScaledY);
3011 
3012   // One round of UNR (unsigned Newton-Raphson).
3013   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3014   auto NegYZ = B.buildMul(S32, NegY, Z);
3015   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3016 
3017   // Quotient/remainder estimate.
3018   auto Q = B.buildUMulH(S32, X, Z);
3019   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3020 
3021   // First quotient/remainder refinement.
3022   auto One = B.buildConstant(S32, 1);
3023   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3024   if (DstDivReg)
3025     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3026   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3027 
3028   // Second quotient/remainder refinement.
3029   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3030   if (DstDivReg)
3031     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3032 
3033   if (DstRemReg)
3034     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3035 }
3036 
3037 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3038 //
3039 // Return lo, hi of result
3040 //
3041 // %cvt.lo = G_UITOFP Val.lo
3042 // %cvt.hi = G_UITOFP Val.hi
3043 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3044 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3045 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3046 // %mul2 = G_FMUL %mul1, 2**(-32)
3047 // %trunc = G_INTRINSIC_TRUNC %mul2
3048 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3049 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3050 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3051                                                        Register Val) {
3052   const LLT S32 = LLT::scalar(32);
3053   auto Unmerge = B.buildUnmerge(S32, Val);
3054 
3055   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3056   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3057 
3058   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3059                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3060 
3061   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3062   auto Mul1 =
3063       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3064 
3065   // 2**(-32)
3066   auto Mul2 =
3067       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3068   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3069 
3070   // -(2**32)
3071   auto Mad2 = B.buildFMAD(S32, Trunc,
3072                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3073 
3074   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3075   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3076 
3077   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3078 }
3079 
3080 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3081                                                         Register DstDivReg,
3082                                                         Register DstRemReg,
3083                                                         Register Numer,
3084                                                         Register Denom) const {
3085   const LLT S32 = LLT::scalar(32);
3086   const LLT S64 = LLT::scalar(64);
3087   const LLT S1 = LLT::scalar(1);
3088   Register RcpLo, RcpHi;
3089 
3090   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3091 
3092   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3093 
3094   auto Zero64 = B.buildConstant(S64, 0);
3095   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3096 
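       // One Newton-Raphson refinement step of the reciprocal estimate:
       // Add1 = Rcp + umulh(Rcp, -Denom * Rcp), done on 32-bit halves with carries.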
3097   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3098   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3099 
3100   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3101   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3102   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3103 
3104   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3105   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3106   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3107 
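       // Second refinement step: Add2 = Add1 + umulh(Add1, -Denom * Add1).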
3108   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3109   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3110   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3111   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3112   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3113 
3114   auto Zero32 = B.buildConstant(S32, 0);
3115   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3116   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3117   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3118 
3119   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3120   Register NumerLo = UnmergeNumer.getReg(0);
3121   Register NumerHi = UnmergeNumer.getReg(1);
3122 
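       // Quotient estimate MulHi3 = umulh(Numer, Add2), and remainder estimate
       // Sub1 = Numer - Denom * MulHi3, again split into 32-bit halves.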
3123   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3124   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3125   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3126   Register Mul3_Lo = UnmergeMul3.getReg(0);
3127   Register Mul3_Hi = UnmergeMul3.getReg(1);
3128   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3129   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3130   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3131   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3132 
3133   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3134   Register DenomLo = UnmergeDenom.getReg(0);
3135   Register DenomHi = UnmergeDenom.getReg(1);
3136 
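       // 64-bit unsigned compare Sub1 >= Denom built from 32-bit pieces: use the low
       // half compare when the high halves are equal, the high half compare otherwise.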
3137   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3138   auto C1 = B.buildSExt(S32, CmpHi);
3139 
3140   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3141   auto C2 = B.buildSExt(S32, CmpLo);
3142 
3143   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3144   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3145 
3146   // TODO: Here and below, portions of the code could be enclosed in if/endif
3147   // blocks. Currently control flow is unconditional and we have 4 selects after
3148   // the potential endif to substitute PHIs.
3149 
3150   // if C3 != 0 ...
3151   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3152   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3153   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3154   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3155 
3156   auto One64 = B.buildConstant(S64, 1);
3157   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3158 
3159   auto C4 =
3160       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3161   auto C5 =
3162       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3163   auto C6 = B.buildSelect(
3164       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3165 
3166   // if (C6 != 0)
3167   auto Add4 = B.buildAdd(S64, Add3, One64);
3168   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3169 
3170   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3171   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3172   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3173 
3174   // endif C6
3175   // endif C3
3176 
3177   if (DstDivReg) {
3178     auto Sel1 = B.buildSelect(
3179         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3180     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3181                   Sel1, MulHi3);
3182   }
3183 
3184   if (DstRemReg) {
3185     auto Sel2 = B.buildSelect(
3186         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3187     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3188                   Sel2, Sub1);
3189   }
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3193                                                   MachineRegisterInfo &MRI,
3194                                                   MachineIRBuilder &B) const {
3195   Register DstDivReg, DstRemReg;
3196   switch (MI.getOpcode()) {
3197   default:
3198     llvm_unreachable("Unexpected opcode!");
3199   case AMDGPU::G_UDIV: {
3200     DstDivReg = MI.getOperand(0).getReg();
3201     break;
3202   }
3203   case AMDGPU::G_UREM: {
3204     DstRemReg = MI.getOperand(0).getReg();
3205     break;
3206   }
3207   case AMDGPU::G_UDIVREM: {
3208     DstDivReg = MI.getOperand(0).getReg();
3209     DstRemReg = MI.getOperand(1).getReg();
3210     break;
3211   }
3212   }
3213 
3214   const LLT S64 = LLT::scalar(64);
3215   const LLT S32 = LLT::scalar(32);
3216   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3217   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3218   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3219   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3220 
3221   if (Ty == S32)
3222     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3223   else if (Ty == S64)
3224     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3225   else
3226     return false;
3227 
3228   MI.eraseFromParent();
3229   return true;
3230 }
3231 
3232 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3233                                                 MachineRegisterInfo &MRI,
3234                                                 MachineIRBuilder &B) const {
3235   const LLT S64 = LLT::scalar(64);
3236   const LLT S32 = LLT::scalar(32);
3237 
3238   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3239   if (Ty != S32 && Ty != S64)
3240     return false;
3241 
3242   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3243   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3244   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3245 
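       // Take absolute values: with Sign = X >> (BitWidth - 1), (X + Sign) ^ Sign
       // is |X| in two's complement (INT_MIN maps to itself).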
3246   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3247   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3248   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3249 
3250   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3251   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3252 
3253   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3254   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3255 
3256   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3257   switch (MI.getOpcode()) {
3258   default:
3259     llvm_unreachable("Unexpected opcode!");
3260   case AMDGPU::G_SDIV: {
3261     DstDivReg = MI.getOperand(0).getReg();
3262     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3263     break;
3264   }
3265   case AMDGPU::G_SREM: {
3266     DstRemReg = MI.getOperand(0).getReg();
3267     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3268     break;
3269   }
3270   case AMDGPU::G_SDIVREM: {
3271     DstDivReg = MI.getOperand(0).getReg();
3272     DstRemReg = MI.getOperand(1).getReg();
3273     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3274     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3275     break;
3276   }
3277   }
3278 
3279   if (Ty == S32)
3280     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3281   else
3282     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3283 
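       // Apply the result signs: (V ^ S) - S negates V when S is all ones and is a
       // no-op when S is zero. The quotient is negative iff the operand signs differ.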
3284   if (DstDivReg) {
3285     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3286     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3287     B.buildSub(DstDivReg, SignXor, Sign);
3288   }
3289 
3290   if (DstRemReg) {
3291     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3292     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3293     B.buildSub(DstRemReg, SignXor, Sign);
3294   }
3295 
3296   MI.eraseFromParent();
3297   return true;
3298 }
3299 
3300 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3301                                                  MachineRegisterInfo &MRI,
3302                                                  MachineIRBuilder &B) const {
3303   Register Res = MI.getOperand(0).getReg();
3304   Register LHS = MI.getOperand(1).getReg();
3305   Register RHS = MI.getOperand(2).getReg();
3306   uint16_t Flags = MI.getFlags();
3307   LLT ResTy = MRI.getType(Res);
3308 
3309   const MachineFunction &MF = B.getMF();
3310   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3311                             MI.getFlag(MachineInstr::FmAfn);
3312 
3313   if (!AllowInaccurateRcp)
3314     return false;
3315 
3316   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3317     // 1 / x -> RCP(x)
3318     if (CLHS->isExactlyValue(1.0)) {
3319       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3320         .addUse(RHS)
3321         .setMIFlags(Flags);
3322 
3323       MI.eraseFromParent();
3324       return true;
3325     }
3326 
3327     // -1 / x -> RCP( FNEG(x) )
3328     if (CLHS->isExactlyValue(-1.0)) {
3329       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3330       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3331         .addUse(FNeg.getReg(0))
3332         .setMIFlags(Flags);
3333 
3334       MI.eraseFromParent();
3335       return true;
3336     }
3337   }
3338 
3339   // x / y -> x * (1.0 / y)
3340   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3341     .addUse(RHS)
3342     .setMIFlags(Flags);
3343   B.buildFMul(Res, LHS, RCP, Flags);
3344 
3345   MI.eraseFromParent();
3346   return true;
3347 }
3348 
3349 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3350                                                    MachineRegisterInfo &MRI,
3351                                                    MachineIRBuilder &B) const {
3352   Register Res = MI.getOperand(0).getReg();
3353   Register X = MI.getOperand(1).getReg();
3354   Register Y = MI.getOperand(2).getReg();
3355   uint16_t Flags = MI.getFlags();
3356   LLT ResTy = MRI.getType(Res);
3357 
3358   const MachineFunction &MF = B.getMF();
3359   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3360                             MI.getFlag(MachineInstr::FmAfn);
3361 
3362   if (!AllowInaccurateRcp)
3363     return false;
3364 
3365   auto NegY = B.buildFNeg(ResTy, Y);
3366   auto One = B.buildFConstant(ResTy, 1.0);
3367 
3368   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3369     .addUse(Y)
3370     .setMIFlags(Flags);
3371 
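       // Two Newton-Raphson iterations, R = R + R * (1 - Y * R), followed by a final
       // fma to correct the product X * R.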
3372   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3373   R = B.buildFMA(ResTy, Tmp0, R, R);
3374 
3375   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3376   R = B.buildFMA(ResTy, Tmp1, R, R);
3377 
3378   auto Ret = B.buildFMul(ResTy, X, R);
3379   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3380 
3381   B.buildFMA(Res, Tmp2, R, Ret);
3382   MI.eraseFromParent();
3383   return true;
3384 }
3385 
3386 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3387                                          MachineRegisterInfo &MRI,
3388                                          MachineIRBuilder &B) const {
3389   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3390     return true;
3391 
3392   Register Res = MI.getOperand(0).getReg();
3393   Register LHS = MI.getOperand(1).getReg();
3394   Register RHS = MI.getOperand(2).getReg();
3395 
3396   uint16_t Flags = MI.getFlags();
3397 
3398   LLT S16 = LLT::scalar(16);
3399   LLT S32 = LLT::scalar(32);
3400 
3401   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3402   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3403 
3404   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3405     .addUse(RHSExt.getReg(0))
3406     .setMIFlags(Flags);
3407 
3408   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3409   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3410 
3411   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3412     .addUse(RDst.getReg(0))
3413     .addUse(RHS)
3414     .addUse(LHS)
3415     .setMIFlags(Flags);
3416 
3417   MI.eraseFromParent();
3418   return true;
3419 }
3420 
3421 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3422 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3423 static void toggleSPDenormMode(bool Enable,
3424                                MachineIRBuilder &B,
3425                                const GCNSubtarget &ST,
3426                                AMDGPU::SIModeRegisterDefaults Mode) {
3427   // Set SP denorm mode to this value.
3428   unsigned SPDenormMode =
3429     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3430 
3431   if (ST.hasDenormModeInst()) {
3432     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3433     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3434 
3435     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3436     B.buildInstr(AMDGPU::S_DENORM_MODE)
3437       .addImm(NewDenormModeValue);
3438 
3439   } else {
3440     // Select FP32 bit field in mode register.
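         // This encodes hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm control bits
         // live at offset 4 of the MODE register.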
3441     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3442                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3443                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3444 
3445     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3446       .addImm(SPDenormMode)
3447       .addImm(SPDenormModeBitField);
3448   }
3449 }
3450 
3451 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3452                                          MachineRegisterInfo &MRI,
3453                                          MachineIRBuilder &B) const {
3454   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3455     return true;
3456 
3457   Register Res = MI.getOperand(0).getReg();
3458   Register LHS = MI.getOperand(1).getReg();
3459   Register RHS = MI.getOperand(2).getReg();
3460   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3461   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3462 
3463   uint16_t Flags = MI.getFlags();
3464 
3465   LLT S32 = LLT::scalar(32);
3466   LLT S1 = LLT::scalar(1);
3467 
3468   auto One = B.buildFConstant(S32, 1.0f);
3469 
3470   auto DenominatorScaled =
3471     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3472       .addUse(LHS)
3473       .addUse(RHS)
3474       .addImm(0)
3475       .setMIFlags(Flags);
3476   auto NumeratorScaled =
3477     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3478       .addUse(LHS)
3479       .addUse(RHS)
3480       .addImm(1)
3481       .setMIFlags(Flags);
3482 
3483   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3484     .addUse(DenominatorScaled.getReg(0))
3485     .setMIFlags(Flags);
3486   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3487 
3488   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3489   // aren't modeled as reading it.
3490   if (!Mode.allFP32Denormals())
3491     toggleSPDenormMode(true, B, ST, Mode);
3492 
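       // One Newton-Raphson step on the scaled reciprocal, then refine the scaled
       // quotient; div_fmas and div_fixup below handle the div_scale scaling and the
       // special cases.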
3493   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3494   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3495   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3496   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3497   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3498   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3499 
3500   if (!Mode.allFP32Denormals())
3501     toggleSPDenormMode(false, B, ST, Mode);
3502 
3503   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3504     .addUse(Fma4.getReg(0))
3505     .addUse(Fma1.getReg(0))
3506     .addUse(Fma3.getReg(0))
3507     .addUse(NumeratorScaled.getReg(1))
3508     .setMIFlags(Flags);
3509 
3510   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3511     .addUse(Fmas.getReg(0))
3512     .addUse(RHS)
3513     .addUse(LHS)
3514     .setMIFlags(Flags);
3515 
3516   MI.eraseFromParent();
3517   return true;
3518 }
3519 
3520 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3521                                          MachineRegisterInfo &MRI,
3522                                          MachineIRBuilder &B) const {
3523   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3524     return true;
3525 
3526   Register Res = MI.getOperand(0).getReg();
3527   Register LHS = MI.getOperand(1).getReg();
3528   Register RHS = MI.getOperand(2).getReg();
3529 
3530   uint16_t Flags = MI.getFlags();
3531 
3532   LLT S64 = LLT::scalar(64);
3533   LLT S1 = LLT::scalar(1);
3534 
3535   auto One = B.buildFConstant(S64, 1.0);
3536 
3537   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3538     .addUse(LHS)
3539     .addUse(RHS)
3540     .addImm(0)
3541     .setMIFlags(Flags);
3542 
3543   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3544 
3545   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3546     .addUse(DivScale0.getReg(0))
3547     .setMIFlags(Flags);
3548 
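       // Two Newton-Raphson iterations on the scaled reciprocal, then form the scaled
       // quotient and its residual for div_fmas.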
3549   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3550   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3551   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3552 
3553   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3554     .addUse(LHS)
3555     .addUse(RHS)
3556     .addImm(1)
3557     .setMIFlags(Flags);
3558 
3559   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3560   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3561   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3562 
3563   Register Scale;
3564   if (!ST.hasUsableDivScaleConditionOutput()) {
3565     // Workaround a hardware bug on SI where the condition output from div_scale
3566     // is not usable.
3567 
3568     LLT S32 = LLT::scalar(32);
3569 
3570     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3571     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3572     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3573     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3574 
3575     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3576                               Scale1Unmerge.getReg(1));
3577     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3578                               Scale0Unmerge.getReg(1));
3579     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3580   } else {
3581     Scale = DivScale1.getReg(1);
3582   }
3583 
3584   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3585     .addUse(Fma4.getReg(0))
3586     .addUse(Fma3.getReg(0))
3587     .addUse(Mul.getReg(0))
3588     .addUse(Scale)
3589     .setMIFlags(Flags);
3590 
3591   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3592     .addUse(Fmas.getReg(0))
3593     .addUse(RHS)
3594     .addUse(LHS)
3595     .setMIFlags(Flags);
3596 
3597   MI.eraseFromParent();
3598   return true;
3599 }
3600 
3601 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3602                                                  MachineRegisterInfo &MRI,
3603                                                  MachineIRBuilder &B) const {
3604   Register Res = MI.getOperand(0).getReg();
3605   Register LHS = MI.getOperand(2).getReg();
3606   Register RHS = MI.getOperand(3).getReg();
3607   uint16_t Flags = MI.getFlags();
3608 
3609   LLT S32 = LLT::scalar(32);
3610   LLT S1 = LLT::scalar(1);
3611 
3612   auto Abs = B.buildFAbs(S32, RHS, Flags);
3614 
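       // 0x6f800000 is 2**96 and 0x2f800000 is 2**-32: if |RHS| is very large,
       // pre-scale it down before the rcp and scale the quotient back afterwards.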
3615   auto C0 = B.buildConstant(S32, 0x6f800000);
3616   auto C1 = B.buildConstant(S32, 0x2f800000);
3617   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3618 
3619   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3620   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3621 
3622   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3623 
3624   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3625     .addUse(Mul0.getReg(0))
3626     .setMIFlags(Flags);
3627 
3628   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3629 
3630   B.buildFMul(Res, Sel, Mul1, Flags);
3631 
3632   MI.eraseFromParent();
3633   return true;
3634 }
3635 
3636 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3637 // FIXME: Why do we handle this one but not other removed instructions?
3638 //
3639 // Reciprocal square root.  The clamp prevents infinite results, clamping
3640 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3641 // +-max_float.
3642 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3643                                                     MachineRegisterInfo &MRI,
3644                                                     MachineIRBuilder &B) const {
3645   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3646     return true;
3647 
3648   Register Dst = MI.getOperand(0).getReg();
3649   Register Src = MI.getOperand(2).getReg();
3650   auto Flags = MI.getFlags();
3651 
3652   LLT Ty = MRI.getType(Dst);
3653 
3654   const fltSemantics *FltSemantics;
3655   if (Ty == LLT::scalar(32))
3656     FltSemantics = &APFloat::IEEEsingle();
3657   else if (Ty == LLT::scalar(64))
3658     FltSemantics = &APFloat::IEEEdouble();
3659   else
3660     return false;
3661 
3662   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3663     .addUse(Src)
3664     .setMIFlags(Flags);
3665 
3666   // We don't need to concern ourselves with the snan handling difference; the
3667   // rsq already quieted it (or not), so use the one which will directly select.
3668   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3669   const bool UseIEEE = MFI->getMode().IEEE;
3670 
3671   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3672   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3673                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3674 
3675   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3676 
3677   if (UseIEEE)
3678     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3679   else
3680     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3681   MI.eraseFromParent();
3682   return true;
3683 }
3684 
3685 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3686   switch (IID) {
3687   case Intrinsic::amdgcn_ds_fadd:
3688     return AMDGPU::G_ATOMICRMW_FADD;
3689   case Intrinsic::amdgcn_ds_fmin:
3690     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3691   case Intrinsic::amdgcn_ds_fmax:
3692     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3693   default:
3694     llvm_unreachable("not a DS FP intrinsic");
3695   }
3696 }
3697 
3698 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3699                                                       MachineInstr &MI,
3700                                                       Intrinsic::ID IID) const {
3701   GISelChangeObserver &Observer = Helper.Observer;
3702   Observer.changingInstr(MI);
3703 
3704   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3705 
3706   // The remaining operands were used to set fields in the MemOperand on
3707   // construction.
3708   for (int I = 6; I > 3; --I)
3709     MI.RemoveOperand(I);
3710 
3711   MI.RemoveOperand(1); // Remove the intrinsic ID.
3712   Observer.changedInstr(MI);
3713   return true;
3714 }
3715 
3716 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3717                                             MachineRegisterInfo &MRI,
3718                                             MachineIRBuilder &B) const {
3719   uint64_t Offset =
3720     ST.getTargetLowering()->getImplicitParameterOffset(
3721       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3722   LLT DstTy = MRI.getType(DstReg);
3723   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3724 
3725   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3726   if (!loadInputValue(KernargPtrReg, B,
3727                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3728     return false;
3729 
3730   // FIXME: This should be nuw
3731   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3732   return true;
3733 }
3734 
3735 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3736                                                  MachineRegisterInfo &MRI,
3737                                                  MachineIRBuilder &B) const {
3738   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3739   if (!MFI->isEntryFunction()) {
3740     return legalizePreloadedArgIntrin(MI, MRI, B,
3741                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3742   }
3743 
3744   Register DstReg = MI.getOperand(0).getReg();
3745   if (!getImplicitArgPtr(DstReg, MRI, B))
3746     return false;
3747 
3748   MI.eraseFromParent();
3749   return true;
3750 }
3751 
3752 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3753                                               MachineRegisterInfo &MRI,
3754                                               MachineIRBuilder &B,
3755                                               unsigned AddrSpace) const {
3756   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3757   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3758   Register Hi32 = Unmerge.getReg(1);
3759 
3760   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3761   MI.eraseFromParent();
3762   return true;
3763 }
3764 
3765 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3766 // offset (the offset that is included in bounds checking and swizzling, to be
3767 // split between the instruction's voffset and immoffset fields) and soffset
3768 // (the offset that is excluded from bounds checking and swizzling, to go in
3769 // the instruction's soffset field).  This function takes the first kind of
3770 // offset and figures out how to split it between voffset and immoffset.
3771 std::pair<Register, unsigned>
3772 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3773                                         Register OrigOffset) const {
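       // The buffer instruction's immediate offset field is 12 bits, so 4095 is the
       // largest offset that can be folded into it.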
3774   const unsigned MaxImm = 4095;
3775   Register BaseReg;
3776   unsigned ImmOffset;
3777   const LLT S32 = LLT::scalar(32);
3778   MachineRegisterInfo &MRI = *B.getMRI();
3779 
3780   std::tie(BaseReg, ImmOffset) =
3781       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3782 
3783   // If BaseReg is a pointer, convert it to int.
3784   if (MRI.getType(BaseReg).isPointer())
3785     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3786 
3787   // If the immediate value is too big for the immoffset field, keep only the
3788   // low 12 bits in the immoffset field, so that the value that is copied/added
3789   // for the voffset field is a multiple of 4096, and it stands more chance
3790   // of being CSEd with the copy/add for another similar load/store.
3791   // However, do not do that rounding down to a multiple of 4096 if that is a
3792   // negative number, as it appears to be illegal to have a negative offset
3793   // in the vgpr, even if adding the immediate offset makes it positive.
3794   unsigned Overflow = ImmOffset & ~MaxImm;
3795   ImmOffset -= Overflow;
3796   if ((int32_t)Overflow < 0) {
3797     Overflow += ImmOffset;
3798     ImmOffset = 0;
3799   }
3800 
3801   if (Overflow != 0) {
3802     if (!BaseReg) {
3803       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3804     } else {
3805       auto OverflowVal = B.buildConstant(S32, Overflow);
3806       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3807     }
3808   }
3809 
3810   if (!BaseReg)
3811     BaseReg = B.buildConstant(S32, 0).getReg(0);
3812 
3813   return std::make_pair(BaseReg, ImmOffset);
3814 }
3815 
3816 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3817 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3818                                           Register VOffset, Register SOffset,
3819                                           unsigned ImmOffset, Register VIndex,
3820                                           MachineRegisterInfo &MRI) const {
3821   Optional<ValueAndVReg> MaybeVOffsetVal =
3822       getIConstantVRegValWithLookThrough(VOffset, MRI);
3823   Optional<ValueAndVReg> MaybeSOffsetVal =
3824       getIConstantVRegValWithLookThrough(SOffset, MRI);
3825   Optional<ValueAndVReg> MaybeVIndexVal =
3826       getIConstantVRegValWithLookThrough(VIndex, MRI);
3827   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3828   // update the MMO with that offset. The stride is unknown so we can only do
3829   // this if VIndex is constant 0.
3830   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3831       MaybeVIndexVal->Value == 0) {
3832     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3833                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3834     MMO->setOffset(TotalOffset);
3835   } else {
3836     // We don't have a constant combined offset to use in the MMO. Give up.
3837     MMO->setValue((Value *)nullptr);
3838   }
3839 }
3840 
3841 /// Handle register layout difference for f16 images for some subtargets.
3842 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3843                                              MachineRegisterInfo &MRI,
3844                                              Register Reg,
3845                                              bool ImageStore) const {
3846   const LLT S16 = LLT::scalar(16);
3847   const LLT S32 = LLT::scalar(32);
3848   LLT StoreVT = MRI.getType(Reg);
3849   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3850 
3851   if (ST.hasUnpackedD16VMem()) {
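         // In unpacked mode each 16-bit element occupies a full 32-bit register, so
         // any-extend each element and rebuild the value as a vector of s32.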
3852     auto Unmerge = B.buildUnmerge(S16, Reg);
3853 
3854     SmallVector<Register, 4> WideRegs;
3855     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3856       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3857 
3858     int NumElts = StoreVT.getNumElements();
3859 
3860     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3861         .getReg(0);
3862   }
3863 
3864   if (ImageStore && ST.hasImageStoreD16Bug()) {
3865     if (StoreVT.getNumElements() == 2) {
3866       SmallVector<Register, 4> PackedRegs;
3867       Reg = B.buildBitcast(S32, Reg).getReg(0);
3868       PackedRegs.push_back(Reg);
3869       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3870       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3871           .getReg(0);
3872     }
3873 
3874     if (StoreVT.getNumElements() == 3) {
3875       SmallVector<Register, 4> PackedRegs;
3876       auto Unmerge = B.buildUnmerge(S16, Reg);
3877       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3878         PackedRegs.push_back(Unmerge.getReg(I));
3879       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3880       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3881       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3882     }
3883 
3884     if (StoreVT.getNumElements() == 4) {
3885       SmallVector<Register, 4> PackedRegs;
3886       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3887       auto Unmerge = B.buildUnmerge(S32, Reg);
3888       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3889         PackedRegs.push_back(Unmerge.getReg(I));
3890       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3891       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3892           .getReg(0);
3893     }
3894 
3895     llvm_unreachable("invalid data type");
3896   }
3897 
3898   if (StoreVT == LLT::fixed_vector(3, S16)) {
3899     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3900               .getReg(0);
3901   }
3902   return Reg;
3903 }
3904 
3905 Register AMDGPULegalizerInfo::fixStoreSourceType(
3906   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3907   MachineRegisterInfo *MRI = B.getMRI();
3908   LLT Ty = MRI->getType(VData);
3909 
3910   const LLT S16 = LLT::scalar(16);
3911 
3912   // Fixup illegal register types for 8-bit/16-bit stores.
3913   if (Ty == LLT::scalar(8) || Ty == S16) {
3914     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3915     return AnyExt;
3916   }
3917 
3918   if (Ty.isVector()) {
3919     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3920       if (IsFormat)
3921         return handleD16VData(B, *MRI, VData);
3922     }
3923   }
3924 
3925   return VData;
3926 }
3927 
3928 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3929                                               MachineRegisterInfo &MRI,
3930                                               MachineIRBuilder &B,
3931                                               bool IsTyped,
3932                                               bool IsFormat) const {
3933   Register VData = MI.getOperand(1).getReg();
3934   LLT Ty = MRI.getType(VData);
3935   LLT EltTy = Ty.getScalarType();
3936   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3937   const LLT S32 = LLT::scalar(32);
3938 
3939   VData = fixStoreSourceType(B, VData, IsFormat);
3940   Register RSrc = MI.getOperand(2).getReg();
3941 
3942   MachineMemOperand *MMO = *MI.memoperands_begin();
3943   const int MemSize = MMO->getSize();
3944 
3945   unsigned ImmOffset;
3946 
3947   // The typed intrinsics add an immediate after the registers.
3948   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3949 
3950   // The struct intrinsic variants add one additional operand over raw.
3951   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3952   Register VIndex;
3953   int OpOffset = 0;
3954   if (HasVIndex) {
3955     VIndex = MI.getOperand(3).getReg();
3956     OpOffset = 1;
3957   } else {
3958     VIndex = B.buildConstant(S32, 0).getReg(0);
3959   }
3960 
3961   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3962   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3963 
3964   unsigned Format = 0;
3965   if (IsTyped) {
3966     Format = MI.getOperand(5 + OpOffset).getImm();
3967     ++OpOffset;
3968   }
3969 
3970   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3971 
3972   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3973   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3974 
3975   unsigned Opc;
3976   if (IsTyped) {
3977     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3978                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3979   } else if (IsFormat) {
3980     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3981                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3982   } else {
3983     switch (MemSize) {
3984     case 1:
3985       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3986       break;
3987     case 2:
3988       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3989       break;
3990     default:
3991       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3992       break;
3993     }
3994   }
3995 
3996   auto MIB = B.buildInstr(Opc)
3997     .addUse(VData)              // vdata
3998     .addUse(RSrc)               // rsrc
3999     .addUse(VIndex)             // vindex
4000     .addUse(VOffset)            // voffset
4001     .addUse(SOffset)            // soffset
4002     .addImm(ImmOffset);         // offset(imm)
4003 
4004   if (IsTyped)
4005     MIB.addImm(Format);
4006 
4007   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4008      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4009      .addMemOperand(MMO);
4010 
4011   MI.eraseFromParent();
4012   return true;
4013 }
4014 
4015 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4016                                              MachineRegisterInfo &MRI,
4017                                              MachineIRBuilder &B,
4018                                              bool IsFormat,
4019                                              bool IsTyped) const {
4020   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4021   MachineMemOperand *MMO = *MI.memoperands_begin();
4022   const LLT MemTy = MMO->getMemoryType();
4023   const LLT S32 = LLT::scalar(32);
4024 
4025   Register Dst = MI.getOperand(0).getReg();
4026   Register RSrc = MI.getOperand(2).getReg();
4027 
4028   // The typed intrinsics add an immediate after the registers.
4029   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4030 
4031   // The struct intrinsic variants add one additional operand over raw.
4032   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4033   Register VIndex;
4034   int OpOffset = 0;
4035   if (HasVIndex) {
4036     VIndex = MI.getOperand(3).getReg();
4037     OpOffset = 1;
4038   } else {
4039     VIndex = B.buildConstant(S32, 0).getReg(0);
4040   }
4041 
4042   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4043   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4044 
4045   unsigned Format = 0;
4046   if (IsTyped) {
4047     Format = MI.getOperand(5 + OpOffset).getImm();
4048     ++OpOffset;
4049   }
4050 
4051   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4052   unsigned ImmOffset;
4053 
4054   LLT Ty = MRI.getType(Dst);
4055   LLT EltTy = Ty.getScalarType();
4056   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4057   const bool Unpacked = ST.hasUnpackedD16VMem();
4058 
4059   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4060   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4061 
4062   unsigned Opc;
4063 
4064   if (IsTyped) {
4065     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4066                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4067   } else if (IsFormat) {
4068     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4069                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4070   } else {
4071     switch (MemTy.getSizeInBits()) {
4072     case 8:
4073       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4074       break;
4075     case 16:
4076       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4077       break;
4078     default:
4079       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4080       break;
4081     }
4082   }
4083 
4084   Register LoadDstReg;
4085 
4086   bool IsExtLoad =
4087       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4088   LLT UnpackedTy = Ty.changeElementSize(32);
4089 
4090   if (IsExtLoad)
4091     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4092   else if (Unpacked && IsD16 && Ty.isVector())
4093     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4094   else
4095     LoadDstReg = Dst;
4096 
4097   auto MIB = B.buildInstr(Opc)
4098     .addDef(LoadDstReg)         // vdata
4099     .addUse(RSrc)               // rsrc
4100     .addUse(VIndex)             // vindex
4101     .addUse(VOffset)            // voffset
4102     .addUse(SOffset)            // soffset
4103     .addImm(ImmOffset);         // offset(imm)
4104 
4105   if (IsTyped)
4106     MIB.addImm(Format);
4107 
4108   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4109      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4110      .addMemOperand(MMO);
4111 
4112   if (LoadDstReg != Dst) {
4113     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4114 
4115     // The result register was widened for an extending load; truncate it back.
4116     if (IsExtLoad)
4117       B.buildTrunc(Dst, LoadDstReg);
4118     else {
4119       // Repack to original 16-bit vector result
4120       // FIXME: G_TRUNC should work, but legalization currently fails
4121       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4122       SmallVector<Register, 4> Repack;
4123       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4124         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4125       B.buildMerge(Dst, Repack);
4126     }
4127   }
4128 
4129   MI.eraseFromParent();
4130   return true;
4131 }
4132 
4133 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4134                                                MachineIRBuilder &B,
4135                                                bool IsInc) const {
4136   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4137                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4138   B.buildInstr(Opc)
4139     .addDef(MI.getOperand(0).getReg())
4140     .addUse(MI.getOperand(2).getReg())
4141     .addUse(MI.getOperand(3).getReg())
4142     .cloneMemRefs(MI);
4143   MI.eraseFromParent();
4144   return true;
4145 }
4146 
4147 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4148   switch (IntrID) {
4149   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4150   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4151     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4152   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4153   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4154     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4155   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4156   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4157     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4158   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4159   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4160     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4161   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4162   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4163     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4164   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4165   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4166     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4167   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4168   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4169     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4170   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4171   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4172     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4173   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4174   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4175     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4176   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4177   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4178     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4179   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4180   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4181     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4182   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4183   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4184     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4185   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4186   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4187     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4188   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4189   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4190     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4191   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4192   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4193     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4194   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4195   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4196     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4197   default:
4198     llvm_unreachable("unhandled atomic opcode");
4199   }
4200 }
4201 
4202 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4203                                                MachineIRBuilder &B,
4204                                                Intrinsic::ID IID) const {
4205   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4206                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4207   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4208 
4209   Register Dst;
4210 
4211   int OpOffset = 0;
4212   if (HasReturn) {
4213     // A few FP atomics do not support return values.
4214     Dst = MI.getOperand(0).getReg();
4215   } else {
4216     OpOffset = -1;
4217   }
4218 
4219   Register VData = MI.getOperand(2 + OpOffset).getReg();
4220   Register CmpVal;
4221 
4222   if (IsCmpSwap) {
4223     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4224     ++OpOffset;
4225   }
4226 
4227   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4228   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4229 
4230   // The struct intrinsic variants add one additional operand over raw.
4231   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4232   Register VIndex;
4233   if (HasVIndex) {
4234     VIndex = MI.getOperand(4 + OpOffset).getReg();
4235     ++OpOffset;
4236   } else {
4237     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4238   }
4239 
4240   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4241   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4242   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4243 
4244   MachineMemOperand *MMO = *MI.memoperands_begin();
4245 
4246   unsigned ImmOffset;
4247   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4248   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4249 
4250   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4251 
4252   if (HasReturn)
4253     MIB.addDef(Dst);
4254 
4255   MIB.addUse(VData); // vdata
4256 
4257   if (IsCmpSwap)
4258     MIB.addReg(CmpVal);
4259 
4260   MIB.addUse(RSrc)               // rsrc
4261      .addUse(VIndex)             // vindex
4262      .addUse(VOffset)            // voffset
4263      .addUse(SOffset)            // soffset
4264      .addImm(ImmOffset)          // offset(imm)
4265      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4266      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4267      .addMemOperand(MMO);
4268 
4269   MI.eraseFromParent();
4270   return true;
4271 }
4272 
4273 /// Pack the vaddr operands of image instruction \p MI into dword sized values
4274 /// (pairs of s16 packed into <2 x s16>), appending them to \p PackedAddrs.
4275 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4276                                       SmallVectorImpl<Register> &PackedAddrs,
4277                                       unsigned ArgOffset,
4278                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4279                                       bool IsA16, bool IsG16) {
4280   const LLT S16 = LLT::scalar(16);
4281   const LLT V2S16 = LLT::fixed_vector(2, 16);
4282   auto EndIdx = Intr->VAddrEnd;
4283 
4284   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4285     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4286     if (!SrcOp.isReg())
4287       continue; // _L to _LZ may have eliminated this.
4288 
4289     Register AddrReg = SrcOp.getReg();
4290 
4291     if ((I < Intr->GradientStart) ||
4292         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4293         (I >= Intr->CoordStart && !IsA16)) {
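      // These components each keep a full dword: arguments before the
      // gradients (the bias gets special handling below when A16 is on),
      // gradients when G16 is off, and coordinates when A16 is off.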
4294       if ((I < Intr->GradientStart) && IsA16 &&
4295           (B.getMRI()->getType(AddrReg) == S16)) {
4296         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4297         // Special handling of bias when A16 is on. Bias is of type half but
4298         // occupies a full 32-bit slot.
4299         PackedAddrs.push_back(
4300             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4301                 .getReg(0));
4302       } else {
4303         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4304                "Bias needs to be converted to 16 bit in A16 mode");
4305         // Handle any gradient or coordinate operands that should not be packed
4306         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4307         PackedAddrs.push_back(AddrReg);
4308       }
4309     } else {
4310       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4311       // derivatives dx/dh and dx/dv are packed with undef.
4312       if (((I + 1) >= EndIdx) ||
4313           ((Intr->NumGradients / 2) % 2 == 1 &&
4314            (I == static_cast<unsigned>(Intr->GradientStart +
4315                                        (Intr->NumGradients / 2) - 1) ||
4316             I == static_cast<unsigned>(Intr->GradientStart +
4317                                        Intr->NumGradients - 1))) ||
4318           // Check for _L to _LZ optimization
4319           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4320         PackedAddrs.push_back(
4321             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4322                 .getReg(0));
4323       } else {
4324         PackedAddrs.push_back(
4325             B.buildBuildVector(
4326                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4327                 .getReg(0));
4328         ++I;
4329       }
4330     }
4331   }
4332 }
4333 
4334 /// Convert from separate vaddr components to a single vector address register,
4335 /// and replace the remaining operands with $noreg.
4336 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4337                                      int DimIdx, int NumVAddrs) {
4338   const LLT S32 = LLT::scalar(32);
4339 
4340   SmallVector<Register, 8> AddrRegs;
4341   for (int I = 0; I != NumVAddrs; ++I) {
4342     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4343     if (SrcOp.isReg()) {
4344       AddrRegs.push_back(SrcOp.getReg());
4345       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4346     }
4347   }
4348 
4349   int NumAddrRegs = AddrRegs.size();
4350   if (NumAddrRegs != 1) {
4351     // With more than 8 elements, round up to the next power of 2 (i.e. 16).
4352     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4353       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4354       auto Undef = B.buildUndef(S32);
4355       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4356       NumAddrRegs = RoundedNumRegs;
4357     }
4358 
4359     auto VAddr =
4360         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4361     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4362   }
4363 
4364   for (int I = 1; I != NumVAddrs; ++I) {
4365     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4366     if (SrcOp.isReg())
4367       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4368   }
4369 }
4370 
4371 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4372 ///
4373 /// Depending on the subtarget, load/store with 16-bit element data need to be
4374 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4375 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4376 /// registers.
4377 ///
4378 /// We don't want to directly select image instructions just yet, but also want
4379 /// to expose all register repacking to the legalizer/combiners. We also don't
4380 /// want a selected instruction entering RegBankSelect. In order to avoid
4381 /// defining a multitude of intermediate image instructions, directly hack on
4382 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4383 /// padding now unnecessary arguments with $noreg.
4384 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4385     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4386     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4387 
4388   const unsigned NumDefs = MI.getNumExplicitDefs();
4389   const unsigned ArgOffset = NumDefs + 1;
4390   bool IsTFE = NumDefs == 2;
4391   // We are only processing the operands of d16 image operations on subtargets
4392   // that use the unpacked register layout, or need to repack the TFE result.
4393 
4394   // TODO: Do we need to guard against already legalized intrinsics?
4395   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4396       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4397 
4398   MachineRegisterInfo *MRI = B.getMRI();
4399   const LLT S32 = LLT::scalar(32);
4400   const LLT S16 = LLT::scalar(16);
4401   const LLT V2S16 = LLT::fixed_vector(2, 16);
4402 
4403   unsigned DMask = 0;
4404   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4405   LLT Ty = MRI->getType(VData);
4406 
4407   // Check for 16-bit addresses and pack them if found.
4408   LLT GradTy =
4409       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4410   LLT AddrTy =
4411       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4412   const bool IsG16 = GradTy == S16;
4413   const bool IsA16 = AddrTy == S16;
4414   const bool IsD16 = Ty.getScalarType() == S16;
4415 
4416   int DMaskLanes = 0;
4417   if (!BaseOpcode->Atomic) {
4418     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4419     if (BaseOpcode->Gather4) {
4420       DMaskLanes = 4;
4421     } else if (DMask != 0) {
4422       DMaskLanes = countPopulation(DMask);
4423     } else if (!IsTFE && !BaseOpcode->Store) {
4424       // If dmask is 0, this is a no-op load. This can be eliminated.
4425       B.buildUndef(MI.getOperand(0));
4426       MI.eraseFromParent();
4427       return true;
4428     }
4429   }
4430 
4431   Observer.changingInstr(MI);
4432   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4433 
4434   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4435                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4436   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4437                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4438   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4439 
4440   // Track that we legalized this
4441   MI.setDesc(B.getTII().get(NewOpcode));
4442 
4443   // Expecting to get an error flag since TFC is on - and dmask is 0. Force
4444   // dmask to be at least 1, otherwise the instruction will fail.
4445   if (IsTFE && DMask == 0) {
4446     DMask = 0x1;
4447     DMaskLanes = 1;
4448     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4449   }
4450 
4451   if (BaseOpcode->Atomic) {
4452     Register VData0 = MI.getOperand(2).getReg();
4453     LLT Ty = MRI->getType(VData0);
4454 
4455     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4456     if (Ty.isVector())
4457       return false;
4458 
4459     if (BaseOpcode->AtomicX2) {
4460       Register VData1 = MI.getOperand(3).getReg();
4461       // The two values are packed in one register.
4462       LLT PackedTy = LLT::fixed_vector(2, Ty);
4463       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4464       MI.getOperand(2).setReg(Concat.getReg(0));
4465       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4466     }
4467   }
4468 
4469   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4470 
4471   // Rewrite the addressing register layout before doing anything else.
4472   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4473     // 16 bit gradients are supported, but are tied to the A16 control, so
4474     // both gradients and addresses must be 16 bit.
4475     return false;
4476   }
4477 
4478   if (IsA16 && !ST.hasA16()) {
4479     // A16 not supported
4480     return false;
4481   }
4482 
4483   if (IsA16 || IsG16) {
4484     if (Intr->NumVAddrs > 1) {
4485       SmallVector<Register, 4> PackedRegs;
4486 
4487       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4488                                 IsG16);
4489 
4490       // See also below in the non-a16 branch
4491       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4492                           PackedRegs.size() <= ST.getNSAMaxSize();
4493 
4494       if (!UseNSA && PackedRegs.size() > 1) {
4495         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4496         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4497         PackedRegs[0] = Concat.getReg(0);
4498         PackedRegs.resize(1);
4499       }
4500 
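      // Rewrite the original vaddr operands: the first NumPacked operands take
      // the packed registers, and the remainder become $noreg.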
4501       const unsigned NumPacked = PackedRegs.size();
4502       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4503         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4504         if (!SrcOp.isReg()) {
4505           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4506           continue;
4507         }
4508 
4509         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4510 
4511         if (I - Intr->VAddrStart < NumPacked)
4512           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4513         else
4514           SrcOp.setReg(AMDGPU::NoRegister);
4515       }
4516     }
4517   } else {
4518     // If the register allocator cannot place the address registers contiguously
4519     // without introducing moves, then using the non-sequential address encoding
4520     // is always preferable, since it saves VALU instructions and is usually a
4521     // wash in terms of code size or even better.
4522     //
4523     // However, we currently have no way of hinting to the register allocator
4524     // that MIMG addresses should be placed contiguously when it is possible to
4525     // do so, so force non-NSA for the common 2-address case as a heuristic.
4526     //
4527     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4528     // allocation when possible.
4529     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4530                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4531 
4532     if (!UseNSA && Intr->NumVAddrs > 1)
4533       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4534                                Intr->NumVAddrs);
4535   }
4536 
4537   int Flags = 0;
4538   if (IsA16)
4539     Flags |= 1;
4540   if (IsG16)
4541     Flags |= 2;
4542   MI.addOperand(MachineOperand::CreateImm(Flags));
4543 
4544   if (BaseOpcode->Store) { // No TFE for stores?
4545     // TODO: Handle dmask trim
4546     if (!Ty.isVector() || !IsD16)
4547       return true;
4548 
4549     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4550     if (RepackedReg != VData) {
4551       MI.getOperand(1).setReg(RepackedReg);
4552     }
4553 
4554     return true;
4555   }
4556 
4557   Register DstReg = MI.getOperand(0).getReg();
4558   const LLT EltTy = Ty.getScalarType();
4559   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4560 
4561   // Confirm that the return type is large enough for the dmask specified
4562   if (NumElts < DMaskLanes)
4563     return false;
4564 
4565   if (NumElts > 4 || DMaskLanes > 4)
4566     return false;
4567 
4568   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4569   const LLT AdjustedTy =
4570       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4571 
4572   // The raw dword aligned data component of the load. The only legal cases
4573   // where this matters should be when using the packed D16 format, for
4574   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4575   LLT RoundedTy;
4576 
4577   // S32 vector to cover all data, plus the TFE result element.
4578   LLT TFETy;
4579 
4580   // Register type to use for each loaded component. Will be S32 or V2S16.
4581   LLT RegTy;
4582 
4583   if (IsD16 && ST.hasUnpackedD16VMem()) {
4584     RoundedTy =
4585         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4586     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4587     RegTy = S32;
4588   } else {
4589     unsigned EltSize = EltTy.getSizeInBits();
4590     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4591     unsigned RoundedSize = 32 * RoundedElts;
4592     RoundedTy = LLT::scalarOrVector(
4593         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4594     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4595     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4596   }
4597 
4598   // The return type does not need adjustment.
4599   // TODO: Should we change s16 case to s32 or <2 x s16>?
4600   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4601     return true;
4602 
4603   Register Dst1Reg;
4604 
4605   // Insert after the instruction.
4606   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4607 
4608   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4609   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4610   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
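  // Number of 32-bit registers the raw load result occupies, including the
  // extra TFE status dword when TFE is enabled.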
4611   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4612 
4613   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4614 
4615   MI.getOperand(0).setReg(NewResultReg);
4616 
4617   // In the IR, TFE is supposed to be used with a 2 element struct return
4618   // type. The instruction really returns these two values in one contiguous
4619   // register, with one additional dword beyond the loaded data. Rewrite the
4620   // return type to use a single register result.
4621 
4622   if (IsTFE) {
4623     Dst1Reg = MI.getOperand(1).getReg();
4624     if (MRI->getType(Dst1Reg) != S32)
4625       return false;
4626 
4627     // TODO: Make sure the TFE operand bit is set.
4628     MI.RemoveOperand(1);
4629 
4630     // Handle the easy case that requires no repack instructions.
4631     if (Ty == S32) {
4632       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4633       return true;
4634     }
4635   }
4636 
4637   // Now figure out how to copy the new result register back into the old
4638   // result.
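  // Seed every slot with Dst1Reg so that, for TFE, the trailing status element
  // of the unmerge below is written directly to its destination.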
4639   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4640 
4641   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4642 
4643   if (ResultNumRegs == 1) {
4644     assert(!IsTFE);
4645     ResultRegs[0] = NewResultReg;
4646   } else {
4647     // We have to repack into a new vector of some kind.
4648     for (int I = 0; I != NumDataRegs; ++I)
4649       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4650     B.buildUnmerge(ResultRegs, NewResultReg);
4651 
4652     // Drop the final TFE element to get the data part. The TFE result is
4653     // directly written to the right place already.
4654     if (IsTFE)
4655       ResultRegs.resize(NumDataRegs);
4656   }
4657 
4658   // For an s16 scalar result, we form an s32 result with a truncate regardless
4659   // of packed vs. unpacked.
4660   if (IsD16 && !Ty.isVector()) {
4661     B.buildTrunc(DstReg, ResultRegs[0]);
4662     return true;
4663   }
4664 
4665   // Avoid a build/concat_vector of 1 entry.
4666   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4667     B.buildBitcast(DstReg, ResultRegs[0]);
4668     return true;
4669   }
4670 
4671   assert(Ty.isVector());
4672 
4673   if (IsD16) {
4674     // For packed D16 results with TFE enabled, all the data components are
4675     // S32. Cast back to the expected type.
4676     //
4677     // TODO: We don't really need to load s32 elements. We would only need
4678     // one cast for the TFE result if a multiple of v2s16 was used.
4679     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4680       for (Register &Reg : ResultRegs)
4681         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4682     } else if (ST.hasUnpackedD16VMem()) {
4683       for (Register &Reg : ResultRegs)
4684         Reg = B.buildTrunc(S16, Reg).getReg(0);
4685     }
4686   }
4687 
4688   auto padWithUndef = [&](LLT Ty, int NumElts) {
4689     if (NumElts == 0)
4690       return;
4691     Register Undef = B.buildUndef(Ty).getReg(0);
4692     for (int I = 0; I != NumElts; ++I)
4693       ResultRegs.push_back(Undef);
4694   };
4695 
4696   // Pad out any elements eliminated due to the dmask.
4697   LLT ResTy = MRI->getType(ResultRegs[0]);
4698   if (!ResTy.isVector()) {
4699     padWithUndef(ResTy, NumElts - ResultRegs.size());
4700     B.buildBuildVector(DstReg, ResultRegs);
4701     return true;
4702   }
4703 
4704   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4705   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4706 
4707   // Deal with the one annoying legal case.
4708   const LLT V3S16 = LLT::fixed_vector(3, 16);
4709   if (Ty == V3S16) {
4710     if (IsTFE) {
4711       if (ResultRegs.size() == 1) {
4712         NewResultReg = ResultRegs[0];
4713       } else if (ResultRegs.size() == 2) {
4714         LLT V4S16 = LLT::fixed_vector(4, 16);
4715         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4716       } else {
4717         return false;
4718       }
4719     }
4720 
4721     if (MRI->getType(DstReg).getNumElements() <
4722         MRI->getType(NewResultReg).getNumElements()) {
4723       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4724     } else {
4725       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4726     }
4727     return true;
4728   }
4729 
4730   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4731   B.buildConcatVectors(DstReg, ResultRegs);
4732   return true;
4733 }
4734 
4735 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4736   LegalizerHelper &Helper, MachineInstr &MI) const {
4737   MachineIRBuilder &B = Helper.MIRBuilder;
4738   GISelChangeObserver &Observer = Helper.Observer;
4739 
4740   Register Dst = MI.getOperand(0).getReg();
4741   LLT Ty = B.getMRI()->getType(Dst);
4742   unsigned Size = Ty.getSizeInBits();
4743   MachineFunction &MF = B.getMF();
4744 
4745   Observer.changingInstr(MI);
4746 
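  // If the result type is easier to handle as a plain scalar or a vector of
  // s32 (e.g. vectors with sub-dword elements), bitcast the destination first.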
4747   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4748     Ty = getBitcastRegisterType(Ty);
4749     Helper.bitcastDst(MI, Ty, 0);
4750     Dst = MI.getOperand(0).getReg();
4751     B.setInsertPt(B.getMBB(), MI);
4752   }
4753 
4754   // FIXME: We don't really need this intermediate instruction. The intrinsic
4755   // should be fixed to have a memory operand. Since it's readnone, we're not
4756   // allowed to add one.
4757   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4758   MI.RemoveOperand(1); // Remove intrinsic ID
4759 
4760   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4761   // TODO: Should this use datalayout alignment?
4762   const unsigned MemSize = (Size + 7) / 8;
4763   const Align MemAlign(4);
4764   MachineMemOperand *MMO = MF.getMachineMemOperand(
4765       MachinePointerInfo(),
4766       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4767           MachineMemOperand::MOInvariant,
4768       MemSize, MemAlign);
4769   MI.addMemOperand(MF, MMO);
4770 
4771   // There are no 96-bit result scalar loads, but widening to 128-bit should
4772   // always be legal. We may need to restore this to a 96-bit result if it turns
4773   // out this needs to be converted to a vector load during RegBankSelect.
4774   if (!isPowerOf2_32(Size)) {
4775     if (Ty.isVector())
4776       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4777     else
4778       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4779   }
4780 
4781   Observer.changedInstr(MI);
4782   return true;
4783 }
4784 
4785 // TODO: Move to selection
4786 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4787                                                 MachineRegisterInfo &MRI,
4788                                                 MachineIRBuilder &B) const {
4789   if (!ST.isTrapHandlerEnabled() ||
4790       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4791     return legalizeTrapEndpgm(MI, MRI, B);
4792 
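  // Code object V2 and V3 pass the queue pointer to the trap handler; V4 and
  // later use the doorbell ID instead when the subtarget can provide it.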
4793   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4794     switch (*HsaAbiVer) {
4795     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4796     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4797       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4798     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4799     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4800       return ST.supportsGetDoorbellID() ?
4801           legalizeTrapHsa(MI, MRI, B) :
4802           legalizeTrapHsaQueuePtr(MI, MRI, B);
4803     }
4804   }
4805 
4806   llvm_unreachable("Unknown trap handler");
4807 }
4808 
4809 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4810     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4811   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4812   MI.eraseFromParent();
4813   return true;
4814 }
4815 
4816 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4817     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4818   // Pass queue pointer to trap handler as input, and insert trap instruction
4819   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4820   Register LiveIn =
4821     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4822   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4823     return false;
4824 
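  // The trap handler expects the queue pointer in SGPR0_SGPR1.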
4825   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4826   B.buildCopy(SGPR01, LiveIn);
4827   B.buildInstr(AMDGPU::S_TRAP)
4828       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4829       .addReg(SGPR01, RegState::Implicit);
4830 
4831   MI.eraseFromParent();
4832   return true;
4833 }
4834 
4835 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4836     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4837   B.buildInstr(AMDGPU::S_TRAP)
4838       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4839   MI.eraseFromParent();
4840   return true;
4841 }
4842 
4843 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4844     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4845   // If this is a non-HSA path or the trap handler is disabled, report a
4846   // warning instead of emitting a trap.
4847   if (!ST.isTrapHandlerEnabled() ||
4848       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4849     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4850                                      "debugtrap handler not supported",
4851                                      MI.getDebugLoc(), DS_Warning);
4852     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4853     Ctx.diagnose(NoTrap);
4854   } else {
4855     // Insert debug-trap instruction
4856     B.buildInstr(AMDGPU::S_TRAP)
4857         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4858   }
4859 
4860   MI.eraseFromParent();
4861   return true;
4862 }
4863 
4864 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4865                                                MachineIRBuilder &B) const {
4866   MachineRegisterInfo &MRI = *B.getMRI();
4867   const LLT S16 = LLT::scalar(16);
4868   const LLT S32 = LLT::scalar(32);
4869 
4870   Register DstReg = MI.getOperand(0).getReg();
4871   Register NodePtr = MI.getOperand(2).getReg();
4872   Register RayExtent = MI.getOperand(3).getReg();
4873   Register RayOrigin = MI.getOperand(4).getReg();
4874   Register RayDir = MI.getOperand(5).getReg();
4875   Register RayInvDir = MI.getOperand(6).getReg();
4876   Register TDescr = MI.getOperand(7).getReg();
4877 
4878   if (!ST.hasGFX10_AEncoding()) {
4879     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4880                                         "intrinsic not supported on subtarget",
4881                                         MI.getDebugLoc());
4882     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4883     return false;
4884   }
4885 
4886   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4887   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4888   const unsigned NumVDataDwords = 4;
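  // vaddr dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir and ray_inv_dir (3 packed dwords with A16, otherwise 3 + 3).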
4889   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4890   const bool UseNSA =
4891       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4892   const unsigned BaseOpcodes[2][2] = {
4893       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4894       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4895        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4896   int Opcode;
4897   if (UseNSA) {
4898     Opcode =
4899         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4900                               NumVDataDwords, NumVAddrDwords);
4901   } else {
4902     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4903                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4904                                    PowerOf2Ceil(NumVAddrDwords));
4905   }
4906   assert(Opcode != -1);
4907 
4908   SmallVector<Register, 12> Ops;
4909   if (Is64) {
4910     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4911     Ops.push_back(Unmerge.getReg(0));
4912     Ops.push_back(Unmerge.getReg(1));
4913   } else {
4914     Ops.push_back(NodePtr);
4915   }
4916   Ops.push_back(RayExtent);
4917 
4918   auto packLanes = [&Ops, &S32, &B](Register Src) {
4919     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4920     Ops.push_back(Unmerge.getReg(0));
4921     Ops.push_back(Unmerge.getReg(1));
4922     Ops.push_back(Unmerge.getReg(2));
4923   };
4924 
4925   packLanes(RayOrigin);
4926   if (IsA16) {
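    // Repack the six 16-bit ray_dir/ray_inv_dir components into three dwords.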
4927     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4928     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4929     Register R1 = MRI.createGenericVirtualRegister(S32);
4930     Register R2 = MRI.createGenericVirtualRegister(S32);
4931     Register R3 = MRI.createGenericVirtualRegister(S32);
4932     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4933     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4934     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4935     Ops.push_back(R1);
4936     Ops.push_back(R2);
4937     Ops.push_back(R3);
4938   } else {
4939     packLanes(RayDir);
4940     packLanes(RayInvDir);
4941   }
4942 
4943   if (!UseNSA) {
4944     // Build a single vector containing all the operands prepared so far.
4945     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4946     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4947     Ops.clear();
4948     Ops.push_back(MergedOps);
4949   }
4950 
4951   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4952     .addDef(DstReg)
4953     .addImm(Opcode);
4954 
4955   for (Register R : Ops) {
4956     MIB.addUse(R);
4957   }
4958 
4959   MIB.addUse(TDescr)
4960      .addImm(IsA16 ? 1 : 0)
4961      .cloneMemRefs(MI);
4962 
4963   MI.eraseFromParent();
4964   return true;
4965 }
4966 
4967 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4968   B.buildConstant(MI.getOperand(0).getReg(), C);
4969   MI.eraseFromParent();
4970   return true;
4971 }
4972 
4973 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
4974                                                MachineIRBuilder &B) const {
4975   unsigned Opc;
4976   int RoundMode = MI.getOperand(2).getImm();
4977 
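  // Only the towards-positive and towards-negative rounding modes have
  // dedicated pseudos; other rounding modes are rejected.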
4978   if (RoundMode == (int)RoundingMode::TowardPositive)
4979     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
4980   else if (RoundMode == (int)RoundingMode::TowardNegative)
4981     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
4982   else
4983     return false;
4984 
4985   B.buildInstr(Opc)
4986       .addDef(MI.getOperand(0).getReg())
4987       .addUse(MI.getOperand(1).getReg());
4988 
4989   MI.eraseFromParent();
4990 
4991   return true;
4992 }
4993 
4994 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4995                                             MachineInstr &MI) const {
4996   MachineIRBuilder &B = Helper.MIRBuilder;
4997   MachineRegisterInfo &MRI = *B.getMRI();
4998 
4999   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
5000   auto IntrID = MI.getIntrinsicID();
5001   switch (IntrID) {
5002   case Intrinsic::amdgcn_if:
5003   case Intrinsic::amdgcn_else: {
5004     MachineInstr *Br = nullptr;
5005     MachineBasicBlock *UncondBrTarget = nullptr;
5006     bool Negated = false;
5007     if (MachineInstr *BrCond =
5008             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5009       const SIRegisterInfo *TRI
5010         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5011 
5012       Register Def = MI.getOperand(1).getReg();
5013       Register Use = MI.getOperand(3).getReg();
5014 
5015       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5016 
5017       if (Negated)
5018         std::swap(CondBrTarget, UncondBrTarget);
5019 
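      // The SI_IF/SI_ELSE pseudo branches to the block the unconditional
      // branch targeted, while the trailing G_BR is redirected to the original
      // conditional successor.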
5020       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5021       if (IntrID == Intrinsic::amdgcn_if) {
5022         B.buildInstr(AMDGPU::SI_IF)
5023           .addDef(Def)
5024           .addUse(Use)
5025           .addMBB(UncondBrTarget);
5026       } else {
5027         B.buildInstr(AMDGPU::SI_ELSE)
5028             .addDef(Def)
5029             .addUse(Use)
5030             .addMBB(UncondBrTarget);
5031       }
5032 
5033       if (Br) {
5034         Br->getOperand(0).setMBB(CondBrTarget);
5035       } else {
5036         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5037         // since we're swapping branch targets it needs to be reinserted.
5038         // FIXME: IRTranslator should probably not do this
5039         B.buildBr(*CondBrTarget);
5040       }
5041 
5042       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5043       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5044       MI.eraseFromParent();
5045       BrCond->eraseFromParent();
5046       return true;
5047     }
5048 
5049     return false;
5050   }
5051   case Intrinsic::amdgcn_loop: {
5052     MachineInstr *Br = nullptr;
5053     MachineBasicBlock *UncondBrTarget = nullptr;
5054     bool Negated = false;
5055     if (MachineInstr *BrCond =
5056             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5057       const SIRegisterInfo *TRI
5058         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5059 
5060       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5061       Register Reg = MI.getOperand(2).getReg();
5062 
5063       if (Negated)
5064         std::swap(CondBrTarget, UncondBrTarget);
5065 
5066       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5067       B.buildInstr(AMDGPU::SI_LOOP)
5068         .addUse(Reg)
5069         .addMBB(UncondBrTarget);
5070 
5071       if (Br)
5072         Br->getOperand(0).setMBB(CondBrTarget);
5073       else
5074         B.buildBr(*CondBrTarget);
5075 
5076       MI.eraseFromParent();
5077       BrCond->eraseFromParent();
5078       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5079       return true;
5080     }
5081 
5082     return false;
5083   }
5084   case Intrinsic::amdgcn_kernarg_segment_ptr:
5085     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5086       // This only makes sense to call in a kernel, so just lower to null.
5087       B.buildConstant(MI.getOperand(0).getReg(), 0);
5088       MI.eraseFromParent();
5089       return true;
5090     }
5091 
5092     return legalizePreloadedArgIntrin(
5093       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5094   case Intrinsic::amdgcn_implicitarg_ptr:
5095     return legalizeImplicitArgPtr(MI, MRI, B);
5096   case Intrinsic::amdgcn_workitem_id_x:
5097     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5098       return replaceWithConstant(B, MI, 0);
5099     return legalizePreloadedArgIntrin(MI, MRI, B,
5100                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5101   case Intrinsic::amdgcn_workitem_id_y:
5102     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5103       return replaceWithConstant(B, MI, 0);
5104 
5105     return legalizePreloadedArgIntrin(MI, MRI, B,
5106                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5107   case Intrinsic::amdgcn_workitem_id_z:
5108     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5109       return replaceWithConstant(B, MI, 0);
5110 
5111     return legalizePreloadedArgIntrin(MI, MRI, B,
5112                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5113   case Intrinsic::amdgcn_workgroup_id_x:
5114     return legalizePreloadedArgIntrin(MI, MRI, B,
5115                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5116   case Intrinsic::amdgcn_workgroup_id_y:
5117     return legalizePreloadedArgIntrin(MI, MRI, B,
5118                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5119   case Intrinsic::amdgcn_workgroup_id_z:
5120     return legalizePreloadedArgIntrin(MI, MRI, B,
5121                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5122   case Intrinsic::amdgcn_dispatch_ptr:
5123     return legalizePreloadedArgIntrin(MI, MRI, B,
5124                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5125   case Intrinsic::amdgcn_queue_ptr:
5126     return legalizePreloadedArgIntrin(MI, MRI, B,
5127                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5128   case Intrinsic::amdgcn_implicit_buffer_ptr:
5129     return legalizePreloadedArgIntrin(
5130       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5131   case Intrinsic::amdgcn_dispatch_id:
5132     return legalizePreloadedArgIntrin(MI, MRI, B,
5133                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5134   case Intrinsic::amdgcn_fdiv_fast:
5135     return legalizeFDIVFastIntrin(MI, MRI, B);
5136   case Intrinsic::amdgcn_is_shared:
5137     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5138   case Intrinsic::amdgcn_is_private:
5139     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5140   case Intrinsic::amdgcn_wavefrontsize: {
5141     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5142     MI.eraseFromParent();
5143     return true;
5144   }
5145   case Intrinsic::amdgcn_s_buffer_load:
5146     return legalizeSBufferLoad(Helper, MI);
5147   case Intrinsic::amdgcn_raw_buffer_store:
5148   case Intrinsic::amdgcn_struct_buffer_store:
5149     return legalizeBufferStore(MI, MRI, B, false, false);
5150   case Intrinsic::amdgcn_raw_buffer_store_format:
5151   case Intrinsic::amdgcn_struct_buffer_store_format:
5152     return legalizeBufferStore(MI, MRI, B, false, true);
5153   case Intrinsic::amdgcn_raw_tbuffer_store:
5154   case Intrinsic::amdgcn_struct_tbuffer_store:
5155     return legalizeBufferStore(MI, MRI, B, true, true);
5156   case Intrinsic::amdgcn_raw_buffer_load:
5157   case Intrinsic::amdgcn_struct_buffer_load:
5158     return legalizeBufferLoad(MI, MRI, B, false, false);
5159   case Intrinsic::amdgcn_raw_buffer_load_format:
5160   case Intrinsic::amdgcn_struct_buffer_load_format:
5161     return legalizeBufferLoad(MI, MRI, B, true, false);
5162   case Intrinsic::amdgcn_raw_tbuffer_load:
5163   case Intrinsic::amdgcn_struct_tbuffer_load:
5164     return legalizeBufferLoad(MI, MRI, B, true, true);
5165   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5166   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5167   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5168   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5169   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5170   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5171   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5172   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5173   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5174   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5175   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5176   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5177   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5178   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5179   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5180   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5181   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5182   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5183   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5184   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5185   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5186   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5187   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5188   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5189   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5190   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5191   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5192   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5193   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5194   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5195     return legalizeBufferAtomic(MI, B, IntrID);
5196   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5197   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5198     Register DstReg = MI.getOperand(0).getReg();
5199     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5200       Function &F = B.getMF().getFunction();
5201       DiagnosticInfoUnsupported NoFpRet(
5202           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5203           DS_Error);
5204       F.getContext().diagnose(NoFpRet);
5205       B.buildUndef(DstReg);
5206       MI.eraseFromParent();
5207       return true;
5208     }
5209 
5210     return legalizeBufferAtomic(MI, B, IntrID);
5211   }
5212   case Intrinsic::amdgcn_atomic_inc:
5213     return legalizeAtomicIncDec(MI, B, true);
5214   case Intrinsic::amdgcn_atomic_dec:
5215     return legalizeAtomicIncDec(MI, B, false);
5216   case Intrinsic::trap:
5217     return legalizeTrapIntrinsic(MI, MRI, B);
5218   case Intrinsic::debugtrap:
5219     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5220   case Intrinsic::amdgcn_rsq_clamp:
5221     return legalizeRsqClampIntrinsic(MI, MRI, B);
5222   case Intrinsic::amdgcn_ds_fadd:
5223   case Intrinsic::amdgcn_ds_fmin:
5224   case Intrinsic::amdgcn_ds_fmax:
5225     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5226   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5227     return legalizeBVHIntrinsic(MI, B);
5228   default: {
5229     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5230             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5231       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5232     return true;
5233   }
5234   }
5235 
5236   return true;
5237 }
5238