1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
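// (e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>).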
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
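// (e.g. s24 -> s32, s48 -> s64, s96 -> s128).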
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
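// Reduce the number of vector elements so that each resulting piece is at most
// 64 bits wide, e.g. <8 x s16> -> <4 x s16> and <4 x s32> -> <2 x s32>. Only
// used below for vectors wider than 64 bits.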
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
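// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and <5 x s16>
// (80 bits) becomes <6 x s16> (96 bits).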
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
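  // Larger sizes are expressed with 32-bit elements,
  // e.g. <6 x s16> -> <3 x s32> and <8 x s8> -> <2 x s32>.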
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
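// For example, s32, s96, s1024, <2 x s16>, <4 x s16>, <2 x s32> and <8 x s64>
// all qualify.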
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
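// For example, an extending load of 16 bits from memory into an s64 result is
// first narrowed to an s32 result.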
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const unsigned MemSizeInBits) {
365   const unsigned Size = Ty.getSizeInBits();
366   if (Size != MemSizeInBits)
367     return Size <= 32 && Ty.isVector();
368 
369   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
370     return true;
371   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
372          !isRegisterVectorElementType(Ty.getElementType());
373 }
374 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
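/// For example, a 24-bit load with 32-bit alignment may be widened to a full
/// 32-bit load, provided the target allows the rounded-up access and reports
/// it as fast.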
378 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
379                             unsigned AlignInBits, unsigned AddrSpace,
380                             unsigned Opcode) {
381   // We don't want to widen cases that are naturally legal.
382   if (isPowerOf2_32(SizeInBits))
383     return false;
384 
385   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
386   // end up widening these for a scalar load during RegBankSelect, since there
387   // aren't 96-bit scalar loads.
388   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
389     return false;
390 
391   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
392     return false;
393 
394   // A load is known dereferenceable up to the alignment, so it's legal to widen
395   // to it.
396   //
397   // TODO: Could check dereferenceable for less aligned cases.
398   unsigned RoundedSize = NextPowerOf2(SizeInBits);
399   if (AlignInBits < RoundedSize)
400     return false;
401 
402   // Do not widen if it would introduce a slow unaligned load.
403   const SITargetLowering *TLI = ST.getTargetLowering();
404   bool Fast = false;
405   return TLI->allowsMisalignedMemoryAccessesImpl(
406              RoundedSize, AddrSpace, Align(AlignInBits / 8),
407              MachineMemOperand::MOLoad, &Fast) &&
408          Fast;
409 }
410 
411 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
412                             unsigned Opcode) {
413   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
414     return false;
415 
416   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
417                          Query.MMODescrs[0].AlignInBits,
418                          Query.Types[1].getAddressSpace(), Opcode);
419 }
420 
421 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
422                                          const GCNTargetMachine &TM)
  : ST(ST_) {
424   using namespace TargetOpcode;
425 
426   auto GetAddrSpacePtr = [&TM](unsigned AS) {
427     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
428   };
429 
430   const LLT S1 = LLT::scalar(1);
431   const LLT S8 = LLT::scalar(8);
432   const LLT S16 = LLT::scalar(16);
433   const LLT S32 = LLT::scalar(32);
434   const LLT S64 = LLT::scalar(64);
435   const LLT S128 = LLT::scalar(128);
436   const LLT S256 = LLT::scalar(256);
437   const LLT S512 = LLT::scalar(512);
438   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
439 
440   const LLT V2S8 = LLT::fixed_vector(2, 8);
441   const LLT V2S16 = LLT::fixed_vector(2, 16);
442   const LLT V4S16 = LLT::fixed_vector(4, 16);
443 
444   const LLT V2S32 = LLT::fixed_vector(2, 32);
445   const LLT V3S32 = LLT::fixed_vector(3, 32);
446   const LLT V4S32 = LLT::fixed_vector(4, 32);
447   const LLT V5S32 = LLT::fixed_vector(5, 32);
448   const LLT V6S32 = LLT::fixed_vector(6, 32);
449   const LLT V7S32 = LLT::fixed_vector(7, 32);
450   const LLT V8S32 = LLT::fixed_vector(8, 32);
451   const LLT V9S32 = LLT::fixed_vector(9, 32);
452   const LLT V10S32 = LLT::fixed_vector(10, 32);
453   const LLT V11S32 = LLT::fixed_vector(11, 32);
454   const LLT V12S32 = LLT::fixed_vector(12, 32);
455   const LLT V13S32 = LLT::fixed_vector(13, 32);
456   const LLT V14S32 = LLT::fixed_vector(14, 32);
457   const LLT V15S32 = LLT::fixed_vector(15, 32);
458   const LLT V16S32 = LLT::fixed_vector(16, 32);
459   const LLT V32S32 = LLT::fixed_vector(32, 32);
460 
461   const LLT V2S64 = LLT::fixed_vector(2, 64);
462   const LLT V3S64 = LLT::fixed_vector(3, 64);
463   const LLT V4S64 = LLT::fixed_vector(4, 64);
464   const LLT V5S64 = LLT::fixed_vector(5, 64);
465   const LLT V6S64 = LLT::fixed_vector(6, 64);
466   const LLT V7S64 = LLT::fixed_vector(7, 64);
467   const LLT V8S64 = LLT::fixed_vector(8, 64);
468   const LLT V16S64 = LLT::fixed_vector(16, 64);
469 
470   std::initializer_list<LLT> AllS32Vectors =
471     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
472      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
473   std::initializer_list<LLT> AllS64Vectors =
474     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
475 
476   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
477   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
478   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
479   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
480   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
481   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
482   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
483 
484   const LLT CodePtr = FlatPtr;
485 
486   const std::initializer_list<LLT> AddrSpaces64 = {
487     GlobalPtr, ConstantPtr, FlatPtr
488   };
489 
490   const std::initializer_list<LLT> AddrSpaces32 = {
491     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
492   };
493 
494   const std::initializer_list<LLT> FPTypesBase = {
495     S32, S64
496   };
497 
498   const std::initializer_list<LLT> FPTypes16 = {
499     S32, S64, S16
500   };
501 
502   const std::initializer_list<LLT> FPTypesPK16 = {
503     S32, S64, S16, V2S16
504   };
505 
506   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
507 
508   // s1 for VCC branches, s32 for SCC branches.
509   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
510 
511   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
512   // elements for v3s16
513   getActionDefinitionsBuilder(G_PHI)
514     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
515     .legalFor(AllS32Vectors)
516     .legalFor(AllS64Vectors)
517     .legalFor(AddrSpaces64)
518     .legalFor(AddrSpaces32)
519     .legalIf(isPointer(0))
520     .clampScalar(0, S16, S256)
521     .widenScalarToNextPow2(0, 32)
522     .clampMaxNumElements(0, S32, 16)
523     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
524     .scalarize(0);
525 
526   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
527     // Full set of gfx9 features.
528     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
529       .legalFor({S32, S16, V2S16})
530       .clampScalar(0, S16, S32)
531       .clampMaxNumElements(0, S16, 2)
532       .scalarize(0)
533       .widenScalarToNextPow2(0, 32);
534 
535     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
536       .legalFor({S32, S16, V2S16}) // Clamp modifier
537       .minScalarOrElt(0, S16)
538       .clampMaxNumElements(0, S16, 2)
539       .scalarize(0)
540       .widenScalarToNextPow2(0, 32)
541       .lower();
542   } else if (ST.has16BitInsts()) {
543     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
544       .legalFor({S32, S16})
545       .clampScalar(0, S16, S32)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
548 
549     // Technically the saturating operations require clamp bit support, but this
550     // was introduced at the same time as 16-bit operations.
551     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
552       .legalFor({S32, S16}) // Clamp modifier
553       .minScalar(0, S16)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 16)
556       .lower();
557 
558     // We're just lowering this, but it helps get a better result to try to
559     // coerce to the desired type first.
560     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
561       .minScalar(0, S16)
562       .scalarize(0)
563       .lower();
564   } else {
565     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
566       .legalFor({S32})
567       .clampScalar(0, S32, S32)
568       .scalarize(0);
569 
570     if (ST.hasIntClamp()) {
571       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
572         .legalFor({S32}) // Clamp modifier.
573         .scalarize(0)
574         .minScalarOrElt(0, S32)
575         .lower();
576     } else {
577       // Clamp bit support was added in VI, along with 16-bit operations.
578       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
579         .minScalar(0, S32)
580         .scalarize(0)
581         .lower();
582     }
583 
584     // FIXME: DAG expansion gets better results. The widening uses the smaller
585     // range values and goes for the min/max lowering directly.
586     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
587       .minScalar(0, S32)
588       .scalarize(0)
589       .lower();
590   }
591 
592   getActionDefinitionsBuilder(
593       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
594       .customFor({S32, S64})
595       .clampScalar(0, S32, S64)
596       .widenScalarToNextPow2(0, 32)
597       .scalarize(0);
598 
599   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
600                    .legalFor({S32})
601                    .maxScalarOrElt(0, S32);
602 
603   if (ST.hasVOP3PInsts()) {
604     Mulh
605       .clampMaxNumElements(0, S8, 2)
606       .lowerFor({V2S8});
607   }
608 
609   Mulh
610     .scalarize(0)
611     .lower();
612 
613   // Report legal for any types we can handle anywhere. For the cases only legal
614   // on the SALU, RegBankSelect will be able to re-legalize.
615   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
616     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
617     .clampScalar(0, S32, S64)
618     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
619     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
620     .widenScalarToNextPow2(0)
621     .scalarize(0);
622 
623   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
624                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
625     .legalFor({{S32, S1}, {S32, S32}})
626     .minScalar(0, S32)
627     // TODO: .scalarize(0)
628     .lower();
629 
630   getActionDefinitionsBuilder(G_BITCAST)
631     // Don't worry about the size constraint.
632     .legalIf(all(isRegisterType(0), isRegisterType(1)))
633     .lower();
634 
635 
636   getActionDefinitionsBuilder(G_CONSTANT)
637     .legalFor({S1, S32, S64, S16, GlobalPtr,
638                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
639     .legalIf(isPointer(0))
640     .clampScalar(0, S32, S64)
641     .widenScalarToNextPow2(0);
642 
643   getActionDefinitionsBuilder(G_FCONSTANT)
644     .legalFor({S32, S64, S16})
645     .clampScalar(0, S16, S64);
646 
647   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
648       .legalIf(isRegisterType(0))
649       // s1 and s16 are special cases because they have legal operations on
650       // them, but don't really occupy registers in the normal way.
651       .legalFor({S1, S16})
652       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
653       .clampScalarOrElt(0, S32, MaxScalar)
654       .widenScalarToNextPow2(0, 32)
655       .clampMaxNumElements(0, S32, 16);
656 
657   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
658 
659   // If the amount is divergent, we have to do a wave reduction to get the
660   // maximum value, so this is expanded during RegBankSelect.
661   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662     .legalFor({{PrivatePtr, S32}});
663 
664   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665     .customIf(typeIsNot(0, PrivatePtr));
666 
667   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
668 
669   auto &FPOpActions = getActionDefinitionsBuilder(
670     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671     .legalFor({S32, S64});
672   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673     .customFor({S32, S64});
674   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675     .customFor({S32, S64});
676 
677   if (ST.has16BitInsts()) {
678     if (ST.hasVOP3PInsts())
679       FPOpActions.legalFor({S16, V2S16});
680     else
681       FPOpActions.legalFor({S16});
682 
683     TrigActions.customFor({S16});
684     FDIVActions.customFor({S16});
685   }
686 
687   auto &MinNumMaxNum = getActionDefinitionsBuilder({
688       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690   if (ST.hasVOP3PInsts()) {
691     MinNumMaxNum.customFor(FPTypesPK16)
692       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693       .clampMaxNumElements(0, S16, 2)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else if (ST.has16BitInsts()) {
697     MinNumMaxNum.customFor(FPTypes16)
698       .clampScalar(0, S16, S64)
699       .scalarize(0);
700   } else {
701     MinNumMaxNum.customFor(FPTypesBase)
702       .clampScalar(0, S32, S64)
703       .scalarize(0);
704   }
705 
706   if (ST.hasVOP3PInsts())
707     FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709   FPOpActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   TrigActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   FDIVActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   getActionDefinitionsBuilder({G_FNEG, G_FABS})
722     .legalFor(FPTypesPK16)
723     .clampMaxNumElements(0, S16, 2)
724     .scalarize(0)
725     .clampScalar(0, S16, S64);
726 
727   if (ST.has16BitInsts()) {
728     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729       .legalFor({S32, S64, S16})
730       .scalarize(0)
731       .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735       .scalarize(0)
736       .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741         .legalFor({S32, S64})
742         .scalarize(0)
743         .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     }
750   }
751 
752   getActionDefinitionsBuilder(G_FPTRUNC)
753     .legalFor({{S32, S64}, {S16, S32}})
754     .scalarize(0)
755     .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760     .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763       // Use actual fsub instruction
764       .legalFor({S32})
765       // Must use fadd + fneg
766       .lowerFor({S64, S16, V2S16})
767       .scalarize(0)
768       .clampScalar(0, S32, S64);
769 
770   // Whether this is legal depends on the floating point mode for the function.
771   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773     FMad.customFor({S32, S16});
774   else if (ST.hasMadMacF32Insts())
775     FMad.customFor({S32});
776   else if (ST.hasMadF16())
777     FMad.customFor({S16});
778   FMad.scalarize(0)
779       .lower();
780 
781   auto &FRem = getActionDefinitionsBuilder(G_FREM);
782   if (ST.has16BitInsts()) {
783     FRem.customFor({S16, S32, S64});
784   } else {
785     FRem.minScalar(0, S32)
786         .customFor({S32, S64});
787   }
788   FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793     .legalFor({{V2S16, V2S32}})
794     .clampMaxNumElements(0, S16, 2)
795     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796     // situations (like an invalid implicit use), we don't want to infinite loop
797     // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803                {S32, S1}, {S64, S1}, {S16, S1}})
804     .scalarize(0)
805     .clampScalar(0, S32, S64)
806     .widenScalarToNextPow2(1, 32);
807 
808   // TODO: Split s1->s64 during regbankselect for VALU.
809   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811     .lowerFor({{S32, S64}})
812     .lowerIf(typeIs(1, S1))
813     .customFor({{S64, S64}});
814   if (ST.has16BitInsts())
815     IToFP.legalFor({{S16, S16}});
816   IToFP.clampScalar(1, S32, S64)
817        .minScalar(0, S32)
818        .scalarize(0)
819        .widenScalarToNextPow2(1);
820 
821   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823     .customFor({{S64, S32}, {S64, S64}})
824     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825   if (ST.has16BitInsts())
826     FPToI.legalFor({{S16, S16}});
827   else
828     FPToI.minScalar(1, S32);
829 
830   FPToI.minScalar(0, S32)
831        .widenScalarToNextPow2(0, 32)
832        .scalarize(0)
833        .lower();
834 
835   // Lower roundeven into G_FRINT
836   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837     .scalarize(0)
838     .lower();
839 
840   if (ST.has16BitInsts()) {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S16, S32, S64})
843       .clampScalar(0, S16, S64)
844       .scalarize(0);
845   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S32, S64})
848       .clampScalar(0, S32, S64)
849       .scalarize(0);
850   } else {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32})
853       .customFor({S64})
854       .clampScalar(0, S32, S64)
855       .scalarize(0);
856   }
857 
858   getActionDefinitionsBuilder(G_PTR_ADD)
859     .legalIf(all(isPointer(0), sameSize(0, 1)))
860     .scalarize(0)
861     .scalarSameSizeAs(1, 0);
862 
863   getActionDefinitionsBuilder(G_PTRMASK)
864     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865     .scalarSameSizeAs(1, 0)
866     .scalarize(0);
867 
868   auto &CmpBuilder =
869     getActionDefinitionsBuilder(G_ICMP)
870     // The compare output type differs based on the register bank of the output,
871     // so make both s1 and s32 legal.
872     //
873     // Scalar compares producing output in scc will be promoted to s32, as that
874     // is the allocatable register type that will be needed for the copy from
875     // scc. This will be promoted during RegBankSelect, and we assume something
876     // before that won't try to use s32 result types.
877     //
878     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879     // bank.
880     .legalForCartesianProduct(
881       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882     .legalForCartesianProduct(
883       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884   if (ST.has16BitInsts()) {
885     CmpBuilder.legalFor({{S1, S16}});
886   }
887 
888   CmpBuilder
889     .widenScalarToNextPow2(1)
890     .clampScalar(1, S32, S64)
891     .scalarize(0)
892     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894   getActionDefinitionsBuilder(G_FCMP)
895     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0);
899 
900   // FIXME: fpow has a selection pattern that should move to custom lowering.
901   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902   if (ST.has16BitInsts())
903     Exp2Ops.legalFor({S32, S16});
904   else
905     Exp2Ops.legalFor({S32});
906   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907   Exp2Ops.scalarize(0);
908 
909   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910   if (ST.has16BitInsts())
911     ExpOps.customFor({{S32}, {S16}});
912   else
913     ExpOps.customFor({S32});
914   ExpOps.clampScalar(0, MinScalarFPTy, S32)
915         .scalarize(0);
916 
917   getActionDefinitionsBuilder(G_FPOWI)
918     .clampScalar(0, MinScalarFPTy, S32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder(G_CTPOP)
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   // The hardware instructions return a different result on 0 than the generic
931   // instructions expect. The hardware produces -1, but these produce the
932   // bitwidth.
933   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934     .scalarize(0)
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32)
939     .lower();
940 
941   // The 64-bit versions produce 32-bit results, but only on the SALU.
942   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943     .legalFor({{S32, S32}, {S32, S64}})
944     .clampScalar(0, S32, S32)
945     .clampScalar(1, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0, 32)
948     .widenScalarToNextPow2(1, 32);
949 
950   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951   // RegBankSelect.
952   getActionDefinitionsBuilder(G_BITREVERSE)
953     .legalFor({S32, S64})
954     .clampScalar(0, S32, S64)
955     .scalarize(0)
956     .widenScalarToNextPow2(0);
957 
958   if (ST.has16BitInsts()) {
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S16, S32, V2S16})
961       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .clampScalar(0, S16, S32)
966       .scalarize(0);
967 
968     if (ST.hasVOP3PInsts()) {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970         .legalFor({S32, S16, V2S16})
971         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972         .clampMaxNumElements(0, S16, 2)
973         .minScalar(0, S16)
974         .widenScalarToNextPow2(0)
975         .scalarize(0)
976         .lower();
977     } else {
978       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979         .legalFor({S32, S16})
980         .widenScalarToNextPow2(0)
981         .minScalar(0, S16)
982         .scalarize(0)
983         .lower();
984     }
985   } else {
986     // TODO: Should have same legality without v_perm_b32
987     getActionDefinitionsBuilder(G_BSWAP)
988       .legalFor({S32})
989       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
991       // narrowScalar limitation.
992       .widenScalarToNextPow2(0)
993       .maxScalar(0, S32)
994       .scalarize(0)
995       .lower();
996 
997     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998       .legalFor({S32})
999       .minScalar(0, S32)
1000       .widenScalarToNextPow2(0)
1001       .scalarize(0)
1002       .lower();
1003   }
1004 
1005   getActionDefinitionsBuilder(G_INTTOPTR)
1006     // List the common cases
1007     .legalForCartesianProduct(AddrSpaces64, {S64})
1008     .legalForCartesianProduct(AddrSpaces32, {S32})
1009     .scalarize(0)
1010     // Accept any address space as long as the size matches
1011     .legalIf(sameSize(0, 1))
1012     .widenScalarIf(smallerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       })
1016     .narrowScalarIf(largerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       });
1020 
1021   getActionDefinitionsBuilder(G_PTRTOINT)
1022     // List the common cases
1023     .legalForCartesianProduct(AddrSpaces64, {S64})
1024     .legalForCartesianProduct(AddrSpaces32, {S32})
1025     .scalarize(0)
1026     // Accept any address space as long as the size matches
1027     .legalIf(sameSize(0, 1))
1028     .widenScalarIf(smallerThan(0, 1),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031       })
1032     .narrowScalarIf(
1033       largerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       });
1037 
1038   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039     .scalarize(0)
1040     .custom();
1041 
1042   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043                                     bool IsLoad) -> bool {
1044     const LLT DstTy = Query.Types[0];
1045 
1046     // Split vector extloads.
1047     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050     if (MemSize < DstTy.getSizeInBits())
1051       MemSize = std::max(MemSize, AlignBits);
1052 
1053     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054       return true;
1055 
1056     const LLT PtrTy = Query.Types[1];
1057     unsigned AS = PtrTy.getAddressSpace();
1058     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059       return true;
1060 
1061     // Catch weird sized loads that don't evenly divide into the access sizes
1062     // TODO: May be able to widen depending on alignment etc.
1063     unsigned NumRegs = (MemSize + 31) / 32;
1064     if (NumRegs == 3) {
1065       if (!ST.hasDwordx3LoadStores())
1066         return true;
1067     } else {
1068       // If the alignment allows, these should have been widened.
1069       if (!isPowerOf2_32(NumRegs))
1070         return true;
1071     }
1072 
1073     if (AlignBits < MemSize) {
1074       const SITargetLowering *TLI = ST.getTargetLowering();
1075       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076                                                       Align(AlignBits / 8));
1077     }
1078 
1079     return false;
1080   };
1081 
1082   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
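  // An alignment value of 0 in the memory descriptors below imposes no minimum
  // alignment requirement on the memory operand.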
1085 
1086   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087   // LDS
1088   // TODO: Unsupported flat for SI.
1089 
1090   for (unsigned Op : {G_LOAD, G_STORE}) {
1091     const bool IsStore = Op == G_STORE;
1092 
1093     auto &Actions = getActionDefinitionsBuilder(Op);
1094     // Explicitly list some common cases.
1095     // TODO: Does this help compile time at all?
1096     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1098                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1099                                       {S64, GlobalPtr, 64, GlobalAlign32},
1100                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1101                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1102                                       {S32, GlobalPtr, 8, GlobalAlign8},
1103                                       {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105                                       {S32, LocalPtr, 32, 32},
1106                                       {S64, LocalPtr, 64, 32},
1107                                       {V2S32, LocalPtr, 64, 32},
1108                                       {S32, LocalPtr, 8, 8},
1109                                       {S32, LocalPtr, 16, 16},
1110                                       {V2S16, LocalPtr, 32, 32},
1111 
1112                                       {S32, PrivatePtr, 32, 32},
1113                                       {S32, PrivatePtr, 8, 8},
1114                                       {S32, PrivatePtr, 16, 16},
1115                                       {V2S16, PrivatePtr, 32, 32},
1116 
1117                                       {S32, ConstantPtr, 32, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1119                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1120                                       {S64, ConstantPtr, 64, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122     Actions.legalIf(
1123       [=](const LegalityQuery &Query) -> bool {
1124         return isLoadStoreLegal(ST, Query);
1125       });
1126 
1127     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128     // 64-bits.
1129     //
1130     // TODO: Should generalize bitcast action into coerce, which will also cover
1131     // inserting addrspacecasts.
1132     Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134     // Turn any illegal element vectors into something easier to deal
1135     // with. These will ultimately produce 32-bit scalar shifts to extract the
1136     // parts anyway.
1137     //
1138     // For odd 16-bit element vectors, prefer to split those into pieces with
1139     // 16-bit vector parts.
1140     Actions.bitcastIf(
1141       [=](const LegalityQuery &Query) -> bool {
1142         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143                                           Query.MMODescrs[0].SizeInBits);
1144       }, bitcastToRegisterType(0));
1145 
1146     if (!IsStore) {
1147       // Widen suitably aligned loads by loading extra bytes. The standard
1148       // legalization actions can't properly express widening memory operands.
1149       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150         return shouldWidenLoad(ST, Query, G_LOAD);
1151       });
1152     }
1153 
1154     // FIXME: load/store narrowing should be moved to lower action
1155     Actions
1156         .narrowScalarIf(
1157             [=](const LegalityQuery &Query) -> bool {
1158               return !Query.Types[0].isVector() &&
1159                      needToSplitMemOp(Query, Op == G_LOAD);
1160             },
1161             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162               const LLT DstTy = Query.Types[0];
1163               const LLT PtrTy = Query.Types[1];
1164 
1165               const unsigned DstSize = DstTy.getSizeInBits();
1166               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168               // Split extloads.
1169               if (DstSize > MemSize)
1170                 return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172               if (!isPowerOf2_32(DstSize)) {
1173                 // We're probably decomposing an odd sized store. Try to split
1174                 // to the widest type. TODO: Account for alignment. As-is it
1175                 // should be OK, since the new parts will be further legalized.
1176                 unsigned FloorSize = PowerOf2Floor(DstSize);
1177                 return std::make_pair(0, LLT::scalar(FloorSize));
1178               }
1179 
1180               if (DstSize > 32 && (DstSize % 32 != 0)) {
1181                 // FIXME: Need a way to specify non-extload of larger size if
1182                 // suitably aligned.
1183                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184               }
1185 
1186               unsigned MaxSize = maxSizeForAddrSpace(ST,
1187                                                      PtrTy.getAddressSpace(),
1188                                                      Op == G_LOAD);
1189               if (MemSize > MaxSize)
1190                 return std::make_pair(0, LLT::scalar(MaxSize));
1191 
1192               unsigned Align = Query.MMODescrs[0].AlignInBits;
1193               return std::make_pair(0, LLT::scalar(Align));
1194             })
1195         .fewerElementsIf(
1196             [=](const LegalityQuery &Query) -> bool {
1197               return Query.Types[0].isVector() &&
1198                      needToSplitMemOp(Query, Op == G_LOAD);
1199             },
1200             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201               const LLT DstTy = Query.Types[0];
1202               const LLT PtrTy = Query.Types[1];
1203 
1204               LLT EltTy = DstTy.getElementType();
1205               unsigned MaxSize = maxSizeForAddrSpace(ST,
1206                                                      PtrTy.getAddressSpace(),
1207                                                      Op == G_LOAD);
1208 
1209               // FIXME: Handle widened to power of 2 results better. This ends
1210               // up scalarizing.
1211               // FIXME: 3 element stores scalarized on SI
1212 
1213               // Split if it's too large for the address space.
1214               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215                 unsigned NumElts = DstTy.getNumElements();
1216                 unsigned EltSize = EltTy.getSizeInBits();
1217 
1218                 if (MaxSize % EltSize == 0) {
1219                   return std::make_pair(
1220                       0, LLT::scalarOrVector(
1221                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1222                 }
1223 
1224                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1225 
1226                 // FIXME: Refine when odd breakdowns handled
1227                 // The scalars will need to be re-legalized.
1228                 if (NumPieces == 1 || NumPieces >= NumElts ||
1229                     NumElts % NumPieces != 0)
1230                   return std::make_pair(0, EltTy);
1231 
1232                 return std::make_pair(
1233                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1234               }
1235 
1236               // FIXME: We could probably handle weird extending loads better.
1237               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1238               if (DstTy.getSizeInBits() > MemSize)
1239                 return std::make_pair(0, EltTy);
1240 
1241               unsigned EltSize = EltTy.getSizeInBits();
1242               unsigned DstSize = DstTy.getSizeInBits();
1243               if (!isPowerOf2_32(DstSize)) {
1244                 // We're probably decomposing an odd sized store. Try to split
1245                 // to the widest type. TODO: Account for alignment. As-is it
1246                 // should be OK, since the new parts will be further legalized.
1247                 unsigned FloorSize = PowerOf2Floor(DstSize);
1248                 return std::make_pair(
1249                     0, LLT::scalarOrVector(
1250                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1251               }
1252 
1253               // Need to split because of alignment.
1254               unsigned Align = Query.MMODescrs[0].AlignInBits;
1255               if (EltSize > Align &&
1256                   (EltSize / Align < DstTy.getNumElements())) {
1257                 return std::make_pair(
1258                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1259               }
1260 
1261               // May need relegalization for the scalars.
1262               return std::make_pair(0, EltTy);
1263             })
1264     .lowerIfMemSizeNotPow2()
1265     .minScalar(0, S32)
1266     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1267     .widenScalarToNextPow2(0)
1268     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1269     .lower();
1270   }
1271 
1272   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1273                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1274                                                   {S32, GlobalPtr, 16, 2 * 8},
1275                                                   {S32, LocalPtr, 8, 8},
1276                                                   {S32, LocalPtr, 16, 16},
1277                                                   {S32, PrivatePtr, 8, 8},
1278                                                   {S32, PrivatePtr, 16, 16},
1279                                                   {S32, ConstantPtr, 8, 8},
1280                                                   {S32, ConstantPtr, 16, 2 * 8}})
1281                        .legalIf(
1282                          [=](const LegalityQuery &Query) -> bool {
1283                            return isLoadStoreLegal(ST, Query);
1284                          });
1285 
1286   if (ST.hasFlatAddressSpace()) {
1287     ExtLoads.legalForTypesWithMemDesc(
1288         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1289   }
1290 
1291   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1292   // 64-bits.
1293   //
1294   // TODO: Should generalize bitcast action into coerce, which will also cover
1295   // inserting addrspacecasts.
1296   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1297 
1298   ExtLoads.clampScalar(0, S32, S32)
1299           .widenScalarToNextPow2(0)
1300           .unsupportedIfMemSizeNotPow2()
1301           .lower();
1302 
1303   auto &Atomics = getActionDefinitionsBuilder(
1304     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1305      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1306      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1307      G_ATOMICRMW_UMIN})
1308     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1309                {S64, GlobalPtr}, {S64, LocalPtr},
1310                {S32, RegionPtr}, {S64, RegionPtr}});
1311   if (ST.hasFlatAddressSpace()) {
1312     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1313   }
1314 
1315   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1316   if (ST.hasLDSFPAtomics()) {
1317     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1318     if (ST.hasGFX90AInsts())
1319       Atomic.legalFor({{S64, LocalPtr}});
1320   }
1321   if (ST.hasAtomicFaddInsts())
1322     Atomic.legalFor({{S32, GlobalPtr}});
1323 
1324   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1325   // demarshalling
1326   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1327     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1328                 {S32, FlatPtr}, {S64, FlatPtr}})
1329     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1330                {S32, RegionPtr}, {S64, RegionPtr}});
1331   // TODO: Pointer types, any 32-bit or 64-bit vector
1332 
1333   // Condition should be s32 for scalar, s1 for vector.
1334   getActionDefinitionsBuilder(G_SELECT)
1335       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1336                                  LocalPtr, FlatPtr, PrivatePtr,
1337                                  LLT::fixed_vector(2, LocalPtr),
1338                                  LLT::fixed_vector(2, PrivatePtr)},
1339                                 {S1, S32})
1340       .clampScalar(0, S16, S64)
1341       .scalarize(1)
1342       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1343       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1344       .clampMaxNumElements(0, S32, 2)
1345       .clampMaxNumElements(0, LocalPtr, 2)
1346       .clampMaxNumElements(0, PrivatePtr, 2)
1347       .scalarize(0)
1348       .widenScalarToNextPow2(0)
1349       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1350 
1351   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1352   // be more flexible with the shift amount type.
1353   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1354     .legalFor({{S32, S32}, {S64, S32}});
1355   if (ST.has16BitInsts()) {
1356     if (ST.hasVOP3PInsts()) {
1357       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1358             .clampMaxNumElements(0, S16, 2);
1359     } else
1360       Shifts.legalFor({{S16, S16}});
1361 
1362     // TODO: Support 16-bit shift amounts for all types
1363     Shifts.widenScalarIf(
1364       [=](const LegalityQuery &Query) {
1365         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1366         // 32-bit amount.
1367         const LLT ValTy = Query.Types[0];
1368         const LLT AmountTy = Query.Types[1];
1369         return ValTy.getSizeInBits() <= 16 &&
1370                AmountTy.getSizeInBits() < 16;
1371       }, changeTo(1, S16));
1372     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1373     Shifts.clampScalar(1, S32, S32);
1374     Shifts.clampScalar(0, S16, S64);
1375     Shifts.widenScalarToNextPow2(0, 16);
1376 
1377     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1378       .minScalar(0, S16)
1379       .scalarize(0)
1380       .lower();
1381   } else {
1382     // Make sure we legalize the shift amount type first, as the general
1383     // expansion for the shifted type will produce much worse code if it hasn't
1384     // been truncated already.
1385     Shifts.clampScalar(1, S32, S32);
1386     Shifts.clampScalar(0, S32, S64);
1387     Shifts.widenScalarToNextPow2(0, 32);
1388 
1389     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1390       .minScalar(0, S32)
1391       .scalarize(0)
1392       .lower();
1393   }
1394   Shifts.scalarize(0);
1395 
1396   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1397     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1398     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1399     unsigned IdxTypeIdx = 2;
1400 
1401     getActionDefinitionsBuilder(Op)
1402       .customIf([=](const LegalityQuery &Query) {
1403           const LLT EltTy = Query.Types[EltTypeIdx];
1404           const LLT VecTy = Query.Types[VecTypeIdx];
1405           const LLT IdxTy = Query.Types[IdxTypeIdx];
1406           const unsigned EltSize = EltTy.getSizeInBits();
1407           return (EltSize == 32 || EltSize == 64) &&
1408                   VecTy.getSizeInBits() % 32 == 0 &&
1409                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1410                   IdxTy.getSizeInBits() == 32;
1411         })
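      // Bitcast vectors of sub-32-bit elements (whose total size is a
      // multiple of 32 bits) to vectors of 32-bit elements,
      // e.g. <4 x s16> -> <2 x s32>.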
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1414       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1415       .bitcastIf(
1416         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1417         [=](const LegalityQuery &Query) {
1418           // For > 64-bit element types, try to turn this into a 64-bit
1419           // element vector since we may be able to do better indexing
1420           // if this is scalar. If not, fall back to 32.
1421           const LLT EltTy = Query.Types[EltTypeIdx];
1422           const LLT VecTy = Query.Types[VecTypeIdx];
1423           const unsigned DstEltSize = EltTy.getSizeInBits();
1424           const unsigned VecSize = VecTy.getSizeInBits();
1425 
1426           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1427           return std::make_pair(
1428               VecTypeIdx,
1429               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1430         })
1431       .clampScalar(EltTypeIdx, S32, S64)
1432       .clampScalar(VecTypeIdx, S32, S64)
1433       .clampScalar(IdxTypeIdx, S32, S32)
1434       .clampMaxNumElements(VecTypeIdx, S32, 32)
1435       // TODO: Clamp elements for 64-bit vectors?
1436       // It should only be necessary with variable indexes.
1437       // As a last resort, lower to the stack
1438       .lower();
1439   }
1440 
1441   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1442     .unsupportedIf([=](const LegalityQuery &Query) {
1443         const LLT &EltTy = Query.Types[1].getElementType();
1444         return Query.Types[0] != EltTy;
1445       });
1446 
1447   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1448     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1449     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1450 
1451     // FIXME: Doesn't handle extract of illegal sizes.
1452     getActionDefinitionsBuilder(Op)
1453       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1454       // FIXME: Multiples of 16 should not be legal.
1455       .legalIf([=](const LegalityQuery &Query) {
1456           const LLT BigTy = Query.Types[BigTyIdx];
1457           const LLT LitTy = Query.Types[LitTyIdx];
1458           return (BigTy.getSizeInBits() % 32 == 0) &&
1459                  (LitTy.getSizeInBits() % 16 == 0);
1460         })
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT BigTy = Query.Types[BigTyIdx];
1464           return (BigTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1467       .widenScalarIf(
1468         [=](const LegalityQuery &Query) {
1469           const LLT LitTy = Query.Types[LitTyIdx];
1470           return (LitTy.getScalarSizeInBits() < 16);
1471         },
1472         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1473       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1474       .widenScalarToNextPow2(BigTyIdx, 32);
1475 
1476   }
1477 
1478   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1479     .legalForCartesianProduct(AllS32Vectors, {S32})
1480     .legalForCartesianProduct(AllS64Vectors, {S64})
1481     .clampNumElements(0, V16S32, V32S32)
1482     .clampNumElements(0, V2S64, V16S64)
1483     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1484 
1485   if (ST.hasScalarPackInsts()) {
1486     BuildVector
1487       // FIXME: Should probably widen s1 vectors straight to s32
1488       .minScalarOrElt(0, S16)
1489       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1490       .minScalar(1, S32);
1491 
1492     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1493       .legalFor({V2S16, S32})
1494       .lower();
1495     BuildVector.minScalarOrElt(0, S32);
1496   } else {
1497     BuildVector.customFor({V2S16, S16});
1498     BuildVector.minScalarOrElt(0, S32);
1499 
1500     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1501       .customFor({V2S16, S32})
1502       .lower();
1503   }
1504 
1505   BuildVector.legalIf(isRegisterType(0));
1506 
1507   // FIXME: Clamp maximum size
1508   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1509     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1510     .clampMaxNumElements(0, S32, 32)
1511     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1512     .clampMaxNumElements(0, S16, 64);
1513 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1516   if (ST.hasVOP3PInsts()) {
1517     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1518       .customFor({V2S16, V2S16})
1519       .lower();
1520   } else
1521     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1522 
1523   // Merge/Unmerge
1524   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1525     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1526     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1527 
1528     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1529       const LLT Ty = Query.Types[TypeIdx];
1530       if (Ty.isVector()) {
1531         const LLT &EltTy = Ty.getElementType();
1532         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1533           return true;
1534         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1535           return true;
1536       }
1537       return false;
1538     };
1539 
1540     auto &Builder = getActionDefinitionsBuilder(Op)
1541       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1542       .lowerFor({{S16, V2S16}})
1543       .lowerIf([=](const LegalityQuery &Query) {
1544           const LLT BigTy = Query.Types[BigTyIdx];
1545           return BigTy.getSizeInBits() == 32;
1546         })
1547       // Try to widen to s16 first for small types.
1548       // TODO: Only do this on targets with legal s16 shifts
1549       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1550       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1551       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1552       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1553                            elementTypeIs(1, S16)),
1554                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1558       .clampScalar(LitTyIdx, S32, S512)
1559       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1560       // Break up vectors with weird elements into scalars
1561       .fewerElementsIf(
1562         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1563         scalarize(0))
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1566         scalarize(1))
1567       .clampScalar(BigTyIdx, S32, MaxScalar);
1568 
1569     if (Op == G_MERGE_VALUES) {
1570       Builder.widenScalarIf(
1571         // TODO: Use 16-bit shifts if legal for 8-bit values?
1572         [=](const LegalityQuery &Query) {
1573           const LLT Ty = Query.Types[LitTyIdx];
1574           return Ty.getSizeInBits() < 32;
1575         },
1576         changeTo(LitTyIdx, S32));
1577     }
1578 
1579     Builder.widenScalarIf(
1580       [=](const LegalityQuery &Query) {
1581         const LLT Ty = Query.Types[BigTyIdx];
1582         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1583           Ty.getSizeInBits() % 16 != 0;
1584       },
1585       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1588         const LLT &Ty = Query.Types[BigTyIdx];
1589         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1590         if (NewSizeInBits >= 256) {
1591           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1592           if (RoundedTo < NewSizeInBits)
1593             NewSizeInBits = RoundedTo;
1594         }
1595         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1596       })
1597       // Any vectors left are the wrong size. Scalarize them.
1598       .scalarize(0)
1599       .scalarize(1);
1600   }
1601 
1602   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1603   // RegBankSelect.
1604   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1605     .legalFor({{S32}, {S64}});
1606 
1607   if (ST.hasVOP3PInsts()) {
1608     SextInReg.lowerFor({{V2S16}})
1609       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1610       // get more vector shift opportunities, since we'll get those when
1611       // expanded.
1612       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1613   } else if (ST.has16BitInsts()) {
1614     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1615   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1618     SextInReg.lowerFor({{S32}, {S64}});
1619   }
1620 
1621   SextInReg
1622     .scalarize(0)
1623     .clampScalar(0, S32, S64)
1624     .lower();
1625 
  // TODO: Only try to form v2s16 with legal packed instructions.
1627   getActionDefinitionsBuilder(G_FSHR)
1628     .legalFor({{S32, S32}})
1629     .lowerFor({{V2S16, V2S16}})
1630     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1631     .scalarize(0)
1632     .lower();
1633 
1634   if (ST.hasVOP3PInsts()) {
1635     getActionDefinitionsBuilder(G_FSHL)
1636       .lowerFor({{V2S16, V2S16}})
1637       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1638       .scalarize(0)
1639       .lower();
1640   } else {
1641     getActionDefinitionsBuilder(G_FSHL)
1642       .scalarize(0)
1643       .lower();
1644   }
1645 
1646   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1647     .legalFor({S64});
1648 
1649   getActionDefinitionsBuilder(G_FENCE)
1650     .alwaysLegal();
1651 
1652   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1653       .scalarize(0)
1654       .minScalar(0, S32)
1655       .lower();
1656 
1657   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1658       .legalFor({{S32, S32}, {S64, S32}})
1659       .clampScalar(1, S32, S32)
1660       .clampScalar(0, S32, S64)
1661       .widenScalarToNextPow2(0)
1662       .scalarize(0);
1663 
1664   getActionDefinitionsBuilder({
1665       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1666       G_FCOPYSIGN,
1667 
1668       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1669       G_ATOMICRMW_NAND,
1670       G_ATOMICRMW_FSUB,
1671       G_READ_REGISTER,
1672       G_WRITE_REGISTER,
1673 
1674       G_SADDO, G_SSUBO,
1675 
      // TODO: Implement
1677       G_FMINIMUM, G_FMAXIMUM}).lower();
1678 
1679   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1680         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1681         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1682     .unsupported();
1683 
1684   getLegacyLegalizerInfo().computeTables();
1685   verify(*ST.getInstrInfo());
1686 }
1687 
1688 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1689                                          MachineInstr &MI) const {
1690   MachineIRBuilder &B = Helper.MIRBuilder;
1691   MachineRegisterInfo &MRI = *B.getMRI();
1692 
1693   switch (MI.getOpcode()) {
1694   case TargetOpcode::G_ADDRSPACE_CAST:
1695     return legalizeAddrSpaceCast(MI, MRI, B);
1696   case TargetOpcode::G_FRINT:
1697     return legalizeFrint(MI, MRI, B);
1698   case TargetOpcode::G_FCEIL:
1699     return legalizeFceil(MI, MRI, B);
1700   case TargetOpcode::G_FREM:
1701     return legalizeFrem(MI, MRI, B);
1702   case TargetOpcode::G_INTRINSIC_TRUNC:
1703     return legalizeIntrinsicTrunc(MI, MRI, B);
1704   case TargetOpcode::G_SITOFP:
1705     return legalizeITOFP(MI, MRI, B, true);
1706   case TargetOpcode::G_UITOFP:
1707     return legalizeITOFP(MI, MRI, B, false);
1708   case TargetOpcode::G_FPTOSI:
1709     return legalizeFPTOI(MI, MRI, B, true);
1710   case TargetOpcode::G_FPTOUI:
1711     return legalizeFPTOI(MI, MRI, B, false);
1712   case TargetOpcode::G_FMINNUM:
1713   case TargetOpcode::G_FMAXNUM:
1714   case TargetOpcode::G_FMINNUM_IEEE:
1715   case TargetOpcode::G_FMAXNUM_IEEE:
1716     return legalizeMinNumMaxNum(Helper, MI);
1717   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1718     return legalizeExtractVectorElt(MI, MRI, B);
1719   case TargetOpcode::G_INSERT_VECTOR_ELT:
1720     return legalizeInsertVectorElt(MI, MRI, B);
1721   case TargetOpcode::G_SHUFFLE_VECTOR:
1722     return legalizeShuffleVector(MI, MRI, B);
1723   case TargetOpcode::G_FSIN:
1724   case TargetOpcode::G_FCOS:
1725     return legalizeSinCos(MI, MRI, B);
1726   case TargetOpcode::G_GLOBAL_VALUE:
1727     return legalizeGlobalValue(MI, MRI, B);
1728   case TargetOpcode::G_LOAD:
1729   case TargetOpcode::G_SEXTLOAD:
1730   case TargetOpcode::G_ZEXTLOAD:
1731     return legalizeLoad(Helper, MI);
1732   case TargetOpcode::G_FMAD:
1733     return legalizeFMad(MI, MRI, B);
1734   case TargetOpcode::G_FDIV:
1735     return legalizeFDIV(MI, MRI, B);
1736   case TargetOpcode::G_UDIV:
1737   case TargetOpcode::G_UREM:
1738   case TargetOpcode::G_UDIVREM:
1739     return legalizeUnsignedDIV_REM(MI, MRI, B);
1740   case TargetOpcode::G_SDIV:
1741   case TargetOpcode::G_SREM:
1742   case TargetOpcode::G_SDIVREM:
1743     return legalizeSignedDIV_REM(MI, MRI, B);
1744   case TargetOpcode::G_ATOMIC_CMPXCHG:
1745     return legalizeAtomicCmpXChg(MI, MRI, B);
1746   case TargetOpcode::G_FLOG:
1747     return legalizeFlog(MI, B, numbers::ln2f);
1748   case TargetOpcode::G_FLOG10:
1749     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1750   case TargetOpcode::G_FEXP:
1751     return legalizeFExp(MI, B);
1752   case TargetOpcode::G_FPOW:
1753     return legalizeFPow(MI, B);
1754   case TargetOpcode::G_FFLOOR:
1755     return legalizeFFloor(MI, MRI, B);
1756   case TargetOpcode::G_BUILD_VECTOR:
1757     return legalizeBuildVector(MI, MRI, B);
1758   default:
1759     return false;
1760   }
1761 
1762   llvm_unreachable("expected switch to return");
1763 }
1764 
1765 Register AMDGPULegalizerInfo::getSegmentAperture(
1766   unsigned AS,
1767   MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   MachineFunction &MF = B.getMF();
1770   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1771   const LLT S32 = LLT::scalar(32);
1772 
1773   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1774 
1775   if (ST.hasApertureRegs()) {
1776     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1777     // getreg.
1778     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1779         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1780         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1781     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1782         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1783         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1784     unsigned Encoding =
1785         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1786         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1787         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1788 
1789     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1790 
1791     B.buildInstr(AMDGPU::S_GETREG_B32)
1792       .addDef(GetReg)
1793       .addImm(Encoding);
1794     MRI.setType(GetReg, S32);
1795 
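    // The hwreg field holds the upper bits of the aperture base; shifting it
    // left by the field width is assumed to reconstruct the high 32 bits of
    // the 64-bit aperture address.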
1796     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1797     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1798   }
1799 
1800   Register QueuePtr = MRI.createGenericVirtualRegister(
1801     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1802 
1803   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1804     return Register();
1805 
1806   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1807   // private_segment_aperture_base_hi.
1808   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1809 
1810   // TODO: can we be smarter about machine pointer info?
1811   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1812   MachineMemOperand *MMO = MF.getMachineMemOperand(
1813       PtrInfo,
1814       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1815           MachineMemOperand::MOInvariant,
1816       4, commonAlignment(Align(64), StructOffset));
1817 
1818   Register LoadAddr;
1819 
1820   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1821   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1822 }
1823 
1824 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1825   MachineInstr &MI, MachineRegisterInfo &MRI,
1826   MachineIRBuilder &B) const {
1827   MachineFunction &MF = B.getMF();
1828 
1829   const LLT S32 = LLT::scalar(32);
1830   Register Dst = MI.getOperand(0).getReg();
1831   Register Src = MI.getOperand(1).getReg();
1832 
1833   LLT DstTy = MRI.getType(Dst);
1834   LLT SrcTy = MRI.getType(Src);
1835   unsigned DestAS = DstTy.getAddressSpace();
1836   unsigned SrcAS = SrcTy.getAddressSpace();
1837 
1838   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1839   // vector element.
1840   assert(!DstTy.isVector());
1841 
1842   const AMDGPUTargetMachine &TM
1843     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1844 
1845   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1846     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1847     return true;
1848   }
1849 
1850   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1851     // Truncate.
1852     B.buildExtract(Dst, Src, 0);
1853     MI.eraseFromParent();
1854     return true;
1855   }
1856 
1857   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1858     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1859     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1860 
1861     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1862     // another. Merge operands are required to be the same type, but creating an
1863     // extra ptrtoint would be kind of pointless.
1864     auto HighAddr = B.buildConstant(
1865       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1866     B.buildMerge(Dst, {Src, HighAddr});
1867     MI.eraseFromParent();
1868     return true;
1869   }
1870 
1871   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1872     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1873            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1874     unsigned NullVal = TM.getNullPointerValue(DestAS);
1875 
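    // Casting flat -> local/private keeps the low 32 bits of the pointer,
    // except that a null flat pointer must map to the segment null value.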
1876     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1877     auto FlatNull = B.buildConstant(SrcTy, 0);
1878 
1879     // Extract low 32-bits of the pointer.
1880     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1881 
1882     auto CmpRes =
1883         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1884     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1885 
1886     MI.eraseFromParent();
1887     return true;
1888   }
1889 
1890   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1891     return false;
1892 
1893   if (!ST.hasFlatAddressSpace())
1894     return false;
1895 
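  // Casting local/private -> flat builds the 64-bit flat pointer from the
  // 32-bit segment offset and the aperture high half, with the segment null
  // value mapped to the flat null pointer.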
1896   auto SegmentNull =
1897       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1898   auto FlatNull =
1899       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1900 
1901   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1902   if (!ApertureReg.isValid())
1903     return false;
1904 
1905   auto CmpRes =
1906       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1907 
1908   // Coerce the type of the low half of the result so we can use merge_values.
1909   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1910 
1911   // TODO: Should we allow mismatched types but matching sizes in merges to
1912   // avoid the ptrtoint?
1913   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1914   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1915 
1916   MI.eraseFromParent();
1917   return true;
1918 }
1919 
1920 bool AMDGPULegalizerInfo::legalizeFrint(
1921   MachineInstr &MI, MachineRegisterInfo &MRI,
1922   MachineIRBuilder &B) const {
1923   Register Src = MI.getOperand(1).getReg();
1924   LLT Ty = MRI.getType(Src);
1925   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1926 
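  // Adding and then subtracting copysign(2^52, src) rounds src to the nearest
  // integer in the current rounding mode. Magnitudes of 2^52 and above are
  // already integral and are passed through unchanged.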
1927   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1928   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1929 
1930   auto C1 = B.buildFConstant(Ty, C1Val);
1931   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1932 
1933   // TODO: Should this propagate fast-math-flags?
1934   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1935   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1936 
1937   auto C2 = B.buildFConstant(Ty, C2Val);
1938   auto Fabs = B.buildFAbs(Ty, Src);
1939 
1940   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1941   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1942   MI.eraseFromParent();
1943   return true;
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeFceil(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949 
1950   const LLT S1 = LLT::scalar(1);
1951   const LLT S64 = LLT::scalar(64);
1952 
1953   Register Src = MI.getOperand(1).getReg();
1954   assert(MRI.getType(Src) == S64);
1955 
1956   // result = trunc(src)
1957   // if (src > 0.0 && src != result)
1958   //   result += 1.0
1959 
1960   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1961 
1962   const auto Zero = B.buildFConstant(S64, 0.0);
1963   const auto One = B.buildFConstant(S64, 1.0);
1964   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1965   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1966   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1967   auto Add = B.buildSelect(S64, And, One, Zero);
1968 
1969   // TODO: Should this propagate fast-math-flags?
1970   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1971   return true;
1972 }
1973 
1974 bool AMDGPULegalizerInfo::legalizeFrem(
1975   MachineInstr &MI, MachineRegisterInfo &MRI,
1976   MachineIRBuilder &B) const {
1977     Register DstReg = MI.getOperand(0).getReg();
1978     Register Src0Reg = MI.getOperand(1).getReg();
1979     Register Src1Reg = MI.getOperand(2).getReg();
1980     auto Flags = MI.getFlags();
1981     LLT Ty = MRI.getType(DstReg);
1982 
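    // frem(x, y) == x - trunc(x / y) * y, computed here as
    // fma(-trunc(x / y), y, x).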
1983     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1984     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1985     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1986     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1987     MI.eraseFromParent();
1988     return true;
1989 }
1990 
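// Extract the unbiased exponent from the high 32 bits of an f64: bitfield
// extract the 11 exponent bits, then subtract the IEEE-754 bias of 1023.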
1991 static MachineInstrBuilder extractF64Exponent(Register Hi,
1992                                               MachineIRBuilder &B) {
1993   const unsigned FractBits = 52;
1994   const unsigned ExpBits = 11;
1995   LLT S32 = LLT::scalar(32);
1996 
1997   auto Const0 = B.buildConstant(S32, FractBits - 32);
1998   auto Const1 = B.buildConstant(S32, ExpBits);
1999 
2000   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2001     .addUse(Hi)
2002     .addUse(Const0.getReg(0))
2003     .addUse(Const1.getReg(0));
2004 
2005   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2006 }
2007 
2008 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2009   MachineInstr &MI, MachineRegisterInfo &MRI,
2010   MachineIRBuilder &B) const {
2011   const LLT S1 = LLT::scalar(1);
2012   const LLT S32 = LLT::scalar(32);
2013   const LLT S64 = LLT::scalar(64);
2014 
2015   Register Src = MI.getOperand(1).getReg();
2016   assert(MRI.getType(Src) == S64);
2017 
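  // Truncate an f64 toward zero with integer operations: clear the fraction
  // bits below the binary point implied by the exponent. An exponent below
  // zero leaves only the sign bit (a signed zero); an exponent above 51 means
  // the value is already integral.
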
2018   // TODO: Should this use extract since the low half is unused?
2019   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2020   Register Hi = Unmerge.getReg(1);
2021 
2022   // Extract the upper half, since this is where we will find the sign and
2023   // exponent.
2024   auto Exp = extractF64Exponent(Hi, B);
2025 
2026   const unsigned FractBits = 52;
2027 
2028   // Extract the sign bit.
2029   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2030   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2031 
2032   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2033 
2034   const auto Zero32 = B.buildConstant(S32, 0);
2035 
2036   // Extend back to 64-bits.
2037   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2038 
2039   auto Shr = B.buildAShr(S64, FractMask, Exp);
2040   auto Not = B.buildNot(S64, Shr);
2041   auto Tmp0 = B.buildAnd(S64, Src, Not);
2042   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2043 
2044   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2045   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2046 
2047   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2048   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2049   MI.eraseFromParent();
2050   return true;
2051 }
2052 
2053 bool AMDGPULegalizerInfo::legalizeITOFP(
2054   MachineInstr &MI, MachineRegisterInfo &MRI,
2055   MachineIRBuilder &B, bool Signed) const {
2056 
2057   Register Dst = MI.getOperand(0).getReg();
2058   Register Src = MI.getOperand(1).getReg();
2059 
2060   const LLT S64 = LLT::scalar(64);
2061   const LLT S32 = LLT::scalar(32);
2062 
2063   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2064 
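  // Convert the 64-bit integer one 32-bit half at a time: the high half is
  // converted (signed or unsigned) and scaled by 2^32 with ldexp, the low
  // half is always converted unsigned, and the two results are added.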
2065   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2066 
2067   auto CvtHi = Signed ?
2068     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2069     B.buildUITOFP(S64, Unmerge.getReg(1));
2070 
2071   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2072 
2073   auto ThirtyTwo = B.buildConstant(S32, 32);
2074   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2075     .addUse(CvtHi.getReg(0))
2076     .addUse(ThirtyTwo.getReg(0));
2077 
2078   // TODO: Should this propagate fast-math-flags?
2079   B.buildFAdd(Dst, LdExp, CvtLo);
2080   MI.eraseFromParent();
2081   return true;
2082 }
2083 
2084 // TODO: Copied from DAG implementation. Verify logic and document how this
2085 // actually works.
2086 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2087                                         MachineRegisterInfo &MRI,
2088                                         MachineIRBuilder &B,
2089                                         bool Signed) const {
2090 
2091   Register Dst = MI.getOperand(0).getReg();
2092   Register Src = MI.getOperand(1).getReg();
2093 
2094   const LLT S64 = LLT::scalar(64);
2095   const LLT S32 = LLT::scalar(32);
2096 
2097   const LLT SrcLT = MRI.getType(Src);
2098   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2099 
2100   unsigned Flags = MI.getFlags();
2101 
2102   // The basic idea of converting a floating point number into a pair of 32-bit
2103   // integers is illustrated as follows:
2104   //
2105   //     tf := trunc(val);
2106   //    hif := floor(tf * 2^-32);
2107   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2108   //     hi := fptoi(hif);
2109   //     lo := fptoi(lof);
2110   //
2111   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2112   MachineInstrBuilder Sign;
2113   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2119     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2120     Trunc = B.buildFAbs(S32, Trunc, Flags);
2121   }
2122   MachineInstrBuilder K0, K1;
2123   if (SrcLT == S64) {
2124     K0 = B.buildFConstant(S64,
2125                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2126     K1 = B.buildFConstant(S64,
2127                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2128   } else {
2129     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2130     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2131   }
2132 
2133   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2134   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2135   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2136 
2137   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2138                                      : B.buildFPTOUI(S32, FloorMul);
2139   auto Lo = B.buildFPTOUI(S32, Fma);
2140 
2141   if (Signed && SrcLT == S32) {
2142     // Flip the result based on the signedness, which is either all 0s or 1s.
2143     Sign = B.buildMerge(S64, {Sign, Sign});
2144     // r := xor({lo, hi}, sign) - sign;
2145     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2146   } else
2147     B.buildMerge(Dst, {Lo, Hi});
2148   MI.eraseFromParent();
2149 
2150   return true;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2154                                                MachineInstr &MI) const {
2155   MachineFunction &MF = Helper.MIRBuilder.getMF();
2156   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2157 
2158   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2159                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2160 
2161   // With ieee_mode disabled, the instructions have the correct behavior
2162   // already for G_FMINNUM/G_FMAXNUM
2163   if (!MFI->getMode().IEEE)
2164     return !IsIEEEOp;
2165 
2166   if (IsIEEEOp)
2167     return true;
2168 
2169   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2170 }
2171 
2172 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2173   MachineInstr &MI, MachineRegisterInfo &MRI,
2174   MachineIRBuilder &B) const {
2175   // TODO: Should move some of this into LegalizerHelper.
2176 
2177   // TODO: Promote dynamic indexing of s16 to s32
2178 
2179   // FIXME: Artifact combiner probably should have replaced the truncated
2180   // constant before this, so we shouldn't need
2181   // getConstantVRegValWithLookThrough.
2182   Optional<ValueAndVReg> MaybeIdxVal =
2183       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2184   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2185     return true;
2186   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2187 
2188   Register Dst = MI.getOperand(0).getReg();
2189   Register Vec = MI.getOperand(1).getReg();
2190 
2191   LLT VecTy = MRI.getType(Vec);
2192   LLT EltTy = VecTy.getElementType();
2193   assert(EltTy == MRI.getType(Dst));
2194 
2195   if (IdxVal < VecTy.getNumElements())
2196     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2197   else
2198     B.buildUndef(Dst);
2199 
2200   MI.eraseFromParent();
2201   return true;
2202 }
2203 
2204 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2205   MachineInstr &MI, MachineRegisterInfo &MRI,
2206   MachineIRBuilder &B) const {
2207   // TODO: Should move some of this into LegalizerHelper.
2208 
2209   // TODO: Promote dynamic indexing of s16 to s32
2210 
2211   // FIXME: Artifact combiner probably should have replaced the truncated
2212   // constant before this, so we shouldn't need
2213   // getConstantVRegValWithLookThrough.
2214   Optional<ValueAndVReg> MaybeIdxVal =
2215       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2216   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2217     return true;
2218 
2219   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2220   Register Dst = MI.getOperand(0).getReg();
2221   Register Vec = MI.getOperand(1).getReg();
2222   Register Ins = MI.getOperand(2).getReg();
2223 
2224   LLT VecTy = MRI.getType(Vec);
2225   LLT EltTy = VecTy.getElementType();
2226   assert(EltTy == MRI.getType(Ins));
2227 
2228   if (IdxVal < VecTy.getNumElements())
2229     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2230   else
2231     B.buildUndef(Dst);
2232 
2233   MI.eraseFromParent();
2234   return true;
2235 }
2236 
2237 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2238   MachineInstr &MI, MachineRegisterInfo &MRI,
2239   MachineIRBuilder &B) const {
2240   const LLT V2S16 = LLT::fixed_vector(2, 16);
2241 
2242   Register Dst = MI.getOperand(0).getReg();
2243   Register Src0 = MI.getOperand(1).getReg();
2244   LLT DstTy = MRI.getType(Dst);
2245   LLT SrcTy = MRI.getType(Src0);
2246 
2247   if (SrcTy == V2S16 && DstTy == V2S16 &&
2248       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2249     return true;
2250 
2251   MachineIRBuilder HelperBuilder(MI);
2252   GISelObserverWrapper DummyObserver;
2253   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2254   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2255 }
2256 
2257 bool AMDGPULegalizerInfo::legalizeSinCos(
2258   MachineInstr &MI, MachineRegisterInfo &MRI,
2259   MachineIRBuilder &B) const {
2260 
2261   Register DstReg = MI.getOperand(0).getReg();
2262   Register SrcReg = MI.getOperand(1).getReg();
2263   LLT Ty = MRI.getType(DstReg);
2264   unsigned Flags = MI.getFlags();
2265 
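  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi). On
  // subtargets with a reduced trig input range, the scaled value is first
  // wrapped into [0, 1) with amdgcn.fract.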
2266   Register TrigVal;
2267   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2268   if (ST.hasTrigReducedRange()) {
2269     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2270     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2271       .addUse(MulVal.getReg(0))
2272       .setMIFlags(Flags).getReg(0);
2273   } else
2274     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2275 
2276   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2277     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2278   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2279     .addUse(TrigVal)
2280     .setMIFlags(Flags);
2281   MI.eraseFromParent();
2282   return true;
2283 }
2284 
2285 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2286                                                   MachineIRBuilder &B,
2287                                                   const GlobalValue *GV,
2288                                                   int64_t Offset,
2289                                                   unsigned GAFlags) const {
2290   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2291   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2292   // to the following code sequence:
2293   //
2294   // For constant address space:
2295   //   s_getpc_b64 s[0:1]
2296   //   s_add_u32 s0, s0, $symbol
2297   //   s_addc_u32 s1, s1, 0
2298   //
2299   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2300   //   a fixup or relocation is emitted to replace $symbol with a literal
2301   //   constant, which is a pc-relative offset from the encoding of the $symbol
2302   //   operand to the global variable.
2303   //
2304   // For global address space:
2305   //   s_getpc_b64 s[0:1]
2306   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2307   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2308   //
2309   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2310   //   fixups or relocations are emitted to replace $symbol@*@lo and
2311   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2312   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2313   //   operand to the global variable.
2314   //
2315   // What we want here is an offset from the value returned by s_getpc
2316   // (which is the address of the s_add_u32 instruction) to the global
2317   // variable, but since the encoding of $symbol starts 4 bytes after the start
2318   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2319   // small. This requires us to add 4 to the global variable offset in order to
2320   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2321   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2322   // instruction.
2323 
2324   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2325 
2326   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2327     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2328 
2329   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2330     .addDef(PCReg);
2331 
2332   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2333   if (GAFlags == SIInstrInfo::MO_NONE)
2334     MIB.addImm(0);
2335   else
2336     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2337 
2338   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2339 
2340   if (PtrTy.getSizeInBits() == 32)
2341     B.buildExtract(DstReg, PCReg, 0);
2342   return true;
}
2344 
2345 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2346   MachineInstr &MI, MachineRegisterInfo &MRI,
2347   MachineIRBuilder &B) const {
2348   Register DstReg = MI.getOperand(0).getReg();
2349   LLT Ty = MRI.getType(DstReg);
2350   unsigned AS = Ty.getAddressSpace();
2351 
2352   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2353   MachineFunction &MF = B.getMF();
2354   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2355 
2356   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2357     if (!MFI->isModuleEntryFunction() &&
2358         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2359       const Function &Fn = MF.getFunction();
2360       DiagnosticInfoUnsupported BadLDSDecl(
2361         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2362         DS_Warning);
2363       Fn.getContext().diagnose(BadLDSDecl);
2364 
2365       // We currently don't have a way to correctly allocate LDS objects that
2366       // aren't directly associated with a kernel. We do force inlining of
2367       // functions that use local objects. However, if these dead functions are
2368       // not eliminated, we don't want a compile time error. Just emit a warning
2369       // and a trap, since there should be no callable path here.
2370       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2371       B.buildUndef(DstReg);
2372       MI.eraseFromParent();
2373       return true;
2374     }
2375 
2376     // TODO: We could emit code to handle the initialization somewhere.
2377     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2378       const SITargetLowering *TLI = ST.getTargetLowering();
2379       if (!TLI->shouldUseLDSConstAddress(GV)) {
2380         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2381         return true; // Leave in place;
2382       }
2383 
2384       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2385         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. These variables are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
2391         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2392           // Adjust alignment for that dynamic shared memory array.
2393           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2394           LLT S32 = LLT::scalar(32);
2395           auto Sz =
2396               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2397           B.buildIntToPtr(DstReg, Sz);
2398           MI.eraseFromParent();
2399           return true;
2400         }
2401       }
2402 
2403       B.buildConstant(
2404           DstReg,
2405           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2406       MI.eraseFromParent();
2407       return true;
2408     }
2409 
2410     const Function &Fn = MF.getFunction();
2411     DiagnosticInfoUnsupported BadInit(
2412       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2413     Fn.getContext().diagnose(BadInit);
2414     return true;
2415   }
2416 
2417   const SITargetLowering *TLI = ST.getTargetLowering();
2418 
2419   if (TLI->shouldEmitFixup(GV)) {
2420     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2421     MI.eraseFromParent();
2422     return true;
2423   }
2424 
2425   if (TLI->shouldEmitPCReloc(GV)) {
2426     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2427     MI.eraseFromParent();
2428     return true;
2429   }
2430 
2431   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2432   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2433 
2434   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2435       MachinePointerInfo::getGOT(MF),
2436       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2437           MachineMemOperand::MOInvariant,
2438       8 /*Size*/, Align(8));
2439 
2440   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2441 
2442   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2444     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2445     B.buildExtract(DstReg, Load, 0);
2446   } else
2447     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2448 
2449   MI.eraseFromParent();
2450   return true;
2451 }
2452 
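// Round a scalar up to the next power-of-2 bit width, or a vector up to the
// next power-of-2 element count, keeping the element type.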
2453 static LLT widenToNextPowerOf2(LLT Ty) {
2454   if (Ty.isVector())
2455     return Ty.changeElementCount(
2456         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2457   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2458 }
2459 
2460 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2461                                        MachineInstr &MI) const {
2462   MachineIRBuilder &B = Helper.MIRBuilder;
2463   MachineRegisterInfo &MRI = *B.getMRI();
2464   GISelChangeObserver &Observer = Helper.Observer;
2465 
2466   Register PtrReg = MI.getOperand(1).getReg();
2467   LLT PtrTy = MRI.getType(PtrReg);
2468   unsigned AddrSpace = PtrTy.getAddressSpace();
2469 
2470   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2471     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2472     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2473     Observer.changingInstr(MI);
2474     MI.getOperand(1).setReg(Cast.getReg(0));
2475     Observer.changedInstr(MI);
2476     return true;
2477   }
2478 
2479   if (MI.getOpcode() != AMDGPU::G_LOAD)
2480     return false;
2481 
2482   Register ValReg = MI.getOperand(0).getReg();
2483   LLT ValTy = MRI.getType(ValReg);
2484 
2485   MachineMemOperand *MMO = *MI.memoperands_begin();
2486   const unsigned ValSize = ValTy.getSizeInBits();
2487   const unsigned MemSize = 8 * MMO->getSize();
2488   const Align MemAlign = MMO->getAlign();
2489   const unsigned AlignInBits = 8 * MemAlign.value();
2490 
2491   // Widen non-power-of-2 loads to the alignment if needed
2492   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2493     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2494 
2495     // This was already the correct extending load result type, so just adjust
2496     // the memory type.
2497     if (WideMemSize == ValSize) {
2498       MachineFunction &MF = B.getMF();
2499 
2500       MachineMemOperand *WideMMO =
2501           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2502       Observer.changingInstr(MI);
2503       MI.setMemRefs(MF, {WideMMO});
2504       Observer.changedInstr(MI);
2505       return true;
2506     }
2507 
    // Don't bother handling an edge case that should probably never be
    // produced.
2509     if (ValSize > WideMemSize)
2510       return false;
2511 
2512     LLT WideTy = widenToNextPowerOf2(ValTy);
2513 
2514     Register WideLoad;
2515     if (!WideTy.isVector()) {
2516       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2517       B.buildTrunc(ValReg, WideLoad).getReg(0);
2518     } else {
2519       // Extract the subvector.
2520 
2521       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2523         // (e.g. <3 x s32> -> <4 x s32>)
2524         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2525         B.buildExtract(ValReg, WideLoad, 0);
2526       } else {
2527         // For cases where the widened type isn't a nice register value, unmerge
2528         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2529         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2530         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2531         B.setInsertPt(B.getMBB(), MI.getIterator());
2532         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2533       }
2534     }
2535 
2536     MI.eraseFromParent();
2537     return true;
2538   }
2539 
2540   return false;
2541 }
2542 
2543 bool AMDGPULegalizerInfo::legalizeFMad(
2544   MachineInstr &MI, MachineRegisterInfo &MRI,
2545   MachineIRBuilder &B) const {
2546   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2547   assert(Ty.isScalar());
2548 
2549   MachineFunction &MF = B.getMF();
2550   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2551 
2552   // TODO: Always legal with future ftz flag.
2553   // FIXME: Do we need just output?
2554   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2555     return true;
2556   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2557     return true;
2558 
2559   MachineIRBuilder HelperBuilder(MI);
2560   GISelObserverWrapper DummyObserver;
2561   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2562   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2563 }
2564 
2565 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2566   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2567   Register DstReg = MI.getOperand(0).getReg();
2568   Register PtrReg = MI.getOperand(1).getReg();
2569   Register CmpVal = MI.getOperand(2).getReg();
2570   Register NewVal = MI.getOperand(3).getReg();
2571 
2572   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2573          "this should not have been custom lowered");
2574 
2575   LLT ValTy = MRI.getType(CmpVal);
2576   LLT VecTy = LLT::fixed_vector(2, ValTy);
2577 
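  // Pack the new value and the compare value into a single 2-element vector
  // operand for G_AMDGPU_ATOMIC_CMPXCHG.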
2578   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2579 
2580   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2581     .addDef(DstReg)
2582     .addUse(PtrReg)
2583     .addUse(PackedVal)
2584     .setMemRefs(MI.memoperands());
2585 
2586   MI.eraseFromParent();
2587   return true;
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizeFlog(
2591   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2592   Register Dst = MI.getOperand(0).getReg();
2593   Register Src = MI.getOperand(1).getReg();
2594   LLT Ty = B.getMRI()->getType(Dst);
2595   unsigned Flags = MI.getFlags();
2596 
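  // log_b(x) = log2(x) * (1 / log2(b)). Callers pass ln(2) for G_FLOG and
  // ln(2) / ln(10) for G_FLOG10 as Log2BaseInverted.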
2597   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2598   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2599 
2600   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2601   MI.eraseFromParent();
2602   return true;
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2606                                        MachineIRBuilder &B) const {
2607   Register Dst = MI.getOperand(0).getReg();
2608   Register Src = MI.getOperand(1).getReg();
2609   unsigned Flags = MI.getFlags();
2610   LLT Ty = B.getMRI()->getType(Dst);
2611 
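  // exp(x) = exp2(x * log2(e)).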
2612   auto K = B.buildFConstant(Ty, numbers::log2e);
2613   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2614   B.buildFExp2(Dst, Mul, Flags);
2615   MI.eraseFromParent();
2616   return true;
2617 }
2618 
2619 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2620                                        MachineIRBuilder &B) const {
2621   Register Dst = MI.getOperand(0).getReg();
2622   Register Src0 = MI.getOperand(1).getReg();
2623   Register Src1 = MI.getOperand(2).getReg();
2624   unsigned Flags = MI.getFlags();
2625   LLT Ty = B.getMRI()->getType(Dst);
2626   const LLT S16 = LLT::scalar(16);
2627   const LLT S32 = LLT::scalar(32);
2628 
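  // pow(x, y) is expanded as exp2(y * log2(x)); the multiply uses the legacy
  // multiply intrinsic, which treats 0 * anything as 0.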
2629   if (Ty == S32) {
2630     auto Log = B.buildFLog2(S32, Src0, Flags);
2631     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2632       .addUse(Log.getReg(0))
2633       .addUse(Src1)
2634       .setMIFlags(Flags);
2635     B.buildFExp2(Dst, Mul, Flags);
2636   } else if (Ty == S16) {
2637     // There's no f16 fmul_legacy, so we need to convert for it.
2638     auto Log = B.buildFLog2(S16, Src0, Flags);
2639     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2640     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2641     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2642       .addUse(Ext0.getReg(0))
2643       .addUse(Ext1.getReg(0))
2644       .setMIFlags(Flags);
2645 
2646     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2647   } else
2648     return false;
2649 
2650   MI.eraseFromParent();
2651   return true;
2652 }
2653 
2654 // Find a source register, ignoring any possible source modifiers.
2655 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2656   Register ModSrc = OrigSrc;
2657   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2658     ModSrc = SrcFNeg->getOperand(1).getReg();
2659     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2660       ModSrc = SrcFAbs->getOperand(1).getReg();
2661   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2662     ModSrc = SrcFAbs->getOperand(1).getReg();
2663   return ModSrc;
2664 }
2665 
2666 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2667                                          MachineRegisterInfo &MRI,
2668                                          MachineIRBuilder &B) const {
2669 
2670   const LLT S1 = LLT::scalar(1);
2671   const LLT S64 = LLT::scalar(64);
2672   Register Dst = MI.getOperand(0).getReg();
2673   Register OrigSrc = MI.getOperand(1).getReg();
2674   unsigned Flags = MI.getFlags();
2675   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2676          "this should not have been custom lowered");
2677 
2678   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2679   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2680   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2681   // V_FRACT bug is:
2682   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2683   //
2684   // Convert floor(x) to (x - fract(x))
2685 
2686   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2687     .addUse(OrigSrc)
2688     .setMIFlags(Flags);
2689 
2690   // Give source modifier matching some assistance before obscuring a foldable
2691   // pattern.
2692 
2693   // TODO: We can avoid the neg on the fract? The input sign to fract
2694   // shouldn't matter?
2695   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2696 
2697   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2698 
2699   Register Min = MRI.createGenericVirtualRegister(S64);
2700 
2701   // We don't need to concern ourselves with the snan handling difference, so
2702   // use the one which will directly select.
2703   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2704   if (MFI->getMode().IEEE)
2705     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2706   else
2707     B.buildFMinNum(Min, Fract, Const, Flags);
2708 
2709   Register CorrectedFract = Min;
2710   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2711     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2712     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2713   }
2714 
2715   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2716   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2717 
2718   MI.eraseFromParent();
2719   return true;
2720 }
2721 
2722 // Turn an illegal packed v2s16 build vector into bit operations.
2723 // TODO: This should probably be a bitcast action in LegalizerHelper.
2724 bool AMDGPULegalizerInfo::legalizeBuildVector(
2725   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2726   Register Dst = MI.getOperand(0).getReg();
2727   const LLT S32 = LLT::scalar(32);
2728   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2729 
2730   Register Src0 = MI.getOperand(1).getReg();
2731   Register Src1 = MI.getOperand(2).getReg();
2732   assert(MRI.getType(Src0) == LLT::scalar(16));
2733 
2734   auto Merge = B.buildMerge(S32, {Src0, Src1});
2735   B.buildBitcast(Dst, Merge);
2736 
2737   MI.eraseFromParent();
2738   return true;
2739 }
2740 
2741 // Check that this is a G_XOR x, -1
2742 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2743   if (MI.getOpcode() != TargetOpcode::G_XOR)
2744     return false;
2745   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2746   return ConstVal && *ConstVal == -1;
2747 }
2748 
// Return the use branch instruction, or null if the usage is invalid.
2750 static MachineInstr *
2751 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2752                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2753   Register CondDef = MI.getOperand(0).getReg();
2754   if (!MRI.hasOneNonDBGUse(CondDef))
2755     return nullptr;
2756 
2757   MachineBasicBlock *Parent = MI.getParent();
2758   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2759 
2760   if (isNot(MRI, *UseMI)) {
2761     Register NegatedCond = UseMI->getOperand(0).getReg();
2762     if (!MRI.hasOneNonDBGUse(NegatedCond))
2763       return nullptr;
2764 
2765     // We're deleting the def of this value, so we need to remove it.
2766     UseMI->eraseFromParent();
2767 
2768     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2769     Negated = true;
2770   }
2771 
2772   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2773     return nullptr;
2774 
2775   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2776   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2777   if (Next == Parent->end()) {
2778     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2779     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2780       return nullptr;
2781     UncondBrTarget = &*NextMBB;
2782   } else {
2783     if (Next->getOpcode() != AMDGPU::G_BR)
2784       return nullptr;
2785     Br = &*Next;
2786     UncondBrTarget = Br->getOperand(0).getMBB();
2787   }
2788 
2789   return UseMI;
2790 }
2791 
2792 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2793                                          const ArgDescriptor *Arg,
2794                                          const TargetRegisterClass *ArgRC,
2795                                          LLT ArgTy) const {
2796   MCRegister SrcReg = Arg->getRegister();
2797   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2798   assert(DstReg.isVirtual() && "Virtual register expected");
2799 
2800   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2801                                              ArgTy);
2802   if (Arg->isMasked()) {
2803     // TODO: Should we try to emit this once in the entry block?
2804     const LLT S32 = LLT::scalar(32);
2805     const unsigned Mask = Arg->getMask();
2806     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2807 
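    // The argument occupies a bitfield of the physical register: shift it
    // down to bit 0 and mask off the field width.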
2808     Register AndMaskSrc = LiveIn;
2809 
2810     if (Shift != 0) {
2811       auto ShiftAmt = B.buildConstant(S32, Shift);
2812       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2813     }
2814 
2815     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2816   } else {
2817     B.buildCopy(DstReg, LiveIn);
2818   }
2819 
2820   return true;
2821 }
2822 
2823 bool AMDGPULegalizerInfo::loadInputValue(
2824     Register DstReg, MachineIRBuilder &B,
2825     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2826   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2827   const ArgDescriptor *Arg;
2828   const TargetRegisterClass *ArgRC;
2829   LLT ArgTy;
2830   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2831 
2832   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2833     return false; // TODO: Handle these
2834   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2835 }
2836 
2837 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2838     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2839     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2840   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2841     return false;
2842 
2843   MI.eraseFromParent();
2844   return true;
2845 }
2846 
2847 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2848                                        MachineRegisterInfo &MRI,
2849                                        MachineIRBuilder &B) const {
2850   Register Dst = MI.getOperand(0).getReg();
2851   LLT DstTy = MRI.getType(Dst);
2852   LLT S16 = LLT::scalar(16);
2853   LLT S32 = LLT::scalar(32);
2854   LLT S64 = LLT::scalar(64);
2855 
2856   if (DstTy == S16)
2857     return legalizeFDIV16(MI, MRI, B);
2858   if (DstTy == S32)
2859     return legalizeFDIV32(MI, MRI, B);
2860   if (DstTy == S64)
2861     return legalizeFDIV64(MI, MRI, B);
2862 
2863   return false;
2864 }
2865 
2866 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2867                                                         Register DstDivReg,
2868                                                         Register DstRemReg,
2869                                                         Register X,
2870                                                         Register Y) const {
2871   const LLT S1 = LLT::scalar(1);
2872   const LLT S32 = LLT::scalar(32);
2873 
2874   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2875   // algorithm used here.
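       // Roughly (a sketch, not the exact emitted MIR):
       //   z  = fptoui(rcp(uitofp(y)) * 0x4f7ffffe)   ; initial estimate of ~2^32 / y
       //   z += umulh(z, (-y) * z)                     ; one Newton-Raphson refinement
       //   q  = umulh(x, z);  r = x - q * y            ; quotient/remainder estimate
       // followed by two conditional corrections (q += 1, r -= y) below.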
2876 
2877   // Initial estimate of inv(y).
2878   auto FloatY = B.buildUITOFP(S32, Y);
2879   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2880   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2881   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2882   auto Z = B.buildFPTOUI(S32, ScaledY);
2883 
2884   // One round of UNR (unsigned Newton-Raphson refinement).
2885   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2886   auto NegYZ = B.buildMul(S32, NegY, Z);
2887   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2888 
2889   // Quotient/remainder estimate.
2890   auto Q = B.buildUMulH(S32, X, Z);
2891   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2892 
2893   // First quotient/remainder refinement.
2894   auto One = B.buildConstant(S32, 1);
2895   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2896   if (DstDivReg)
2897     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2898   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2899 
2900   // Second quotient/remainder refinement.
2901   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2902   if (DstDivReg)
2903     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2904 
2905   if (DstRemReg)
2906     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2907 }
2908 
2909 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2910 //
2911 // Return lo, hi of result
2912 //
2913 // %cvt.lo = G_UITOFP Val.lo
2914 // %cvt.hi = G_UITOFP Val.hi
2915 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2916 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2917 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2918 // %mul2 = G_FMUL %mul1, 2**(-32)
2919 // %trunc = G_INTRINSIC_TRUNC %mul2
2920 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2921 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2922 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2923                                                        Register Val) {
2924   const LLT S32 = LLT::scalar(32);
2925   auto Unmerge = B.buildUnmerge(S32, Val);
2926 
2927   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2928   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2929 
2930   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2931                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2932 
2933   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2934   auto Mul1 =
2935       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2936 
2937   // 2**(-32)
2938   auto Mul2 =
2939       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2940   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2941 
2942   // -(2**32)
2943   auto Mad2 = B.buildFMAD(S32, Trunc,
2944                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2945 
2946   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2947   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2948 
2949   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2950 }
2951 
2952 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2953                                                         Register DstDivReg,
2954                                                         Register DstRemReg,
2955                                                         Register Numer,
2956                                                         Register Denom) const {
2957   const LLT S32 = LLT::scalar(32);
2958   const LLT S64 = LLT::scalar(64);
2959   const LLT S1 = LLT::scalar(1);
2960   Register RcpLo, RcpHi;
2961 
2962   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2963 
2964   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2965 
2966   auto Zero64 = B.buildConstant(S64, 0);
2967   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2968 
2969   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2970   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2971 
2972   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2973   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2974   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2975 
2976   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2977   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2978   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2979   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2980 
2981   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2982   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2983   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2984   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2985   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2986 
2987   auto Zero32 = B.buildConstant(S32, 0);
2988   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2989   auto Add2_HiC =
2990       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2991   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2992   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2993 
2994   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2995   Register NumerLo = UnmergeNumer.getReg(0);
2996   Register NumerHi = UnmergeNumer.getReg(1);
2997 
2998   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2999   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3000   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3001   Register Mul3_Lo = UnmergeMul3.getReg(0);
3002   Register Mul3_Hi = UnmergeMul3.getReg(1);
3003   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3004   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3005   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3006   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3007 
3008   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3009   Register DenomLo = UnmergeDenom.getReg(0);
3010   Register DenomHi = UnmergeDenom.getReg(1);
3011 
3012   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3013   auto C1 = B.buildSExt(S32, CmpHi);
3014 
3015   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3016   auto C2 = B.buildSExt(S32, CmpLo);
3017 
3018   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3019   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3020 
3021   // TODO: Here and below, portions of the code could be enclosed in if/endif
3022   // blocks. Currently control flow is unconditional and we have 4 selects
3023   // after the potential endif to substitute PHIs.
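       // A sketch of the intended structure (currently everything below is emitted
       // unconditionally and C3/C6 drive the final selects instead of branches):
       //   if (C3 != 0) { Sub2 = Sub1 - Denom; Add3 = MulHi3 + 1;
       //     if (C6 != 0) { Sub3 = Sub2 - Denom; Add4 = Add3 + 1; } }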
3024 
3025   // if C3 != 0 ...
3026   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3027   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3028   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3029   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3030 
3031   auto One64 = B.buildConstant(S64, 1);
3032   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3033 
3034   auto C4 =
3035       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3036   auto C5 =
3037       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3038   auto C6 = B.buildSelect(
3039       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3040 
3041   // if (C6 != 0)
3042   auto Add4 = B.buildAdd(S64, Add3, One64);
3043   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3044 
3045   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3046   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3047   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3048 
3049   // endif C6
3050   // endif C3
3051 
3052   if (DstDivReg) {
3053     auto Sel1 = B.buildSelect(
3054         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3055     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3056                   Sel1, MulHi3);
3057   }
3058 
3059   if (DstRemReg) {
3060     auto Sel2 = B.buildSelect(
3061         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3062     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3063                   Sel2, Sub1);
3064   }
3065 }
3066 
3067 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3068                                                   MachineRegisterInfo &MRI,
3069                                                   MachineIRBuilder &B) const {
3070   Register DstDivReg, DstRemReg;
3071   switch (MI.getOpcode()) {
3072   default:
3073     llvm_unreachable("Unexpected opcode!");
3074   case AMDGPU::G_UDIV: {
3075     DstDivReg = MI.getOperand(0).getReg();
3076     break;
3077   }
3078   case AMDGPU::G_UREM: {
3079     DstRemReg = MI.getOperand(0).getReg();
3080     break;
3081   }
3082   case AMDGPU::G_UDIVREM: {
3083     DstDivReg = MI.getOperand(0).getReg();
3084     DstRemReg = MI.getOperand(1).getReg();
3085     break;
3086   }
3087   }
3088 
3089   const LLT S64 = LLT::scalar(64);
3090   const LLT S32 = LLT::scalar(32);
3091   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3092   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3093   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3094   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3095 
3096   if (Ty == S32)
3097     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3098   else if (Ty == S64)
3099     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3100   else
3101     return false;
3102 
3103   MI.eraseFromParent();
3104   return true;
3105 }
3106 
3107 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3108                                                 MachineRegisterInfo &MRI,
3109                                                 MachineIRBuilder &B) const {
3110   const LLT S64 = LLT::scalar(64);
3111   const LLT S32 = LLT::scalar(32);
3112 
3113   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3114   if (Ty != S32 && Ty != S64)
3115     return false;
3116 
3117   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3118   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3119   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3120 
3121   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3122   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3123   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3124 
3125   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3126   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3127 
3128   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3129   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
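       // (v + sign) ^ sign computes |v| when sign is 0 or -1 (two's complement);
       // for example v = -7: sign = -1, (-7 + -1) ^ -1 = -8 ^ -1 = 7.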
3130 
3131   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3132   switch (MI.getOpcode()) {
3133   default:
3134     llvm_unreachable("Unexpected opcode!");
3135   case AMDGPU::G_SDIV: {
3136     DstDivReg = MI.getOperand(0).getReg();
3137     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3138     break;
3139   }
3140   case AMDGPU::G_SREM: {
3141     DstRemReg = MI.getOperand(0).getReg();
3142     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3143     break;
3144   }
3145   case AMDGPU::G_SDIVREM: {
3146     DstDivReg = MI.getOperand(0).getReg();
3147     DstRemReg = MI.getOperand(1).getReg();
3148     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3149     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3150     break;
3151   }
3152   }
3153 
3154   if (Ty == S32)
3155     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3156   else
3157     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3158 
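       // Undo the bias: (v ^ s) - s conditionally negates v (s is 0 or -1). The
       // quotient is negative iff exactly one operand was; the remainder takes the
       // sign of the LHS.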
3159   if (DstDivReg) {
3160     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3161     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3162     B.buildSub(DstDivReg, SignXor, Sign);
3163   }
3164 
3165   if (DstRemReg) {
3166     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3167     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3168     B.buildSub(DstRemReg, SignXor, Sign);
3169   }
3170 
3171   MI.eraseFromParent();
3172   return true;
3173 }
3174 
3175 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3176                                                  MachineRegisterInfo &MRI,
3177                                                  MachineIRBuilder &B) const {
3178   Register Res = MI.getOperand(0).getReg();
3179   Register LHS = MI.getOperand(1).getReg();
3180   Register RHS = MI.getOperand(2).getReg();
3181   uint16_t Flags = MI.getFlags();
3182   LLT ResTy = MRI.getType(Res);
3183 
3184   const MachineFunction &MF = B.getMF();
3185   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3186                             MI.getFlag(MachineInstr::FmAfn);
3187 
3188   if (!AllowInaccurateRcp)
3189     return false;
3190 
3191   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3192     // 1 / x -> RCP(x)
3193     if (CLHS->isExactlyValue(1.0)) {
3194       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3195         .addUse(RHS)
3196         .setMIFlags(Flags);
3197 
3198       MI.eraseFromParent();
3199       return true;
3200     }
3201 
3202     // -1 / x -> RCP( FNEG(x) )
3203     if (CLHS->isExactlyValue(-1.0)) {
3204       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3205       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3206         .addUse(FNeg.getReg(0))
3207         .setMIFlags(Flags);
3208 
3209       MI.eraseFromParent();
3210       return true;
3211     }
3212   }
3213 
3214   // x / y -> x * (1.0 / y)
3215   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3216     .addUse(RHS)
3217     .setMIFlags(Flags);
3218   B.buildFMul(Res, LHS, RCP, Flags);
3219 
3220   MI.eraseFromParent();
3221   return true;
3222 }
3223 
3224 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3225                                                    MachineRegisterInfo &MRI,
3226                                                    MachineIRBuilder &B) const {
3227   Register Res = MI.getOperand(0).getReg();
3228   Register X = MI.getOperand(1).getReg();
3229   Register Y = MI.getOperand(2).getReg();
3230   uint16_t Flags = MI.getFlags();
3231   LLT ResTy = MRI.getType(Res);
3232 
3233   const MachineFunction &MF = B.getMF();
3234   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3235                             MI.getFlag(MachineInstr::FmAfn);
3236 
3237   if (!AllowInaccurateRcp)
3238     return false;
3239 
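       // Sketch of the expansion: r = rcp(y) is refined with two Newton-Raphson
       // steps, r = fma(fma(-y, r, 1), r, r), and the product x * r gets one final
       // residual correction, res = fma(fma(-y, res, x), r, res).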
3240   auto NegY = B.buildFNeg(ResTy, Y);
3241   auto One = B.buildFConstant(ResTy, 1.0);
3242 
3243   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3244     .addUse(Y)
3245     .setMIFlags(Flags);
3246 
3247   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3248   R = B.buildFMA(ResTy, Tmp0, R, R);
3249 
3250   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3251   R = B.buildFMA(ResTy, Tmp1, R, R);
3252 
3253   auto Ret = B.buildFMul(ResTy, X, R);
3254   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3255 
3256   B.buildFMA(Res, Tmp2, R, Ret);
3257   MI.eraseFromParent();
3258   return true;
3259 }
3260 
3261 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3262                                          MachineRegisterInfo &MRI,
3263                                          MachineIRBuilder &B) const {
3264   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3265     return true;
3266 
3267   Register Res = MI.getOperand(0).getReg();
3268   Register LHS = MI.getOperand(1).getReg();
3269   Register RHS = MI.getOperand(2).getReg();
3270 
3271   uint16_t Flags = MI.getFlags();
3272 
3273   LLT S16 = LLT::scalar(16);
3274   LLT S32 = LLT::scalar(32);
3275 
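       // Sketch: extend both operands to f32, compute q = lhs32 * rcp(rhs32),
       // truncate back to f16, and let amdgcn.div.fixup patch up the special cases.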
3276   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3277   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3278 
3279   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3280     .addUse(RHSExt.getReg(0))
3281     .setMIFlags(Flags);
3282 
3283   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3284   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3285 
3286   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3287     .addUse(RDst.getReg(0))
3288     .addUse(RHS)
3289     .addUse(LHS)
3290     .setMIFlags(Flags);
3291 
3292   MI.eraseFromParent();
3293   return true;
3294 }
3295 
3296 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3297 // to enable denorm mode; when it is false, restore the function's default mode.
3298 static void toggleSPDenormMode(bool Enable,
3299                                MachineIRBuilder &B,
3300                                const GCNSubtarget &ST,
3301                                AMDGPU::SIModeRegisterDefaults Mode) {
3302   // Set SP denorm mode to this value.
3303   unsigned SPDenormMode =
3304     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3305 
3306   if (ST.hasDenormModeInst()) {
3307     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3308     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3309 
3310     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3311     B.buildInstr(AMDGPU::S_DENORM_MODE)
3312       .addImm(NewDenormModeValue);
3313 
3314   } else {
3315     // Select the FP32 denorm bit field in the mode register.
3316     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3317                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3318                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
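         // i.e. roughly hwreg(HW_REG_MODE, 4, 2): the 2-bit FP32 denorm field of
         // the MODE register starts at bit 4 (WIDTH_M1 of 1 encodes a width of 2).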
3319 
3320     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3321       .addImm(SPDenormMode)
3322       .addImm(SPDenormModeBitField);
3323   }
3324 }
3325 
3326 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3327                                          MachineRegisterInfo &MRI,
3328                                          MachineIRBuilder &B) const {
3329   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3330     return true;
3331 
3332   Register Res = MI.getOperand(0).getReg();
3333   Register LHS = MI.getOperand(1).getReg();
3334   Register RHS = MI.getOperand(2).getReg();
3335   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3336   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3337 
3338   uint16_t Flags = MI.getFlags();
3339 
3340   LLT S32 = LLT::scalar(32);
3341   LLT S1 = LLT::scalar(1);
3342 
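       // Sketch of the expansion: scale num/den with amdgcn.div.scale, refine
       // rcp(den_scaled) with a chain of fmas (temporarily enabling FP32 denormals
       // if the function's default mode flushes them), then combine the pieces
       // with amdgcn.div.fmas and amdgcn.div.fixup.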
3343   auto One = B.buildFConstant(S32, 1.0f);
3344 
3345   auto DenominatorScaled =
3346     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3347       .addUse(LHS)
3348       .addUse(RHS)
3349       .addImm(0)
3350       .setMIFlags(Flags);
3351   auto NumeratorScaled =
3352     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3353       .addUse(LHS)
3354       .addUse(RHS)
3355       .addImm(1)
3356       .setMIFlags(Flags);
3357 
3358   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3359     .addUse(DenominatorScaled.getReg(0))
3360     .setMIFlags(Flags);
3361   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3362 
3363   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3364   // aren't modeled as reading it.
3365   if (!Mode.allFP32Denormals())
3366     toggleSPDenormMode(true, B, ST, Mode);
3367 
3368   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3369   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3370   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3371   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3372   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3373   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3374 
3375   if (!Mode.allFP32Denormals())
3376     toggleSPDenormMode(false, B, ST, Mode);
3377 
3378   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3379     .addUse(Fma4.getReg(0))
3380     .addUse(Fma1.getReg(0))
3381     .addUse(Fma3.getReg(0))
3382     .addUse(NumeratorScaled.getReg(1))
3383     .setMIFlags(Flags);
3384 
3385   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3386     .addUse(Fmas.getReg(0))
3387     .addUse(RHS)
3388     .addUse(LHS)
3389     .setMIFlags(Flags);
3390 
3391   MI.eraseFromParent();
3392   return true;
3393 }
3394 
3395 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3396                                          MachineRegisterInfo &MRI,
3397                                          MachineIRBuilder &B) const {
3398   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3399     return true;
3400 
3401   Register Res = MI.getOperand(0).getReg();
3402   Register LHS = MI.getOperand(1).getReg();
3403   Register RHS = MI.getOperand(2).getReg();
3404 
3405   uint16_t Flags = MI.getFlags();
3406 
3407   LLT S64 = LLT::scalar(64);
3408   LLT S1 = LLT::scalar(1);
3409 
3410   auto One = B.buildFConstant(S64, 1.0);
3411 
3412   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3413     .addUse(LHS)
3414     .addUse(RHS)
3415     .addImm(0)
3416     .setMIFlags(Flags);
3417 
3418   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3419 
3420   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3421     .addUse(DivScale0.getReg(0))
3422     .setMIFlags(Flags);
3423 
3424   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3425   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3426   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3427 
3428   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3429     .addUse(LHS)
3430     .addUse(RHS)
3431     .addImm(1)
3432     .setMIFlags(Flags);
3433 
3434   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3435   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3436   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3437 
3438   Register Scale;
3439   if (!ST.hasUsableDivScaleConditionOutput()) {
3440     // Workaround a hardware bug on SI where the condition output from div_scale
3441     // is not usable.
3442 
3443     LLT S32 = LLT::scalar(32);
3444 
3445     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3446     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3447     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3448     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3449 
3450     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3451                               Scale1Unmerge.getReg(1));
3452     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3453                               Scale0Unmerge.getReg(1));
3454     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3455   } else {
3456     Scale = DivScale1.getReg(1);
3457   }
3458 
3459   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3460     .addUse(Fma4.getReg(0))
3461     .addUse(Fma3.getReg(0))
3462     .addUse(Mul.getReg(0))
3463     .addUse(Scale)
3464     .setMIFlags(Flags);
3465 
3466   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3467     .addUse(Fmas.getReg(0))
3468     .addUse(RHS)
3469     .addUse(LHS)
3470     .setMIFlags(Flags);
3471 
3472   MI.eraseFromParent();
3473   return true;
3474 }
3475 
3476 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3477                                                  MachineRegisterInfo &MRI,
3478                                                  MachineIRBuilder &B) const {
3479   Register Res = MI.getOperand(0).getReg();
3480   Register LHS = MI.getOperand(2).getReg();
3481   Register RHS = MI.getOperand(3).getReg();
3482   uint16_t Flags = MI.getFlags();
3483 
3484   LLT S32 = LLT::scalar(32);
3485   LLT S1 = LLT::scalar(1);
3486 
3487   auto Abs = B.buildFAbs(S32, RHS, Flags);
3488   const APFloat C0Val(1.0f);
3489 
3490   auto C0 = B.buildConstant(S32, 0x6f800000);
3491   auto C1 = B.buildConstant(S32, 0x2f800000);
3492   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
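       // 0x6f800000 is the bit pattern of 2^96 and 0x2f800000 of 2^-32: if |rhs|
       // is huge, pre-scale it down by 2^-32 (and re-scale the result by the same
       // factor) so the rcp stays in range; otherwise the scale is 1.0.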
3493 
3494   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3495   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3496 
3497   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3498 
3499   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3500     .addUse(Mul0.getReg(0))
3501     .setMIFlags(Flags);
3502 
3503   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3504 
3505   B.buildFMul(Res, Sel, Mul1, Flags);
3506 
3507   MI.eraseFromParent();
3508   return true;
3509 }
3510 
3511 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3512 // FIXME: Why do we handle this one but not other removed instructions?
3513 //
3514 // Reciprocal square root.  The clamp prevents infinite results, clamping
3515 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3516 // +-max_float.
3517 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3518                                                     MachineRegisterInfo &MRI,
3519                                                     MachineIRBuilder &B) const {
3520   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3521     return true;
3522 
3523   Register Dst = MI.getOperand(0).getReg();
3524   Register Src = MI.getOperand(2).getReg();
3525   auto Flags = MI.getFlags();
3526 
3527   LLT Ty = MRI.getType(Dst);
3528 
3529   const fltSemantics *FltSemantics;
3530   if (Ty == LLT::scalar(32))
3531     FltSemantics = &APFloat::IEEEsingle();
3532   else if (Ty == LLT::scalar(64))
3533     FltSemantics = &APFloat::IEEEdouble();
3534   else
3535     return false;
3536 
3537   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3538     .addUse(Src)
3539     .setMIFlags(Flags);
3540 
3541   // We don't need to concern ourselves with the snan handling difference,
3542   // since the rsq already quieted (or not); use the variant that selects directly.
3543   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3544   const bool UseIEEE = MFI->getMode().IEEE;
3545 
3546   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3547   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3548                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3549 
3550   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3551 
3552   if (UseIEEE)
3553     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3554   else
3555     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3556   MI.eraseFromParent();
3557   return true;
3558 }
3559 
3560 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3561   switch (IID) {
3562   case Intrinsic::amdgcn_ds_fadd:
3563     return AMDGPU::G_ATOMICRMW_FADD;
3564   case Intrinsic::amdgcn_ds_fmin:
3565     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3566   case Intrinsic::amdgcn_ds_fmax:
3567     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3568   default:
3569     llvm_unreachable("not a DS FP intrinsic");
3570   }
3571 }
3572 
3573 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3574                                                       MachineInstr &MI,
3575                                                       Intrinsic::ID IID) const {
3576   GISelChangeObserver &Observer = Helper.Observer;
3577   Observer.changingInstr(MI);
3578 
3579   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3580 
3581   // The remaining operands were used to set fields in the MemOperand on
3582   // construction.
3583   for (int I = 6; I > 3; --I)
3584     MI.RemoveOperand(I);
3585 
3586   MI.RemoveOperand(1); // Remove the intrinsic ID.
3587   Observer.changedInstr(MI);
3588   return true;
3589 }
3590 
3591 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3592                                             MachineRegisterInfo &MRI,
3593                                             MachineIRBuilder &B) const {
3594   uint64_t Offset =
3595     ST.getTargetLowering()->getImplicitParameterOffset(
3596       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3597   LLT DstTy = MRI.getType(DstReg);
3598   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3599 
3600   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3601   if (!loadInputValue(KernargPtrReg, B,
3602                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3603     return false;
3604 
3605   // FIXME: This should be nuw
3606   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3607   return true;
3608 }
3609 
3610 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3611                                                  MachineRegisterInfo &MRI,
3612                                                  MachineIRBuilder &B) const {
3613   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3614   if (!MFI->isEntryFunction()) {
3615     return legalizePreloadedArgIntrin(MI, MRI, B,
3616                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3617   }
3618 
3619   Register DstReg = MI.getOperand(0).getReg();
3620   if (!getImplicitArgPtr(DstReg, MRI, B))
3621     return false;
3622 
3623   MI.eraseFromParent();
3624   return true;
3625 }
3626 
3627 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3628                                               MachineRegisterInfo &MRI,
3629                                               MachineIRBuilder &B,
3630                                               unsigned AddrSpace) const {
3631   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3632   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3633   Register Hi32 = Unmerge.getReg(1);
3634 
3635   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3636   MI.eraseFromParent();
3637   return true;
3638 }
3639 
3640 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3641 // offset (the offset that is included in bounds checking and swizzling, to be
3642 // split between the instruction's voffset and immoffset fields) and soffset
3643 // (the offset that is excluded from bounds checking and swizzling, to go in
3644 // the instruction's soffset field).  This function takes the first kind of
3645 // offset and figures out how to split it between voffset and immoffset.
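     // For example (a sketch): an incoming offset of %base + 4200 becomes
     // voffset = %base + 4096 and immoffset = 104, keeping the vgpr add a multiple
     // of 4096 so it has a better chance of being CSEd with similar accesses.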
3646 std::tuple<Register, unsigned, unsigned>
3647 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3648                                         Register OrigOffset) const {
3649   const unsigned MaxImm = 4095;
3650   Register BaseReg;
3651   unsigned TotalConstOffset;
3652   const LLT S32 = LLT::scalar(32);
3653   MachineRegisterInfo &MRI = *B.getMRI();
3654 
3655   std::tie(BaseReg, TotalConstOffset) =
3656       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3657 
3658   unsigned ImmOffset = TotalConstOffset;
3659 
3660   // If BaseReg is a pointer, convert it to int.
3661   if (MRI.getType(BaseReg).isPointer())
3662     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3663 
3664   // If the immediate value is too big for the immoffset field, keep only its
3665   // low 12 bits in the immoffset field, so that the value that is copied/added
3666   // for the voffset field is a multiple of 4096, and it stands more chance
3667   // of being CSEd with the copy/add for another similar load/store.
3668   // However, do not do that rounding down to a multiple of 4096 if that is a
3669   // negative number, as it appears to be illegal to have a negative offset
3670   // in the vgpr, even if adding the immediate offset makes it positive.
3671   unsigned Overflow = ImmOffset & ~MaxImm;
3672   ImmOffset -= Overflow;
3673   if ((int32_t)Overflow < 0) {
3674     Overflow += ImmOffset;
3675     ImmOffset = 0;
3676   }
3677 
3678   if (Overflow != 0) {
3679     if (!BaseReg) {
3680       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3681     } else {
3682       auto OverflowVal = B.buildConstant(S32, Overflow);
3683       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3684     }
3685   }
3686 
3687   if (!BaseReg)
3688     BaseReg = B.buildConstant(S32, 0).getReg(0);
3689 
3690   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3691 }
3692 
3693 /// Handle register layout difference for f16 images for some subtargets.
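     /// For example (a sketch): on subtargets with unpacked d16 VMEM, a <4 x s16>
     /// store value is unmerged, each element any-extended to s32, and rebuilt as
     /// a <4 x s32> vector.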
3694 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3695                                              MachineRegisterInfo &MRI,
3696                                              Register Reg,
3697                                              bool ImageStore) const {
3698   const LLT S16 = LLT::scalar(16);
3699   const LLT S32 = LLT::scalar(32);
3700   LLT StoreVT = MRI.getType(Reg);
3701   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3702 
3703   if (ST.hasUnpackedD16VMem()) {
3704     auto Unmerge = B.buildUnmerge(S16, Reg);
3705 
3706     SmallVector<Register, 4> WideRegs;
3707     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3708       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3709 
3710     int NumElts = StoreVT.getNumElements();
3711 
3712     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3713         .getReg(0);
3714   }
3715 
3716   if (ImageStore && ST.hasImageStoreD16Bug()) {
3717     if (StoreVT.getNumElements() == 2) {
3718       SmallVector<Register, 4> PackedRegs;
3719       Reg = B.buildBitcast(S32, Reg).getReg(0);
3720       PackedRegs.push_back(Reg);
3721       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3722       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3723           .getReg(0);
3724     }
3725 
3726     if (StoreVT.getNumElements() == 3) {
3727       SmallVector<Register, 4> PackedRegs;
3728       auto Unmerge = B.buildUnmerge(S16, Reg);
3729       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3730         PackedRegs.push_back(Unmerge.getReg(I));
3731       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3732       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3733       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3734     }
3735 
3736     if (StoreVT.getNumElements() == 4) {
3737       SmallVector<Register, 4> PackedRegs;
3738       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3739       auto Unmerge = B.buildUnmerge(S32, Reg);
3740       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3741         PackedRegs.push_back(Unmerge.getReg(I));
3742       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3743       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3744           .getReg(0);
3745     }
3746 
3747     llvm_unreachable("invalid data type");
3748   }
3749 
3750   return Reg;
3751 }
3752 
3753 Register AMDGPULegalizerInfo::fixStoreSourceType(
3754   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3755   MachineRegisterInfo *MRI = B.getMRI();
3756   LLT Ty = MRI->getType(VData);
3757 
3758   const LLT S16 = LLT::scalar(16);
3759 
3760   // Fix up illegal register types for i8 and i16 stores.
3761   if (Ty == LLT::scalar(8) || Ty == S16) {
3762     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3763     return AnyExt;
3764   }
3765 
3766   if (Ty.isVector()) {
3767     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3768       if (IsFormat)
3769         return handleD16VData(B, *MRI, VData);
3770     }
3771   }
3772 
3773   return VData;
3774 }
3775 
3776 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3777                                               MachineRegisterInfo &MRI,
3778                                               MachineIRBuilder &B,
3779                                               bool IsTyped,
3780                                               bool IsFormat) const {
3781   Register VData = MI.getOperand(1).getReg();
3782   LLT Ty = MRI.getType(VData);
3783   LLT EltTy = Ty.getScalarType();
3784   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3785   const LLT S32 = LLT::scalar(32);
3786 
3787   VData = fixStoreSourceType(B, VData, IsFormat);
3788   Register RSrc = MI.getOperand(2).getReg();
3789 
3790   MachineMemOperand *MMO = *MI.memoperands_begin();
3791   const int MemSize = MMO->getSize();
3792 
3793   unsigned ImmOffset;
3794   unsigned TotalOffset;
3795 
3796   // The typed intrinsics add an immediate after the registers.
3797   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
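       // Intrinsic operand layout (sketch): vdata, rsrc, [vindex,] voffset,
       // soffset, [format,] aux; vindex is present only for the struct variants
       // and format only for the typed variants.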
3798 
3799   // The struct intrinsic variants add one additional operand over raw.
3800   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3801   Register VIndex;
3802   int OpOffset = 0;
3803   if (HasVIndex) {
3804     VIndex = MI.getOperand(3).getReg();
3805     OpOffset = 1;
3806   }
3807 
3808   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3809   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3810 
3811   unsigned Format = 0;
3812   if (IsTyped) {
3813     Format = MI.getOperand(5 + OpOffset).getImm();
3814     ++OpOffset;
3815   }
3816 
3817   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3818 
3819   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3820   if (TotalOffset != 0)
3821     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3822 
3823   unsigned Opc;
3824   if (IsTyped) {
3825     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3826                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3827   } else if (IsFormat) {
3828     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3829                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3830   } else {
3831     switch (MemSize) {
3832     case 1:
3833       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3834       break;
3835     case 2:
3836       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3837       break;
3838     default:
3839       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3840       break;
3841     }
3842   }
3843 
3844   if (!VIndex)
3845     VIndex = B.buildConstant(S32, 0).getReg(0);
3846 
3847   auto MIB = B.buildInstr(Opc)
3848     .addUse(VData)              // vdata
3849     .addUse(RSrc)               // rsrc
3850     .addUse(VIndex)             // vindex
3851     .addUse(VOffset)            // voffset
3852     .addUse(SOffset)            // soffset
3853     .addImm(ImmOffset);         // offset(imm)
3854 
3855   if (IsTyped)
3856     MIB.addImm(Format);
3857 
3858   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3859      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3860      .addMemOperand(MMO);
3861 
3862   MI.eraseFromParent();
3863   return true;
3864 }
3865 
3866 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3867                                              MachineRegisterInfo &MRI,
3868                                              MachineIRBuilder &B,
3869                                              bool IsFormat,
3870                                              bool IsTyped) const {
3871   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3872   MachineMemOperand *MMO = *MI.memoperands_begin();
3873   const int MemSize = MMO->getSize();
3874   const LLT S32 = LLT::scalar(32);
3875 
3876   Register Dst = MI.getOperand(0).getReg();
3877   Register RSrc = MI.getOperand(2).getReg();
3878 
3879   // The typed intrinsics add an immediate after the registers.
3880   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3881 
3882   // The struct intrinsic variants add one additional operand over raw.
3883   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3884   Register VIndex;
3885   int OpOffset = 0;
3886   if (HasVIndex) {
3887     VIndex = MI.getOperand(3).getReg();
3888     OpOffset = 1;
3889   }
3890 
3891   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3892   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3893 
3894   unsigned Format = 0;
3895   if (IsTyped) {
3896     Format = MI.getOperand(5 + OpOffset).getImm();
3897     ++OpOffset;
3898   }
3899 
3900   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3901   unsigned ImmOffset;
3902   unsigned TotalOffset;
3903 
3904   LLT Ty = MRI.getType(Dst);
3905   LLT EltTy = Ty.getScalarType();
3906   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3907   const bool Unpacked = ST.hasUnpackedD16VMem();
3908 
3909   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3910   if (TotalOffset != 0)
3911     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3912 
3913   unsigned Opc;
3914 
3915   if (IsTyped) {
3916     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3917                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3918   } else if (IsFormat) {
3919     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3920                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3921   } else {
3922     switch (MemSize) {
3923     case 1:
3924       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3925       break;
3926     case 2:
3927       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3928       break;
3929     default:
3930       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3931       break;
3932     }
3933   }
3934 
3935   Register LoadDstReg;
3936 
3937   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3938   LLT UnpackedTy = Ty.changeElementSize(32);
3939 
3940   if (IsExtLoad)
3941     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3942   else if (Unpacked && IsD16 && Ty.isVector())
3943     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3944   else
3945     LoadDstReg = Dst;
3946 
3947   if (!VIndex)
3948     VIndex = B.buildConstant(S32, 0).getReg(0);
3949 
3950   auto MIB = B.buildInstr(Opc)
3951     .addDef(LoadDstReg)         // vdata
3952     .addUse(RSrc)               // rsrc
3953     .addUse(VIndex)             // vindex
3954     .addUse(VOffset)            // voffset
3955     .addUse(SOffset)            // soffset
3956     .addImm(ImmOffset);         // offset(imm)
3957 
3958   if (IsTyped)
3959     MIB.addImm(Format);
3960 
3961   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3962      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3963      .addMemOperand(MMO);
3964 
3965   if (LoadDstReg != Dst) {
3966     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3967 
3968     // The result for an extending load was widened; truncate it back down.
3969     if (IsExtLoad)
3970       B.buildTrunc(Dst, LoadDstReg);
3971     else {
3972       // Repack to original 16-bit vector result
3973       // FIXME: G_TRUNC should work, but legalization currently fails
3974       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3975       SmallVector<Register, 4> Repack;
3976       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3977         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3978       B.buildMerge(Dst, Repack);
3979     }
3980   }
3981 
3982   MI.eraseFromParent();
3983   return true;
3984 }
3985 
3986 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3987                                                MachineIRBuilder &B,
3988                                                bool IsInc) const {
3989   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3990                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3991   B.buildInstr(Opc)
3992     .addDef(MI.getOperand(0).getReg())
3993     .addUse(MI.getOperand(2).getReg())
3994     .addUse(MI.getOperand(3).getReg())
3995     .cloneMemRefs(MI);
3996   MI.eraseFromParent();
3997   return true;
3998 }
3999 
4000 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4001   switch (IntrID) {
4002   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4003   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4004     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4005   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4006   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4007     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4008   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4009   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4011   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4012   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4013     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4014   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4015   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4016     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4017   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4019     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4020   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4021   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4022     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4023   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4024   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4025     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4026   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4027   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4028     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4029   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4031     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4032   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4033   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4034     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4035   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4037     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4038   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4039   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4040     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4041   case Intrinsic::amdgcn_buffer_atomic_fadd:
4042   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4043   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4044     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4045   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4046   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4047     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4048   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4049   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4050     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4051   default:
4052     llvm_unreachable("unhandled atomic opcode");
4053   }
4054 }
4055 
4056 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4057                                                MachineIRBuilder &B,
4058                                                Intrinsic::ID IID) const {
4059   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4060                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4061   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4062 
4063   Register Dst;
4064 
4065   int OpOffset = 0;
4066   if (HasReturn) {
4067     // A few FP atomics do not support return values.
4068     Dst = MI.getOperand(0).getReg();
4069   } else {
4070     OpOffset = -1;
4071   }
4072 
4073   Register VData = MI.getOperand(2 + OpOffset).getReg();
4074   Register CmpVal;
4075 
4076   if (IsCmpSwap) {
4077     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4078     ++OpOffset;
4079   }
4080 
4081   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4082   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4083 
4084   // The struct intrinsic variants add one additional operand over raw.
4085   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4086   Register VIndex;
4087   if (HasVIndex) {
4088     VIndex = MI.getOperand(4 + OpOffset).getReg();
4089     ++OpOffset;
4090   }
4091 
4092   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4093   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4094   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4095 
4096   MachineMemOperand *MMO = *MI.memoperands_begin();
4097 
4098   unsigned ImmOffset;
4099   unsigned TotalOffset;
4100   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4101   if (TotalOffset != 0)
4102     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4103 
4104   if (!VIndex)
4105     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4106 
4107   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4108 
4109   if (HasReturn)
4110     MIB.addDef(Dst);
4111 
4112   MIB.addUse(VData); // vdata
4113 
4114   if (IsCmpSwap)
4115     MIB.addReg(CmpVal);
4116 
4117   MIB.addUse(RSrc)               // rsrc
4118      .addUse(VIndex)             // vindex
4119      .addUse(VOffset)            // voffset
4120      .addUse(SOffset)            // soffset
4121      .addImm(ImmOffset)          // offset(imm)
4122      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4123      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4124      .addMemOperand(MMO);
4125 
4126   MI.eraseFromParent();
4127   return true;
4128 }
4129 
4130 /// Pack the s16 typed address operands of \p MI into dword sized registers
4131 /// (<2 x s16> vectors) and append them to \p PackedAddrs.
4132 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4133                                       SmallVectorImpl<Register> &PackedAddrs,
4134                                       unsigned ArgOffset,
4135                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4136                                       bool IsA16, bool IsG16) {
4137   const LLT S16 = LLT::scalar(16);
4138   const LLT V2S16 = LLT::fixed_vector(2, 16);
4139   auto EndIdx = Intr->VAddrEnd;
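       // e.g. (a sketch) a 2D A16 sample packs its (u, v) coordinates into a
       // single <2 x s16> register; a trailing odd coordinate is paired with
       // undef below.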
4140 
4141   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4142     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4143     if (!SrcOp.isReg())
4144       continue; // _L to _LZ may have eliminated this.
4145 
4146     Register AddrReg = SrcOp.getReg();
4147 
4148     if (I < Intr->GradientStart) {
4149       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4150       PackedAddrs.push_back(AddrReg);
4151     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4152                (I >= Intr->CoordStart && !IsA16)) {
4153       // Handle any gradient or coordinate operands that should not be packed
4154       PackedAddrs.push_back(AddrReg);
4155     } else {
4156       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4157       // derivatives dx/dh and dx/dv are packed with undef.
4158       if (((I + 1) >= EndIdx) ||
4159           ((Intr->NumGradients / 2) % 2 == 1 &&
4160            (I == static_cast<unsigned>(Intr->GradientStart +
4161                                        (Intr->NumGradients / 2) - 1) ||
4162             I == static_cast<unsigned>(Intr->GradientStart +
4163                                        Intr->NumGradients - 1))) ||
4164           // Check for _L to _LZ optimization
4165           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4166         PackedAddrs.push_back(
4167             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4168                 .getReg(0));
4169       } else {
4170         PackedAddrs.push_back(
4171             B.buildBuildVector(
4172                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4173                 .getReg(0));
4174         ++I;
4175       }
4176     }
4177   }
4178 }
4179 
4180 /// Convert from separate vaddr components to a single vector address register,
4181 /// and replace the remaining operands with $noreg.
4182 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4183                                      int DimIdx, int NumVAddrs) {
4184   const LLT S32 = LLT::scalar(32);
4185 
4186   SmallVector<Register, 8> AddrRegs;
4187   for (int I = 0; I != NumVAddrs; ++I) {
4188     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4189     if (SrcOp.isReg()) {
4190       AddrRegs.push_back(SrcOp.getReg());
4191       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4192     }
4193   }
4194 
4195   int NumAddrRegs = AddrRegs.size();
4196   if (NumAddrRegs != 1) {
4197     // Round up to 8 elements for v5-v7
4198     // FIXME: Missing intermediate sized register classes and instructions.
4199     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4200       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4201       auto Undef = B.buildUndef(S32);
4202       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4203       NumAddrRegs = RoundedNumRegs;
4204     }
4205 
4206     auto VAddr =
4207         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4208     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4209   }
4210 
4211   for (int I = 1; I != NumVAddrs; ++I) {
4212     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4213     if (SrcOp.isReg())
4214       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4215   }
4216 }
4217 
4218 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4219 ///
4220 /// Depending on the subtarget, load/store with 16-bit element data need to be
4221 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4222 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4223 /// registers.
4224 ///
4225 /// We don't want to directly select image instructions just yet, but we also
4226 /// want to expose all register repacking to the legalizer/combiners. We also
4227 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4228 /// defining a multitude of intermediate image instructions, directly hack on
4229 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4230 /// now-unnecessary arguments with $noreg.
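     /// For example (a sketch): with a16 addresses, pairs of s16 coordinates are
     /// packed into <2 x s16> registers and the operands they came from are
     /// replaced with $noreg.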
4231 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4232     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4233     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4234 
4235   const unsigned NumDefs = MI.getNumExplicitDefs();
4236   const unsigned ArgOffset = NumDefs + 1;
4237   bool IsTFE = NumDefs == 2;
4238   // We are only processing the operands of d16 image operations on subtargets
4239   // that use the unpacked register layout, or need to repack the TFE result.
4240 
4241   // TODO: Do we need to guard against already legalized intrinsics?
4242   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4243       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4244 
4245   MachineRegisterInfo *MRI = B.getMRI();
4246   const LLT S32 = LLT::scalar(32);
4247   const LLT S16 = LLT::scalar(16);
4248   const LLT V2S16 = LLT::fixed_vector(2, 16);
4249 
4250   unsigned DMask = 0;
4251 
4252   // Check for 16 bit addresses and pack if true.
4253   LLT GradTy =
4254       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4255   LLT AddrTy =
4256       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4257   const bool IsG16 = GradTy == S16;
4258   const bool IsA16 = AddrTy == S16;
4259 
4260   int DMaskLanes = 0;
4261   if (!BaseOpcode->Atomic) {
4262     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4263     if (BaseOpcode->Gather4) {
4264       DMaskLanes = 4;
4265     } else if (DMask != 0) {
4266       DMaskLanes = countPopulation(DMask);
4267     } else if (!IsTFE && !BaseOpcode->Store) {
4268       // If dmask is 0, this is a no-op load. This can be eliminated.
4269       B.buildUndef(MI.getOperand(0));
4270       MI.eraseFromParent();
4271       return true;
4272     }
4273   }
4274 
4275   Observer.changingInstr(MI);
4276   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4277 
4278   unsigned NewOpcode = NumDefs == 0 ?
4279     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4280 
4281   // Track that we legalized this
4282   MI.setDesc(B.getTII().get(NewOpcode));
4283 
  // We expect to get an error flag since TFE is enabled but dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4286   if (IsTFE && DMask == 0) {
4287     DMask = 0x1;
4288     DMaskLanes = 1;
4289     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4290   }
4291 
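  // Atomics with two data operands (AtomicX2, e.g. cmpswap) pack both values
  // into a single two-element vector register and clear the second operand to
  // $noreg. Vector data types are not handled yet.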
4292   if (BaseOpcode->Atomic) {
4293     Register VData0 = MI.getOperand(2).getReg();
4294     LLT Ty = MRI->getType(VData0);
4295 
4296     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4297     if (Ty.isVector())
4298       return false;
4299 
4300     if (BaseOpcode->AtomicX2) {
4301       Register VData1 = MI.getOperand(3).getReg();
4302       // The two values are packed in one register.
4303       LLT PackedTy = LLT::fixed_vector(2, Ty);
4304       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4305       MI.getOperand(2).setReg(Concat.getReg(0));
4306       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4307     }
4308   }
4309 
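  // Track how many vaddr operands remain after the LOD/MIP folding below;
  // this count feeds the NSA encoding decision for the non-16-bit case.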
4310   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4311 
  // Optimize _L to _LZ when the LOD is a constant zero (or negative, which is
  // treated the same way).
4313   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4314           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4315     const ConstantFP *ConstantLod;
4316 
4317     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4318                  m_GFCst(ConstantLod))) {
4319       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4320         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4321         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4322             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4323                                                       Intr->Dim);
4324 
4325         // The starting indexes should remain in the same place.
4326         --CorrectedNumVAddrs;
4327 
4328         MI.getOperand(MI.getNumExplicitDefs())
4329             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4330         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4331         Intr = NewImageDimIntr;
4332       }
4333     }
4334   }
4335 
  // Optimize _mip away when 'lod' is zero.
4337   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4338     int64_t ConstantLod;
4339     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4340                  m_ICst(ConstantLod))) {
4341       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4344         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4345         --CorrectedNumVAddrs;
4346       }
4347     }
4348   }
4349 
4350   // Rewrite the addressing register layout before doing anything else.
4351   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both the gradients and the addresses must be 16-bit.
4354     return false;
4355   }
4356 
4357   if (IsA16 && !ST.hasA16()) {
4358     // A16 not supported
4359     return false;
4360   }
4361 
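  // With 16-bit addresses or gradients, the address operands are first packed
  // into 32-bit dwords. When the NSA encoding is not used, the packed dwords
  // are then concatenated into a single wide vector register.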
4362   if (IsA16 || IsG16) {
4363     if (Intr->NumVAddrs > 1) {
4364       SmallVector<Register, 4> PackedRegs;
4365 
4366       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4367                                 IsG16);
4368 
4369       // See also below in the non-a16 branch
4370       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4371 
4372       if (!UseNSA && PackedRegs.size() > 1) {
4373         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4374         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4375         PackedRegs[0] = Concat.getReg(0);
4376         PackedRegs.resize(1);
4377       }
4378 
4379       const unsigned NumPacked = PackedRegs.size();
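      // Rewrite the vaddr operands to use the packed registers, clearing any
      // leftover operands to $noreg.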
4380       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4381         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4382         if (!SrcOp.isReg()) {
4383           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4384           continue;
4385         }
4386 
4387         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4388 
4389         if (I - Intr->VAddrStart < NumPacked)
4390           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4391         else
4392           SrcOp.setReg(AMDGPU::NoRegister);
4393       }
4394     }
4395   } else {
4396     // If the register allocator cannot place the address registers contiguously
4397     // without introducing moves, then using the non-sequential address encoding
4398     // is always preferable, since it saves VALU instructions and is usually a
4399     // wash in terms of code size or even better.
4400     //
4401     // However, we currently have no way of hinting to the register allocator
4402     // that MIMG addresses should be placed contiguously when it is possible to
4403     // do so, so force non-NSA for the common 2-address case as a heuristic.
4404     //
4405     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4406     // allocation when possible.
4407     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4408 
4409     if (!UseNSA && Intr->NumVAddrs > 1)
4410       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4411                                Intr->NumVAddrs);
4412   }
4413 
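  // Record whether A16/G16 is in use as an immediate flag operand on the new
  // pseudo: bit 0 = A16, bit 1 = G16.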
4414   int Flags = 0;
4415   if (IsA16)
4416     Flags |= 1;
4417   if (IsG16)
4418     Flags |= 2;
4419   MI.addOperand(MachineOperand::CreateImm(Flags));
4420 
4421   if (BaseOpcode->Store) { // No TFE for stores?
4422     // TODO: Handle dmask trim
4423     Register VData = MI.getOperand(1).getReg();
4424     LLT Ty = MRI->getType(VData);
4425     if (!Ty.isVector() || Ty.getElementType() != S16)
4426       return true;
4427 
4428     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4429     if (RepackedReg != VData) {
4430       MI.getOperand(1).setReg(RepackedReg);
4431     }
4432 
4433     return true;
4434   }
4435 
4436   Register DstReg = MI.getOperand(0).getReg();
4437   LLT Ty = MRI->getType(DstReg);
4438   const LLT EltTy = Ty.getScalarType();
4439   const bool IsD16 = Ty.getScalarType() == S16;
4440   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4441 
4442   // Confirm that the return type is large enough for the dmask specified
4443   if (NumElts < DMaskLanes)
4444     return false;
4445 
4446   if (NumElts > 4 || DMaskLanes > 4)
4447     return false;
4448 
4449   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4450   const LLT AdjustedTy =
4451       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4452 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4456   LLT RoundedTy;
4457 
  // S32 vector to cover all data, plus the TFE result element.
4459   LLT TFETy;
4460 
4461   // Register type to use for each loaded component. Will be S32 or V2S16.
4462   LLT RegTy;
4463 
4464   if (IsD16 && ST.hasUnpackedD16VMem()) {
4465     RoundedTy =
4466         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4467     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4468     RegTy = S32;
4469   } else {
4470     unsigned EltSize = EltTy.getSizeInBits();
4471     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4472     unsigned RoundedSize = 32 * RoundedElts;
4473     RoundedTy = LLT::scalarOrVector(
4474         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4475     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4476     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4477   }
4478 
4479   // The return type does not need adjustment.
4480   // TODO: Should we change s16 case to s32 or <2 x s16>?
4481   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4482     return true;
4483 
4484   Register Dst1Reg;
4485 
4486   // Insert after the instruction.
4487   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4488 
4489   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4490   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4491   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4492   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4493 
4494   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4495 
4496   MI.getOperand(0).setReg(NewResultReg);
4497 
4498   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4500   // register, with one additional dword beyond the loaded data. Rewrite the
4501   // return type to use a single register result.
4502 
4503   if (IsTFE) {
4504     Dst1Reg = MI.getOperand(1).getReg();
4505     if (MRI->getType(Dst1Reg) != S32)
4506       return false;
4507 
4508     // TODO: Make sure the TFE operand bit is set.
4509     MI.RemoveOperand(1);
4510 
4511     // Handle the easy case that requires no repack instructions.
4512     if (Ty == S32) {
4513       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4514       return true;
4515     }
4516   }
4517 
4518   // Now figure out how to copy the new result register back into the old
4519   // result.
4520   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4521 
4522   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4523 
4524   if (ResultNumRegs == 1) {
4525     assert(!IsTFE);
4526     ResultRegs[0] = NewResultReg;
4527   } else {
4528     // We have to repack into a new vector of some kind.
4529     for (int I = 0; I != NumDataRegs; ++I)
4530       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4531     B.buildUnmerge(ResultRegs, NewResultReg);
4532 
4533     // Drop the final TFE element to get the data part. The TFE result is
4534     // directly written to the right place already.
4535     if (IsTFE)
4536       ResultRegs.resize(NumDataRegs);
4537   }
4538 
4539   // For an s16 scalar result, we form an s32 result with a truncate regardless
4540   // of packed vs. unpacked.
4541   if (IsD16 && !Ty.isVector()) {
4542     B.buildTrunc(DstReg, ResultRegs[0]);
4543     return true;
4544   }
4545 
4546   // Avoid a build/concat_vector of 1 entry.
4547   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4548     B.buildBitcast(DstReg, ResultRegs[0]);
4549     return true;
4550   }
4551 
4552   assert(Ty.isVector());
4553 
4554   if (IsD16) {
4555     // For packed D16 results with TFE enabled, all the data components are
4556     // S32. Cast back to the expected type.
4557     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 were used.
4560     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4561       for (Register &Reg : ResultRegs)
4562         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4563     } else if (ST.hasUnpackedD16VMem()) {
4564       for (Register &Reg : ResultRegs)
4565         Reg = B.buildTrunc(S16, Reg).getReg(0);
4566     }
4567   }
4568 
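  // Append NumElts undef registers of type Ty to ResultRegs.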
4569   auto padWithUndef = [&](LLT Ty, int NumElts) {
4570     if (NumElts == 0)
4571       return;
4572     Register Undef = B.buildUndef(Ty).getReg(0);
4573     for (int I = 0; I != NumElts; ++I)
4574       ResultRegs.push_back(Undef);
4575   };
4576 
4577   // Pad out any elements eliminated due to the dmask.
4578   LLT ResTy = MRI->getType(ResultRegs[0]);
4579   if (!ResTy.isVector()) {
4580     padWithUndef(ResTy, NumElts - ResultRegs.size());
4581     B.buildBuildVector(DstReg, ResultRegs);
4582     return true;
4583   }
4584 
4585   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4586   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4587 
  // Deal with the one awkward legal case: a <3 x s16> result is padded and
  // concatenated to <6 x s16>, then split back into the real result plus a
  // dead <3 x s16> remainder.
4589   const LLT V3S16 = LLT::fixed_vector(3, 16);
4590   if (Ty == V3S16) {
4591     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4592     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4593     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4594     return true;
4595   }
4596 
4597   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4598   B.buildConcatVectors(DstReg, ResultRegs);
4599   return true;
4600 }
4601 
4602 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4603   LegalizerHelper &Helper, MachineInstr &MI) const {
4604   MachineIRBuilder &B = Helper.MIRBuilder;
4605   GISelChangeObserver &Observer = Helper.Observer;
4606 
4607   Register Dst = MI.getOperand(0).getReg();
4608   LLT Ty = B.getMRI()->getType(Dst);
4609   unsigned Size = Ty.getSizeInBits();
4610   MachineFunction &MF = B.getMF();
4611 
4612   Observer.changingInstr(MI);
4613 
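  // Cast to an equivalently sized register type that is better supported for
  // the load, if needed (see shouldBitcastLoadStoreType).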
4614   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4615     Ty = getBitcastRegisterType(Ty);
4616     Helper.bitcastDst(MI, Ty, 0);
4617     Dst = MI.getOperand(0).getReg();
4618     B.setInsertPt(B.getMBB(), MI);
4619   }
4620 
4621   // FIXME: We don't really need this intermediate instruction. The intrinsic
4622   // should be fixed to have a memory operand. Since it's readnone, we're not
4623   // allowed to add one.
4624   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4625   MI.RemoveOperand(1); // Remove intrinsic ID
4626 
4627   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4628   // TODO: Should this use datalayout alignment?
4629   const unsigned MemSize = (Size + 7) / 8;
4630   const Align MemAlign(4);
4631   MachineMemOperand *MMO = MF.getMachineMemOperand(
4632       MachinePointerInfo(),
4633       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4634           MachineMemOperand::MOInvariant,
4635       MemSize, MemAlign);
4636   MI.addMemOperand(MF, MMO);
4637 
4638   // There are no 96-bit result scalar loads, but widening to 128-bit should
4639   // always be legal. We may need to restore this to a 96-bit result if it turns
4640   // out this needs to be converted to a vector load during RegBankSelect.
4641   if (!isPowerOf2_32(Size)) {
4642     if (Ty.isVector())
4643       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4644     else
4645       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4646   }
4647 
4648   Observer.changedInstr(MI);
4649   return true;
4650 }
4651 
4652 // TODO: Move to selection
4653 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4654                                                 MachineRegisterInfo &MRI,
4655                                                 MachineIRBuilder &B) const {
4656   if (!ST.isTrapHandlerEnabled() ||
4657       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4658     return legalizeTrapEndpgm(MI, MRI, B);
4659 
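  // HSA ABI v2/v3 traps pass the queue pointer to the trap handler in
  // SGPR0_SGPR1; with v4 the doorbell ID can be used directly when the
  // subtarget supports reading it.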
4660   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4661     switch (*HsaAbiVer) {
4662     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4663     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4664       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4665     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4666       return ST.supportsGetDoorbellID() ?
4667           legalizeTrapHsa(MI, MRI, B) :
4668           legalizeTrapHsaQueuePtr(MI, MRI, B);
4669     }
4670   }
4671 
4672   llvm_unreachable("Unknown trap handler");
4673 }
4674 
4675 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4676     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4677   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4678   MI.eraseFromParent();
4679   return true;
4680 }
4681 
4682 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4683     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4684   // Pass queue pointer to trap handler as input, and insert trap instruction
4685   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4686   Register LiveIn =
4687     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4688   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4689     return false;
4690 
4691   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4692   B.buildCopy(SGPR01, LiveIn);
4693   B.buildInstr(AMDGPU::S_TRAP)
4694       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4695       .addReg(SGPR01, RegState::Implicit);
4696 
4697   MI.eraseFromParent();
4698   return true;
4699 }
4700 
4701 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4702     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4703   B.buildInstr(AMDGPU::S_TRAP)
4704       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4705   MI.eraseFromParent();
4706   return true;
4707 }
4708 
4709 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4710     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning instead of emitting a trap.
4713   if (!ST.isTrapHandlerEnabled() ||
4714       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4715     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4716                                      "debugtrap handler not supported",
4717                                      MI.getDebugLoc(), DS_Warning);
4718     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4719     Ctx.diagnose(NoTrap);
4720   } else {
4721     // Insert debug-trap instruction
4722     B.buildInstr(AMDGPU::S_TRAP)
4723         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4724   }
4725 
4726   MI.eraseFromParent();
4727   return true;
4728 }
4729 
4730 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4731                                                MachineIRBuilder &B) const {
4732   MachineRegisterInfo &MRI = *B.getMRI();
4733   const LLT S16 = LLT::scalar(16);
4734   const LLT S32 = LLT::scalar(32);
4735 
4736   Register DstReg = MI.getOperand(0).getReg();
4737   Register NodePtr = MI.getOperand(2).getReg();
4738   Register RayExtent = MI.getOperand(3).getReg();
4739   Register RayOrigin = MI.getOperand(4).getReg();
4740   Register RayDir = MI.getOperand(5).getReg();
4741   Register RayInvDir = MI.getOperand(6).getReg();
4742   Register TDescr = MI.getOperand(7).getReg();
4743 
4744   if (!ST.hasGFX10_AEncoding()) {
4745     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4746                                         "intrinsic not supported on subtarget",
4747                                         MI.getDebugLoc());
4748     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4749     return false;
4750   }
4751 
4752   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4753   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4754   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4755                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4756                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4757                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4758 
4759   SmallVector<Register, 12> Ops;
4760   if (Is64) {
4761     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4762     Ops.push_back(Unmerge.getReg(0));
4763     Ops.push_back(Unmerge.getReg(1));
4764   } else {
4765     Ops.push_back(NodePtr);
4766   }
4767   Ops.push_back(RayExtent);
4768 
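  // Helper to split a four-element vector operand and forward only its x/y/z
  // lanes to the instruction operands.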
4769   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4770     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4771     Ops.push_back(Unmerge.getReg(0));
4772     Ops.push_back(Unmerge.getReg(1));
4773     Ops.push_back(Unmerge.getReg(2));
4774   };
4775 
4776   packLanes(RayOrigin);
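  // With a16, the six 16-bit ray direction and inverse-direction components
  // are repacked into three 32-bit registers.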
4777   if (IsA16) {
4778     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4779     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4780     Register R1 = MRI.createGenericVirtualRegister(S32);
4781     Register R2 = MRI.createGenericVirtualRegister(S32);
4782     Register R3 = MRI.createGenericVirtualRegister(S32);
4783     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4784     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4785     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4786     Ops.push_back(R1);
4787     Ops.push_back(R2);
4788     Ops.push_back(R3);
4789   } else {
4790     packLanes(RayDir);
4791     packLanes(RayInvDir);
4792   }
4793 
4794   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4795     .addDef(DstReg)
4796     .addImm(Opcode);
4797 
4798   for (Register R : Ops) {
4799     MIB.addUse(R);
4800   }
4801 
4802   MIB.addUse(TDescr)
4803      .addImm(IsA16 ? 1 : 0)
4804      .cloneMemRefs(MI);
4805 
4806   MI.eraseFromParent();
4807   return true;
4808 }
4809 
4810 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4811                                             MachineInstr &MI) const {
4812   MachineIRBuilder &B = Helper.MIRBuilder;
4813   MachineRegisterInfo &MRI = *B.getMRI();
4814 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4816   auto IntrID = MI.getIntrinsicID();
4817   switch (IntrID) {
4818   case Intrinsic::amdgcn_if:
4819   case Intrinsic::amdgcn_else: {
4820     MachineInstr *Br = nullptr;
4821     MachineBasicBlock *UncondBrTarget = nullptr;
4822     bool Negated = false;
4823     if (MachineInstr *BrCond =
4824             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4825       const SIRegisterInfo *TRI
4826         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4827 
4828       Register Def = MI.getOperand(1).getReg();
4829       Register Use = MI.getOperand(3).getReg();
4830 
4831       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4832 
4833       if (Negated)
4834         std::swap(CondBrTarget, UncondBrTarget);
4835 
4836       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4837       if (IntrID == Intrinsic::amdgcn_if) {
4838         B.buildInstr(AMDGPU::SI_IF)
4839           .addDef(Def)
4840           .addUse(Use)
4841           .addMBB(UncondBrTarget);
4842       } else {
4843         B.buildInstr(AMDGPU::SI_ELSE)
4844             .addDef(Def)
4845             .addUse(Use)
4846             .addMBB(UncondBrTarget);
4847       }
4848 
4849       if (Br) {
4850         Br->getOperand(0).setMBB(CondBrTarget);
4851       } else {
4852         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4853         // since we're swapping branch targets it needs to be reinserted.
4854         // FIXME: IRTranslator should probably not do this
4855         B.buildBr(*CondBrTarget);
4856       }
4857 
4858       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4859       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4860       MI.eraseFromParent();
4861       BrCond->eraseFromParent();
4862       return true;
4863     }
4864 
4865     return false;
4866   }
4867   case Intrinsic::amdgcn_loop: {
4868     MachineInstr *Br = nullptr;
4869     MachineBasicBlock *UncondBrTarget = nullptr;
4870     bool Negated = false;
4871     if (MachineInstr *BrCond =
4872             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4873       const SIRegisterInfo *TRI
4874         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4875 
4876       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4877       Register Reg = MI.getOperand(2).getReg();
4878 
4879       if (Negated)
4880         std::swap(CondBrTarget, UncondBrTarget);
4881 
4882       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4883       B.buildInstr(AMDGPU::SI_LOOP)
4884         .addUse(Reg)
4885         .addMBB(UncondBrTarget);
4886 
4887       if (Br)
4888         Br->getOperand(0).setMBB(CondBrTarget);
4889       else
4890         B.buildBr(*CondBrTarget);
4891 
4892       MI.eraseFromParent();
4893       BrCond->eraseFromParent();
4894       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4895       return true;
4896     }
4897 
4898     return false;
4899   }
4900   case Intrinsic::amdgcn_kernarg_segment_ptr:
4901     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4902       // This only makes sense to call in a kernel, so just lower to null.
4903       B.buildConstant(MI.getOperand(0).getReg(), 0);
4904       MI.eraseFromParent();
4905       return true;
4906     }
4907 
4908     return legalizePreloadedArgIntrin(
4909       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4910   case Intrinsic::amdgcn_implicitarg_ptr:
4911     return legalizeImplicitArgPtr(MI, MRI, B);
4912   case Intrinsic::amdgcn_workitem_id_x:
4913     return legalizePreloadedArgIntrin(MI, MRI, B,
4914                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4915   case Intrinsic::amdgcn_workitem_id_y:
4916     return legalizePreloadedArgIntrin(MI, MRI, B,
4917                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4918   case Intrinsic::amdgcn_workitem_id_z:
4919     return legalizePreloadedArgIntrin(MI, MRI, B,
4920                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4921   case Intrinsic::amdgcn_workgroup_id_x:
4922     return legalizePreloadedArgIntrin(MI, MRI, B,
4923                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4924   case Intrinsic::amdgcn_workgroup_id_y:
4925     return legalizePreloadedArgIntrin(MI, MRI, B,
4926                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4927   case Intrinsic::amdgcn_workgroup_id_z:
4928     return legalizePreloadedArgIntrin(MI, MRI, B,
4929                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4930   case Intrinsic::amdgcn_dispatch_ptr:
4931     return legalizePreloadedArgIntrin(MI, MRI, B,
4932                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4933   case Intrinsic::amdgcn_queue_ptr:
4934     return legalizePreloadedArgIntrin(MI, MRI, B,
4935                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4936   case Intrinsic::amdgcn_implicit_buffer_ptr:
4937     return legalizePreloadedArgIntrin(
4938       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4939   case Intrinsic::amdgcn_dispatch_id:
4940     return legalizePreloadedArgIntrin(MI, MRI, B,
4941                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4942   case Intrinsic::amdgcn_fdiv_fast:
4943     return legalizeFDIVFastIntrin(MI, MRI, B);
4944   case Intrinsic::amdgcn_is_shared:
4945     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4946   case Intrinsic::amdgcn_is_private:
4947     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4948   case Intrinsic::amdgcn_wavefrontsize: {
4949     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4950     MI.eraseFromParent();
4951     return true;
4952   }
4953   case Intrinsic::amdgcn_s_buffer_load:
4954     return legalizeSBufferLoad(Helper, MI);
4955   case Intrinsic::amdgcn_raw_buffer_store:
4956   case Intrinsic::amdgcn_struct_buffer_store:
4957     return legalizeBufferStore(MI, MRI, B, false, false);
4958   case Intrinsic::amdgcn_raw_buffer_store_format:
4959   case Intrinsic::amdgcn_struct_buffer_store_format:
4960     return legalizeBufferStore(MI, MRI, B, false, true);
4961   case Intrinsic::amdgcn_raw_tbuffer_store:
4962   case Intrinsic::amdgcn_struct_tbuffer_store:
4963     return legalizeBufferStore(MI, MRI, B, true, true);
4964   case Intrinsic::amdgcn_raw_buffer_load:
4965   case Intrinsic::amdgcn_struct_buffer_load:
4966     return legalizeBufferLoad(MI, MRI, B, false, false);
4967   case Intrinsic::amdgcn_raw_buffer_load_format:
4968   case Intrinsic::amdgcn_struct_buffer_load_format:
4969     return legalizeBufferLoad(MI, MRI, B, true, false);
4970   case Intrinsic::amdgcn_raw_tbuffer_load:
4971   case Intrinsic::amdgcn_struct_tbuffer_load:
4972     return legalizeBufferLoad(MI, MRI, B, true, true);
4973   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4974   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4975   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4976   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4977   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4978   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4979   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4980   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4981   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4982   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4985   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4986   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4987   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4988   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4989   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4990   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4991   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4992   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4993   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4994   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4995   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4996   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4997   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4998   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4999   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5000   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5001   case Intrinsic::amdgcn_buffer_atomic_fadd:
5002   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5003   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5004   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5005   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5006     return legalizeBufferAtomic(MI, B, IntrID);
5007   case Intrinsic::amdgcn_atomic_inc:
5008     return legalizeAtomicIncDec(MI, B, true);
5009   case Intrinsic::amdgcn_atomic_dec:
5010     return legalizeAtomicIncDec(MI, B, false);
5011   case Intrinsic::trap:
5012     return legalizeTrapIntrinsic(MI, MRI, B);
5013   case Intrinsic::debugtrap:
5014     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5015   case Intrinsic::amdgcn_rsq_clamp:
5016     return legalizeRsqClampIntrinsic(MI, MRI, B);
5017   case Intrinsic::amdgcn_ds_fadd:
5018   case Intrinsic::amdgcn_ds_fmin:
5019   case Intrinsic::amdgcn_ds_fmax:
5020     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5021   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5022     return legalizeBVHIntrinsic(MI, B);
5023   default: {
5024     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5025             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5026       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5027     return true;
5028   }
5029   }
5030 
5031   return true;
5032 }
5033