1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
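// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.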
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the scalar size in bits up to the next power of two.
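// e.g. s24 -> s32, s48 -> s64.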
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
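/// For example, <5 x s16> also qualifies, while <3 x s32> and <4 x s16> do
/// not.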
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
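// Reduce the vector element count so that each resulting piece is at most 64
// bits, e.g. <4 x s32> is handled as <2 x s32> pieces.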
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
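// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits), <5 x s8> -> <8 x s8>.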
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
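  // Otherwise use 32-bit elements, e.g. <6 x s16> -> <3 x s32>.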
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
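// e.g. s32, v2s16, v4s32, and s1024 are register types; s48 and <3 x s16>
// are not.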
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
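// e.g. an s64 result loaded from 32-bit memory is first narrowed to s32.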
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
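  // e.g. extloads to s32 from 8-bit or 16-bit memory pass this check, but an
  // s64 result from 32-bit memory is rejected.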
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
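// e.g. <6 x s16> and s96 report true; <2 x s32> and s64 do not.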
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
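/// e.g. a <2 x s8> load from 16-bit memory should be bitcast and handled as
/// an s16 access.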
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const unsigned MemSizeInBits) {
365   const unsigned Size = Ty.getSizeInBits();
366   if (Size != MemSizeInBits)
367     return Size <= 32 && Ty.isVector();
368 
369   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
370     return true;
371   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
372          !isRegisterVectorElementType(Ty.getElementType());
373 }
374 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
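/// e.g. a dereferenceable 96-bit load aligned to 128 bits may be widened to a
/// 128-bit load when 96-bit (dwordx3) accesses are unavailable.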
378 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
379                             unsigned AlignInBits, unsigned AddrSpace,
380                             unsigned Opcode) {
381   // We don't want to widen cases that are naturally legal.
382   if (isPowerOf2_32(SizeInBits))
383     return false;
384 
385   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
386   // end up widening these for a scalar load during RegBankSelect, since there
387   // aren't 96-bit scalar loads.
388   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
389     return false;
390 
391   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
392     return false;
393 
394   // A load is known dereferenceable up to the alignment, so it's legal to widen
395   // to it.
396   //
397   // TODO: Could check dereferenceable for less aligned cases.
398   unsigned RoundedSize = NextPowerOf2(SizeInBits);
399   if (AlignInBits < RoundedSize)
400     return false;
401 
402   // Do not widen if it would introduce a slow unaligned load.
403   const SITargetLowering *TLI = ST.getTargetLowering();
404   bool Fast = false;
405   return TLI->allowsMisalignedMemoryAccessesImpl(
406              RoundedSize, AddrSpace, Align(AlignInBits / 8),
407              MachineMemOperand::MOLoad, &Fast) &&
408          Fast;
409 }
410 
411 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
412                             unsigned Opcode) {
413   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
414     return false;
415 
416   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
417                          Query.MMODescrs[0].AlignInBits,
418                          Query.Types[1].getAddressSpace(), Opcode);
419 }
420 
421 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
422                                          const GCNTargetMachine &TM)
423   :  ST(ST_) {
424   using namespace TargetOpcode;
425 
426   auto GetAddrSpacePtr = [&TM](unsigned AS) {
427     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
428   };
429 
430   const LLT S1 = LLT::scalar(1);
431   const LLT S8 = LLT::scalar(8);
432   const LLT S16 = LLT::scalar(16);
433   const LLT S32 = LLT::scalar(32);
434   const LLT S64 = LLT::scalar(64);
435   const LLT S128 = LLT::scalar(128);
436   const LLT S256 = LLT::scalar(256);
437   const LLT S512 = LLT::scalar(512);
438   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
439 
440   const LLT V2S8 = LLT::fixed_vector(2, 8);
441   const LLT V2S16 = LLT::fixed_vector(2, 16);
442   const LLT V4S16 = LLT::fixed_vector(4, 16);
443 
444   const LLT V2S32 = LLT::fixed_vector(2, 32);
445   const LLT V3S32 = LLT::fixed_vector(3, 32);
446   const LLT V4S32 = LLT::fixed_vector(4, 32);
447   const LLT V5S32 = LLT::fixed_vector(5, 32);
448   const LLT V6S32 = LLT::fixed_vector(6, 32);
449   const LLT V7S32 = LLT::fixed_vector(7, 32);
450   const LLT V8S32 = LLT::fixed_vector(8, 32);
451   const LLT V9S32 = LLT::fixed_vector(9, 32);
452   const LLT V10S32 = LLT::fixed_vector(10, 32);
453   const LLT V11S32 = LLT::fixed_vector(11, 32);
454   const LLT V12S32 = LLT::fixed_vector(12, 32);
455   const LLT V13S32 = LLT::fixed_vector(13, 32);
456   const LLT V14S32 = LLT::fixed_vector(14, 32);
457   const LLT V15S32 = LLT::fixed_vector(15, 32);
458   const LLT V16S32 = LLT::fixed_vector(16, 32);
459   const LLT V32S32 = LLT::fixed_vector(32, 32);
460 
461   const LLT V2S64 = LLT::fixed_vector(2, 64);
462   const LLT V3S64 = LLT::fixed_vector(3, 64);
463   const LLT V4S64 = LLT::fixed_vector(4, 64);
464   const LLT V5S64 = LLT::fixed_vector(5, 64);
465   const LLT V6S64 = LLT::fixed_vector(6, 64);
466   const LLT V7S64 = LLT::fixed_vector(7, 64);
467   const LLT V8S64 = LLT::fixed_vector(8, 64);
468   const LLT V16S64 = LLT::fixed_vector(16, 64);
469 
470   std::initializer_list<LLT> AllS32Vectors =
471     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
472      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
473   std::initializer_list<LLT> AllS64Vectors =
474     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
475 
476   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
477   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
478   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
479   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
480   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
481   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
482   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
483 
484   const LLT CodePtr = FlatPtr;
485 
486   const std::initializer_list<LLT> AddrSpaces64 = {
487     GlobalPtr, ConstantPtr, FlatPtr
488   };
489 
490   const std::initializer_list<LLT> AddrSpaces32 = {
491     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
492   };
493 
494   const std::initializer_list<LLT> FPTypesBase = {
495     S32, S64
496   };
497 
498   const std::initializer_list<LLT> FPTypes16 = {
499     S32, S64, S16
500   };
501 
502   const std::initializer_list<LLT> FPTypesPK16 = {
503     S32, S64, S16, V2S16
504   };
505 
506   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
507 
508   // s1 for VCC branches, s32 for SCC branches.
509   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
510 
511   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
512   // elements for v3s16
513   getActionDefinitionsBuilder(G_PHI)
514     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
515     .legalFor(AllS32Vectors)
516     .legalFor(AllS64Vectors)
517     .legalFor(AddrSpaces64)
518     .legalFor(AddrSpaces32)
519     .legalIf(isPointer(0))
520     .clampScalar(0, S16, S256)
521     .widenScalarToNextPow2(0, 32)
522     .clampMaxNumElements(0, S32, 16)
523     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
524     .scalarize(0);
525 
526   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
527     // Full set of gfx9 features.
528     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
529       .legalFor({S32, S16, V2S16})
530       .clampScalar(0, S16, S32)
531       .clampMaxNumElements(0, S16, 2)
532       .scalarize(0)
533       .widenScalarToNextPow2(0, 32);
534 
535     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
536       .legalFor({S32, S16, V2S16}) // Clamp modifier
537       .minScalarOrElt(0, S16)
538       .clampMaxNumElements(0, S16, 2)
539       .scalarize(0)
540       .widenScalarToNextPow2(0, 32)
541       .lower();
542   } else if (ST.has16BitInsts()) {
543     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
544       .legalFor({S32, S16})
545       .clampScalar(0, S16, S32)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
548 
549     // Technically the saturating operations require clamp bit support, but this
550     // was introduced at the same time as 16-bit operations.
551     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
552       .legalFor({S32, S16}) // Clamp modifier
553       .minScalar(0, S16)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 16)
556       .lower();
557 
558     // We're just lowering this, but it helps get a better result to try to
559     // coerce to the desired type first.
560     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
561       .minScalar(0, S16)
562       .scalarize(0)
563       .lower();
564   } else {
565     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
566       .legalFor({S32})
567       .clampScalar(0, S32, S32)
568       .scalarize(0);
569 
570     if (ST.hasIntClamp()) {
571       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
572         .legalFor({S32}) // Clamp modifier.
573         .scalarize(0)
574         .minScalarOrElt(0, S32)
575         .lower();
576     } else {
577       // Clamp bit support was added in VI, along with 16-bit operations.
578       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
579         .minScalar(0, S32)
580         .scalarize(0)
581         .lower();
582     }
583 
584     // FIXME: DAG expansion gets better results. The widening uses the smaller
585     // range values and goes for the min/max lowering directly.
586     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
587       .minScalar(0, S32)
588       .scalarize(0)
589       .lower();
590   }
591 
592   getActionDefinitionsBuilder(
593       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
594       .customFor({S32, S64})
595       .clampScalar(0, S32, S64)
596       .widenScalarToNextPow2(0, 32)
597       .scalarize(0);
598 
599   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
600                    .legalFor({S32})
601                    .maxScalarOrElt(0, S32);
602 
603   if (ST.hasVOP3PInsts()) {
604     Mulh
605       .clampMaxNumElements(0, S8, 2)
606       .lowerFor({V2S8});
607   }
608 
609   Mulh
610     .scalarize(0)
611     .lower();
612 
613   // Report legal for any types we can handle anywhere. For the cases only legal
614   // on the SALU, RegBankSelect will be able to re-legalize.
615   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
616     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
617     .clampScalar(0, S32, S64)
618     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
619     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
620     .widenScalarToNextPow2(0)
621     .scalarize(0);
622 
623   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
624                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
625     .legalFor({{S32, S1}, {S32, S32}})
626     .minScalar(0, S32)
627     // TODO: .scalarize(0)
628     .lower();
629 
630   getActionDefinitionsBuilder(G_BITCAST)
631     // Don't worry about the size constraint.
632     .legalIf(all(isRegisterType(0), isRegisterType(1)))
633     .lower();
634 
635 
636   getActionDefinitionsBuilder(G_CONSTANT)
637     .legalFor({S1, S32, S64, S16, GlobalPtr,
638                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
639     .legalIf(isPointer(0))
640     .clampScalar(0, S32, S64)
641     .widenScalarToNextPow2(0);
642 
643   getActionDefinitionsBuilder(G_FCONSTANT)
644     .legalFor({S32, S64, S16})
645     .clampScalar(0, S16, S64);
646 
647   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
648       .legalIf(isRegisterType(0))
649       // s1 and s16 are special cases because they have legal operations on
650       // them, but don't really occupy registers in the normal way.
651       .legalFor({S1, S16})
652       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
653       .clampScalarOrElt(0, S32, MaxScalar)
654       .widenScalarToNextPow2(0, 32)
655       .clampMaxNumElements(0, S32, 16);
656 
657   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
658 
659   // If the amount is divergent, we have to do a wave reduction to get the
660   // maximum value, so this is expanded during RegBankSelect.
661   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662     .legalFor({{PrivatePtr, S32}});
663 
664   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665     .customIf(typeIsNot(0, PrivatePtr));
666 
667   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
668 
669   auto &FPOpActions = getActionDefinitionsBuilder(
670     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671     .legalFor({S32, S64});
672   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673     .customFor({S32, S64});
674   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675     .customFor({S32, S64});
676 
677   if (ST.has16BitInsts()) {
678     if (ST.hasVOP3PInsts())
679       FPOpActions.legalFor({S16, V2S16});
680     else
681       FPOpActions.legalFor({S16});
682 
683     TrigActions.customFor({S16});
684     FDIVActions.customFor({S16});
685   }
686 
687   auto &MinNumMaxNum = getActionDefinitionsBuilder({
688       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690   if (ST.hasVOP3PInsts()) {
691     MinNumMaxNum.customFor(FPTypesPK16)
692       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693       .clampMaxNumElements(0, S16, 2)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else if (ST.has16BitInsts()) {
697     MinNumMaxNum.customFor(FPTypes16)
698       .clampScalar(0, S16, S64)
699       .scalarize(0);
700   } else {
701     MinNumMaxNum.customFor(FPTypesBase)
702       .clampScalar(0, S32, S64)
703       .scalarize(0);
704   }
705 
706   if (ST.hasVOP3PInsts())
707     FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709   FPOpActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   TrigActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   FDIVActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   getActionDefinitionsBuilder({G_FNEG, G_FABS})
722     .legalFor(FPTypesPK16)
723     .clampMaxNumElements(0, S16, 2)
724     .scalarize(0)
725     .clampScalar(0, S16, S64);
726 
727   if (ST.has16BitInsts()) {
728     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729       .legalFor({S32, S64, S16})
730       .scalarize(0)
731       .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735       .scalarize(0)
736       .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741         .legalFor({S32, S64})
742         .scalarize(0)
743         .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     }
750   }
751 
752   getActionDefinitionsBuilder(G_FPTRUNC)
753     .legalFor({{S32, S64}, {S16, S32}})
754     .scalarize(0)
755     .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760     .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763       // Use actual fsub instruction
764       .legalFor({S32})
765       // Must use fadd + fneg
766       .lowerFor({S64, S16, V2S16})
767       .scalarize(0)
768       .clampScalar(0, S32, S64);
769 
770   // Whether this is legal depends on the floating point mode for the function.
771   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773     FMad.customFor({S32, S16});
774   else if (ST.hasMadMacF32Insts())
775     FMad.customFor({S32});
776   else if (ST.hasMadF16())
777     FMad.customFor({S16});
778   FMad.scalarize(0)
779       .lower();
780 
781   auto &FRem = getActionDefinitionsBuilder(G_FREM);
782   if (ST.has16BitInsts()) {
783     FRem.customFor({S16, S32, S64});
784   } else {
785     FRem.minScalar(0, S32)
786         .customFor({S32, S64});
787   }
788   FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793     .legalFor({{V2S16, V2S32}})
794     .clampMaxNumElements(0, S16, 2)
795     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796     // situations (like an invalid implicit use), we don't want to infinite loop
797     // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803                {S32, S1}, {S64, S1}, {S16, S1}})
804     .scalarize(0)
805     .clampScalar(0, S32, S64)
806     .widenScalarToNextPow2(1, 32);
807 
808   // TODO: Split s1->s64 during regbankselect for VALU.
809   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811     .lowerFor({{S32, S64}})
812     .lowerIf(typeIs(1, S1))
813     .customFor({{S64, S64}});
814   if (ST.has16BitInsts())
815     IToFP.legalFor({{S16, S16}});
816   IToFP.clampScalar(1, S32, S64)
817        .minScalar(0, S32)
818        .scalarize(0)
819        .widenScalarToNextPow2(1);
820 
821   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823     .customFor({{S64, S32}, {S64, S64}})
824     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825   if (ST.has16BitInsts())
826     FPToI.legalFor({{S16, S16}});
827   else
828     FPToI.minScalar(1, S32);
829 
830   FPToI.minScalar(0, S32)
831        .widenScalarToNextPow2(0, 32)
832        .scalarize(0)
833        .lower();
834 
835   // Lower roundeven into G_FRINT
836   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837     .scalarize(0)
838     .lower();
839 
840   if (ST.has16BitInsts()) {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S16, S32, S64})
843       .clampScalar(0, S16, S64)
844       .scalarize(0);
845   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S32, S64})
848       .clampScalar(0, S32, S64)
849       .scalarize(0);
850   } else {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32})
853       .customFor({S64})
854       .clampScalar(0, S32, S64)
855       .scalarize(0);
856   }
857 
858   getActionDefinitionsBuilder(G_PTR_ADD)
859     .legalIf(all(isPointer(0), sameSize(0, 1)))
860     .scalarize(0)
861     .scalarSameSizeAs(1, 0);
862 
863   getActionDefinitionsBuilder(G_PTRMASK)
864     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865     .scalarSameSizeAs(1, 0)
866     .scalarize(0);
867 
868   auto &CmpBuilder =
869     getActionDefinitionsBuilder(G_ICMP)
870     // The compare output type differs based on the register bank of the output,
871     // so make both s1 and s32 legal.
872     //
873     // Scalar compares producing output in scc will be promoted to s32, as that
874     // is the allocatable register type that will be needed for the copy from
875     // scc. This will be promoted during RegBankSelect, and we assume something
876     // before that won't try to use s32 result types.
877     //
878     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879     // bank.
880     .legalForCartesianProduct(
881       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882     .legalForCartesianProduct(
883       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884   if (ST.has16BitInsts()) {
885     CmpBuilder.legalFor({{S1, S16}});
886   }
887 
888   CmpBuilder
889     .widenScalarToNextPow2(1)
890     .clampScalar(1, S32, S64)
891     .scalarize(0)
892     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894   getActionDefinitionsBuilder(G_FCMP)
895     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0);
899 
900   // FIXME: fpow has a selection pattern that should move to custom lowering.
901   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902   if (ST.has16BitInsts())
903     Exp2Ops.legalFor({S32, S16});
904   else
905     Exp2Ops.legalFor({S32});
906   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907   Exp2Ops.scalarize(0);
908 
909   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910   if (ST.has16BitInsts())
911     ExpOps.customFor({{S32}, {S16}});
912   else
913     ExpOps.customFor({S32});
914   ExpOps.clampScalar(0, MinScalarFPTy, S32)
915         .scalarize(0);
916 
917   getActionDefinitionsBuilder(G_FPOWI)
918     .clampScalar(0, MinScalarFPTy, S32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder(G_CTPOP)
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   // The hardware instructions return a different result on 0 than the generic
931   // instructions expect. The hardware produces -1, but these produce the
932   // bitwidth.
933   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934     .scalarize(0)
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32)
939     .lower();
940 
941   // The 64-bit versions produce 32-bit results, but only on the SALU.
942   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943     .legalFor({{S32, S32}, {S32, S64}})
944     .clampScalar(0, S32, S32)
945     .clampScalar(1, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0, 32)
948     .widenScalarToNextPow2(1, 32);
949 
950   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951   // RegBankSelect.
952   getActionDefinitionsBuilder(G_BITREVERSE)
953     .legalFor({S32, S64})
954     .clampScalar(0, S32, S64)
955     .scalarize(0)
956     .widenScalarToNextPow2(0);
957 
958   if (ST.has16BitInsts()) {
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S16, S32, V2S16})
961       .clampMaxNumElements(0, S16, 2)
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .clampScalar(0, S16, S32)
966       .scalarize(0);
967 
968     if (ST.hasVOP3PInsts()) {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970         .legalFor({S32, S16, V2S16})
971         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972         .clampMaxNumElements(0, S16, 2)
973         .minScalar(0, S16)
974         .widenScalarToNextPow2(0)
975         .scalarize(0)
976         .lower();
977     } else {
978       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979         .legalFor({S32, S16})
980         .widenScalarToNextPow2(0)
981         .minScalar(0, S16)
982         .scalarize(0)
983         .lower();
984     }
985   } else {
986     // TODO: Should have same legality without v_perm_b32
987     getActionDefinitionsBuilder(G_BSWAP)
988       .legalFor({S32})
989       .lowerIf(scalarNarrowerThan(0, 32))
990       // FIXME: Fixing non-power-of-2 before clamp is workaround for
991       // narrowScalar limitation.
992       .widenScalarToNextPow2(0)
993       .maxScalar(0, S32)
994       .scalarize(0)
995       .lower();
996 
997     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998       .legalFor({S32})
999       .minScalar(0, S32)
1000       .widenScalarToNextPow2(0)
1001       .scalarize(0)
1002       .lower();
1003   }
1004 
1005   getActionDefinitionsBuilder(G_INTTOPTR)
1006     // List the common cases
1007     .legalForCartesianProduct(AddrSpaces64, {S64})
1008     .legalForCartesianProduct(AddrSpaces32, {S32})
1009     .scalarize(0)
1010     // Accept any address space as long as the size matches
1011     .legalIf(sameSize(0, 1))
1012     .widenScalarIf(smallerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       })
1016     .narrowScalarIf(largerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       });
1020 
1021   getActionDefinitionsBuilder(G_PTRTOINT)
1022     // List the common cases
1023     .legalForCartesianProduct(AddrSpaces64, {S64})
1024     .legalForCartesianProduct(AddrSpaces32, {S32})
1025     .scalarize(0)
1026     // Accept any address space as long as the size matches
1027     .legalIf(sameSize(0, 1))
1028     .widenScalarIf(smallerThan(0, 1),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031       })
1032     .narrowScalarIf(
1033       largerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       });
1037 
1038   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039     .scalarize(0)
1040     .custom();
1041 
1042   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043                                     bool IsLoad) -> bool {
1044     const LLT DstTy = Query.Types[0];
1045 
1046     // Split vector extloads.
1047     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050     if (MemSize < DstTy.getSizeInBits())
1051       MemSize = std::max(MemSize, AlignBits);
1052 
1053     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054       return true;
1055 
1056     const LLT PtrTy = Query.Types[1];
1057     unsigned AS = PtrTy.getAddressSpace();
1058     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059       return true;
1060 
1061     // Catch weird sized loads that don't evenly divide into the access sizes
1062     // TODO: May be able to widen depending on alignment etc.
1063     unsigned NumRegs = (MemSize + 31) / 32;
1064     if (NumRegs == 3) {
1065       if (!ST.hasDwordx3LoadStores())
1066         return true;
1067     } else {
1068       // If the alignment allows, these should have been widened.
1069       if (!isPowerOf2_32(NumRegs))
1070         return true;
1071     }
1072 
1073     if (AlignBits < MemSize) {
1074       const SITargetLowering *TLI = ST.getTargetLowering();
1075       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076                                                       Align(AlignBits / 8));
1077     }
1078 
1079     return false;
1080   };
1081 
1082   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085 
1086   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087   // LDS
1088   // TODO: Unsupported flat for SI.
1089 
1090   for (unsigned Op : {G_LOAD, G_STORE}) {
1091     const bool IsStore = Op == G_STORE;
1092 
1093     auto &Actions = getActionDefinitionsBuilder(Op);
1094     // Explicitly list some common cases.
1095     // TODO: Does this help compile time at all?
1096     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1098                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1099                                       {S64, GlobalPtr, 64, GlobalAlign32},
1100                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1101                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1102                                       {S32, GlobalPtr, 8, GlobalAlign8},
1103                                       {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105                                       {S32, LocalPtr, 32, 32},
1106                                       {S64, LocalPtr, 64, 32},
1107                                       {V2S32, LocalPtr, 64, 32},
1108                                       {S32, LocalPtr, 8, 8},
1109                                       {S32, LocalPtr, 16, 16},
1110                                       {V2S16, LocalPtr, 32, 32},
1111 
1112                                       {S32, PrivatePtr, 32, 32},
1113                                       {S32, PrivatePtr, 8, 8},
1114                                       {S32, PrivatePtr, 16, 16},
1115                                       {V2S16, PrivatePtr, 32, 32},
1116 
1117                                       {S32, ConstantPtr, 32, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1119                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1120                                       {S64, ConstantPtr, 64, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122     Actions.legalIf(
1123       [=](const LegalityQuery &Query) -> bool {
1124         return isLoadStoreLegal(ST, Query);
1125       });
1126 
1127     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128     // 64-bits.
1129     //
1130     // TODO: Should generalize bitcast action into coerce, which will also cover
1131     // inserting addrspacecasts.
1132     Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134     // Turn any illegal element vectors into something easier to deal
1135     // with. These will ultimately produce 32-bit scalar shifts to extract the
1136     // parts anyway.
1137     //
1138     // For odd 16-bit element vectors, prefer to split those into pieces with
1139     // 16-bit vector parts.
1140     Actions.bitcastIf(
1141       [=](const LegalityQuery &Query) -> bool {
1142         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143                                           Query.MMODescrs[0].SizeInBits);
1144       }, bitcastToRegisterType(0));
1145 
1146     if (!IsStore) {
1147       // Widen suitably aligned loads by loading extra bytes. The standard
1148       // legalization actions can't properly express widening memory operands.
1149       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150         return shouldWidenLoad(ST, Query, G_LOAD);
1151       });
1152     }
1153 
1154     // FIXME: load/store narrowing should be moved to lower action
1155     Actions
1156         .narrowScalarIf(
1157             [=](const LegalityQuery &Query) -> bool {
1158               return !Query.Types[0].isVector() &&
1159                      needToSplitMemOp(Query, Op == G_LOAD);
1160             },
1161             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162               const LLT DstTy = Query.Types[0];
1163               const LLT PtrTy = Query.Types[1];
1164 
1165               const unsigned DstSize = DstTy.getSizeInBits();
1166               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168               // Split extloads.
1169               if (DstSize > MemSize)
1170                 return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172               if (!isPowerOf2_32(DstSize)) {
1173                 // We're probably decomposing an odd sized store. Try to split
1174                 // to the widest type. TODO: Account for alignment. As-is it
1175                 // should be OK, since the new parts will be further legalized.
1176                 unsigned FloorSize = PowerOf2Floor(DstSize);
1177                 return std::make_pair(0, LLT::scalar(FloorSize));
1178               }
1179 
1180               if (DstSize > 32 && (DstSize % 32 != 0)) {
1181                 // FIXME: Need a way to specify non-extload of larger size if
1182                 // suitably aligned.
1183                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184               }
1185 
1186               unsigned MaxSize = maxSizeForAddrSpace(ST,
1187                                                      PtrTy.getAddressSpace(),
1188                                                      Op == G_LOAD);
1189               if (MemSize > MaxSize)
1190                 return std::make_pair(0, LLT::scalar(MaxSize));
1191 
1192               unsigned Align = Query.MMODescrs[0].AlignInBits;
1193               return std::make_pair(0, LLT::scalar(Align));
1194             })
1195         .fewerElementsIf(
1196             [=](const LegalityQuery &Query) -> bool {
1197               return Query.Types[0].isVector() &&
1198                      needToSplitMemOp(Query, Op == G_LOAD);
1199             },
1200             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201               const LLT DstTy = Query.Types[0];
1202               const LLT PtrTy = Query.Types[1];
1203 
1204               LLT EltTy = DstTy.getElementType();
1205               unsigned MaxSize = maxSizeForAddrSpace(ST,
1206                                                      PtrTy.getAddressSpace(),
1207                                                      Op == G_LOAD);
1208 
1209               // FIXME: Handle widened to power of 2 results better. This ends
1210               // up scalarizing.
1211               // FIXME: 3 element stores scalarized on SI
1212 
1213               // Split if it's too large for the address space.
1214               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215                 unsigned NumElts = DstTy.getNumElements();
1216                 unsigned EltSize = EltTy.getSizeInBits();
1217 
1218                 if (MaxSize % EltSize == 0) {
1219                   return std::make_pair(
1220                       0, LLT::scalarOrVector(
1221                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1222                 }
1223 
1224                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1225 
1226                 // FIXME: Refine when odd breakdowns handled
1227                 // The scalars will need to be re-legalized.
1228                 if (NumPieces == 1 || NumPieces >= NumElts ||
1229                     NumElts % NumPieces != 0)
1230                   return std::make_pair(0, EltTy);
1231 
1232                 return std::make_pair(
1233                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1234               }
1235 
1236               // FIXME: We could probably handle weird extending loads better.
1237               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1238               if (DstTy.getSizeInBits() > MemSize)
1239                 return std::make_pair(0, EltTy);
1240 
1241               unsigned EltSize = EltTy.getSizeInBits();
1242               unsigned DstSize = DstTy.getSizeInBits();
1243               if (!isPowerOf2_32(DstSize)) {
1244                 // We're probably decomposing an odd sized store. Try to split
1245                 // to the widest type. TODO: Account for alignment. As-is it
1246                 // should be OK, since the new parts will be further legalized.
1247                 unsigned FloorSize = PowerOf2Floor(DstSize);
1248                 return std::make_pair(
1249                     0, LLT::scalarOrVector(
1250                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1251               }
1252 
1253               // Need to split because of alignment.
1254               unsigned Align = Query.MMODescrs[0].AlignInBits;
1255               if (EltSize > Align &&
1256                   (EltSize / Align < DstTy.getNumElements())) {
1257                 return std::make_pair(
1258                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1259               }
1260 
1261               // May need relegalization for the scalars.
1262               return std::make_pair(0, EltTy);
1263             })
1264     .lowerIfMemSizeNotPow2()
1265     .minScalar(0, S32)
1266     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1267     .widenScalarToNextPow2(0)
1268     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1269     .lower();
1270   }
1271 
1272   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1273                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1274                                                   {S32, GlobalPtr, 16, 2 * 8},
1275                                                   {S32, LocalPtr, 8, 8},
1276                                                   {S32, LocalPtr, 16, 16},
1277                                                   {S32, PrivatePtr, 8, 8},
1278                                                   {S32, PrivatePtr, 16, 16},
1279                                                   {S32, ConstantPtr, 8, 8},
1280                                                   {S32, ConstantPtr, 16, 2 * 8}})
1281                        .legalIf(
1282                          [=](const LegalityQuery &Query) -> bool {
1283                            return isLoadStoreLegal(ST, Query);
1284                          });
1285 
1286   if (ST.hasFlatAddressSpace()) {
1287     ExtLoads.legalForTypesWithMemDesc(
1288         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1289   }
1290 
1291   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1292   // 64-bits.
1293   //
1294   // TODO: Should generalize bitcast action into coerce, which will also cover
1295   // inserting addrspacecasts.
1296   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1297 
1298   ExtLoads.clampScalar(0, S32, S32)
1299           .widenScalarToNextPow2(0)
1300           .unsupportedIfMemSizeNotPow2()
1301           .lower();
1302 
1303   auto &Atomics = getActionDefinitionsBuilder(
1304     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1305      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1306      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1307      G_ATOMICRMW_UMIN})
1308     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1309                {S64, GlobalPtr}, {S64, LocalPtr},
1310                {S32, RegionPtr}, {S64, RegionPtr}});
1311   if (ST.hasFlatAddressSpace()) {
1312     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1313   }
1314 
1315   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1316   if (ST.hasLDSFPAtomics()) {
1317     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1318     if (ST.hasGFX90AInsts())
1319       Atomic.legalFor({{S64, LocalPtr}});
1320   }
1321   if (ST.hasAtomicFaddInsts())
1322     Atomic.legalFor({{S32, GlobalPtr}});
1323 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1326   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1327     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1328                 {S32, FlatPtr}, {S64, FlatPtr}})
1329     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1330                {S32, RegionPtr}, {S64, RegionPtr}});
1331   // TODO: Pointer types, any 32-bit or 64-bit vector
1332 
1333   // Condition should be s32 for scalar, s1 for vector.
1334   getActionDefinitionsBuilder(G_SELECT)
1335       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1336                                  LocalPtr, FlatPtr, PrivatePtr,
1337                                  LLT::fixed_vector(2, LocalPtr),
1338                                  LLT::fixed_vector(2, PrivatePtr)},
1339                                 {S1, S32})
1340       .clampScalar(0, S16, S64)
1341       .scalarize(1)
1342       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1343       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1344       .clampMaxNumElements(0, S32, 2)
1345       .clampMaxNumElements(0, LocalPtr, 2)
1346       .clampMaxNumElements(0, PrivatePtr, 2)
1347       .scalarize(0)
1348       .widenScalarToNextPow2(0)
1349       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1350 
1351   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1352   // be more flexible with the shift amount type.
1353   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1354     .legalFor({{S32, S32}, {S64, S32}});
1355   if (ST.has16BitInsts()) {
1356     if (ST.hasVOP3PInsts()) {
1357       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1358             .clampMaxNumElements(0, S16, 2);
1359     } else
1360       Shifts.legalFor({{S16, S16}});
1361 
1362     // TODO: Support 16-bit shift amounts for all types
1363     Shifts.widenScalarIf(
1364       [=](const LegalityQuery &Query) {
1365         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1366         // 32-bit amount.
1367         const LLT ValTy = Query.Types[0];
1368         const LLT AmountTy = Query.Types[1];
1369         return ValTy.getSizeInBits() <= 16 &&
1370                AmountTy.getSizeInBits() < 16;
1371       }, changeTo(1, S16));
1372     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1373     Shifts.clampScalar(1, S32, S32);
1374     Shifts.clampScalar(0, S16, S64);
1375     Shifts.widenScalarToNextPow2(0, 16);
1376 
1377     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1378       .minScalar(0, S16)
1379       .scalarize(0)
1380       .lower();
1381   } else {
1382     // Make sure we legalize the shift amount type first, as the general
1383     // expansion for the shifted type will produce much worse code if it hasn't
1384     // been truncated already.
1385     Shifts.clampScalar(1, S32, S32);
1386     Shifts.clampScalar(0, S32, S64);
1387     Shifts.widenScalarToNextPow2(0, 32);
1388 
1389     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1390       .minScalar(0, S32)
1391       .scalarize(0)
1392       .lower();
1393   }
1394   Shifts.scalarize(0);
1395 
1396   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1397     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1398     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1399     unsigned IdxTypeIdx = 2;
1400 
1401     getActionDefinitionsBuilder(Op)
1402       .customIf([=](const LegalityQuery &Query) {
1403           const LLT EltTy = Query.Types[EltTypeIdx];
1404           const LLT VecTy = Query.Types[VecTypeIdx];
1405           const LLT IdxTy = Query.Types[IdxTypeIdx];
1406           const unsigned EltSize = EltTy.getSizeInBits();
1407           return (EltSize == 32 || EltSize == 64) &&
1408                   VecTy.getSizeInBits() % 32 == 0 &&
1409                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1410                   IdxTy.getSizeInBits() == 32;
1411         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1414       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1415       .bitcastIf(
1416         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1417         [=](const LegalityQuery &Query) {
1418           // For > 64-bit element types, try to turn this into a 64-bit
1419           // element vector since we may be able to do better indexing
1420           // if this is scalar. If not, fall back to 32.
1421           const LLT EltTy = Query.Types[EltTypeIdx];
1422           const LLT VecTy = Query.Types[VecTypeIdx];
1423           const unsigned DstEltSize = EltTy.getSizeInBits();
1424           const unsigned VecSize = VecTy.getSizeInBits();
1425 
1426           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1427           return std::make_pair(
1428               VecTypeIdx,
1429               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1430         })
1431       .clampScalar(EltTypeIdx, S32, S64)
1432       .clampScalar(VecTypeIdx, S32, S64)
1433       .clampScalar(IdxTypeIdx, S32, S32)
1434       .clampMaxNumElements(VecTypeIdx, S32, 32)
1435       // TODO: Clamp elements for 64-bit vectors?
1436       // It should only be necessary with variable indexes.
1437       // As a last resort, lower to the stack
1438       .lower();
1439   }
1440 
1441   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1442     .unsupportedIf([=](const LegalityQuery &Query) {
1443         const LLT &EltTy = Query.Types[1].getElementType();
1444         return Query.Types[0] != EltTy;
1445       });
1446 
1447   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1448     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1449     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1450 
1451     // FIXME: Doesn't handle extract of illegal sizes.
1452     getActionDefinitionsBuilder(Op)
1453       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1454       // FIXME: Multiples of 16 should not be legal.
1455       .legalIf([=](const LegalityQuery &Query) {
1456           const LLT BigTy = Query.Types[BigTyIdx];
1457           const LLT LitTy = Query.Types[LitTyIdx];
1458           return (BigTy.getSizeInBits() % 32 == 0) &&
1459                  (LitTy.getSizeInBits() % 16 == 0);
1460         })
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT BigTy = Query.Types[BigTyIdx];
1464           return (BigTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1467       .widenScalarIf(
1468         [=](const LegalityQuery &Query) {
1469           const LLT LitTy = Query.Types[LitTyIdx];
1470           return (LitTy.getScalarSizeInBits() < 16);
1471         },
1472         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1473       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1474       .widenScalarToNextPow2(BigTyIdx, 32);
1475 
1476   }
1477 
1478   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1479     .legalForCartesianProduct(AllS32Vectors, {S32})
1480     .legalForCartesianProduct(AllS64Vectors, {S64})
1481     .clampNumElements(0, V16S32, V32S32)
1482     .clampNumElements(0, V2S64, V16S64)
1483     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1484 
1485   if (ST.hasScalarPackInsts()) {
1486     BuildVector
1487       // FIXME: Should probably widen s1 vectors straight to s32
1488       .minScalarOrElt(0, S16)
1489       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1490       .minScalar(1, S32);
1491 
1492     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1493       .legalFor({V2S16, S32})
1494       .lower();
1495     BuildVector.minScalarOrElt(0, S32);
1496   } else {
1497     BuildVector.customFor({V2S16, S16});
1498     BuildVector.minScalarOrElt(0, S32);
1499 
1500     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1501       .customFor({V2S16, S32})
1502       .lower();
1503   }
1504 
1505   BuildVector.legalIf(isRegisterType(0));
1506 
1507   // FIXME: Clamp maximum size
1508   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1509     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1510     .clampMaxNumElements(0, S32, 32)
1511     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1512     .clampMaxNumElements(0, S16, 64);
1513 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1515   // pre-legalize.
1516   if (ST.hasVOP3PInsts()) {
1517     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1518       .customFor({V2S16, V2S16})
1519       .lower();
1520   } else
1521     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1522 
1523   // Merge/Unmerge
1524   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1525     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1526     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1527 
1528     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1529       const LLT Ty = Query.Types[TypeIdx];
1530       if (Ty.isVector()) {
1531         const LLT &EltTy = Ty.getElementType();
1532         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1533           return true;
1534         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1535           return true;
1536       }
1537       return false;
1538     };
1539 
1540     auto &Builder = getActionDefinitionsBuilder(Op)
1541       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1542       .lowerFor({{S16, V2S16}})
1543       .lowerIf([=](const LegalityQuery &Query) {
1544           const LLT BigTy = Query.Types[BigTyIdx];
1545           return BigTy.getSizeInBits() == 32;
1546         })
1547       // Try to widen to s16 first for small types.
1548       // TODO: Only do this on targets with legal s16 shifts
1549       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1550       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1551       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1552       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1553                            elementTypeIs(1, S16)),
1554                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1558       .clampScalar(LitTyIdx, S32, S512)
1559       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1560       // Break up vectors with weird elements into scalars
1561       .fewerElementsIf(
1562         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1563         scalarize(0))
1564       .fewerElementsIf(
1565         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1566         scalarize(1))
1567       .clampScalar(BigTyIdx, S32, MaxScalar);
1568 
1569     if (Op == G_MERGE_VALUES) {
1570       Builder.widenScalarIf(
1571         // TODO: Use 16-bit shifts if legal for 8-bit values?
1572         [=](const LegalityQuery &Query) {
1573           const LLT Ty = Query.Types[LitTyIdx];
1574           return Ty.getSizeInBits() < 32;
1575         },
1576         changeTo(LitTyIdx, S32));
1577     }
1578 
1579     Builder.widenScalarIf(
1580       [=](const LegalityQuery &Query) {
1581         const LLT Ty = Query.Types[BigTyIdx];
1582         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1583           Ty.getSizeInBits() % 16 != 0;
1584       },
1585       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 once past 128 bits,
        // whichever is smaller.
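        // E.g. this rule widens an s65 big type to s128, and s257 to s320
        // rather than s512.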
1588         const LLT &Ty = Query.Types[BigTyIdx];
1589         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1590         if (NewSizeInBits >= 256) {
1591           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1592           if (RoundedTo < NewSizeInBits)
1593             NewSizeInBits = RoundedTo;
1594         }
1595         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1596       })
1597       // Any vectors left are the wrong size. Scalarize them.
1598       .scalarize(0)
1599       .scalarize(1);
1600   }
1601 
1602   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1603   // RegBankSelect.
1604   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1605     .legalFor({{S32}, {S64}});
1606 
1607   if (ST.hasVOP3PInsts()) {
1608     SextInReg.lowerFor({{V2S16}})
1609       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1610       // get more vector shift opportunities, since we'll get those when
1611       // expanded.
1612       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1613   } else if (ST.has16BitInsts()) {
1614     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1615   } else {
1616     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1618     SextInReg.lowerFor({{S32}, {S64}});
1619   }
1620 
1621   SextInReg
1622     .scalarize(0)
1623     .clampScalar(0, S32, S64)
1624     .lower();
1625 
  // TODO: Only try to form v2s16 with legal packed instructions.
1627   getActionDefinitionsBuilder(G_FSHR)
1628     .legalFor({{S32, S32}})
1629     .lowerFor({{V2S16, V2S16}})
1630     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1631     .scalarize(0)
1632     .lower();
1633 
1634   if (ST.hasVOP3PInsts()) {
1635     getActionDefinitionsBuilder(G_FSHL)
1636       .lowerFor({{V2S16, V2S16}})
1637       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1638       .scalarize(0)
1639       .lower();
1640   } else {
1641     getActionDefinitionsBuilder(G_FSHL)
1642       .scalarize(0)
1643       .lower();
1644   }
1645 
1646   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1647     .legalFor({S64});
1648 
1649   getActionDefinitionsBuilder(G_FENCE)
1650     .alwaysLegal();
1651 
1652   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1653       .scalarize(0)
1654       .minScalar(0, S32)
1655       .lower();
1656 
1657   getActionDefinitionsBuilder({
1658       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1659       G_FCOPYSIGN,
1660 
1661       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1662       G_ATOMICRMW_NAND,
1663       G_ATOMICRMW_FSUB,
1664       G_READ_REGISTER,
1665       G_WRITE_REGISTER,
1666 
1667       G_SADDO, G_SSUBO,
1668 
1669        // TODO: Implement
1670       G_FMINIMUM, G_FMAXIMUM}).lower();
1671 
1672   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1673         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1674         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1675     .unsupported();
1676 
1677   getLegacyLegalizerInfo().computeTables();
1678   verify(*ST.getInstrInfo());
1679 }
1680 
1681 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1682                                          MachineInstr &MI) const {
1683   MachineIRBuilder &B = Helper.MIRBuilder;
1684   MachineRegisterInfo &MRI = *B.getMRI();
1685 
1686   switch (MI.getOpcode()) {
1687   case TargetOpcode::G_ADDRSPACE_CAST:
1688     return legalizeAddrSpaceCast(MI, MRI, B);
1689   case TargetOpcode::G_FRINT:
1690     return legalizeFrint(MI, MRI, B);
1691   case TargetOpcode::G_FCEIL:
1692     return legalizeFceil(MI, MRI, B);
1693   case TargetOpcode::G_FREM:
1694     return legalizeFrem(MI, MRI, B);
1695   case TargetOpcode::G_INTRINSIC_TRUNC:
1696     return legalizeIntrinsicTrunc(MI, MRI, B);
1697   case TargetOpcode::G_SITOFP:
1698     return legalizeITOFP(MI, MRI, B, true);
1699   case TargetOpcode::G_UITOFP:
1700     return legalizeITOFP(MI, MRI, B, false);
1701   case TargetOpcode::G_FPTOSI:
1702     return legalizeFPTOI(MI, MRI, B, true);
1703   case TargetOpcode::G_FPTOUI:
1704     return legalizeFPTOI(MI, MRI, B, false);
1705   case TargetOpcode::G_FMINNUM:
1706   case TargetOpcode::G_FMAXNUM:
1707   case TargetOpcode::G_FMINNUM_IEEE:
1708   case TargetOpcode::G_FMAXNUM_IEEE:
1709     return legalizeMinNumMaxNum(Helper, MI);
1710   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1711     return legalizeExtractVectorElt(MI, MRI, B);
1712   case TargetOpcode::G_INSERT_VECTOR_ELT:
1713     return legalizeInsertVectorElt(MI, MRI, B);
1714   case TargetOpcode::G_SHUFFLE_VECTOR:
1715     return legalizeShuffleVector(MI, MRI, B);
1716   case TargetOpcode::G_FSIN:
1717   case TargetOpcode::G_FCOS:
1718     return legalizeSinCos(MI, MRI, B);
1719   case TargetOpcode::G_GLOBAL_VALUE:
1720     return legalizeGlobalValue(MI, MRI, B);
1721   case TargetOpcode::G_LOAD:
1722   case TargetOpcode::G_SEXTLOAD:
1723   case TargetOpcode::G_ZEXTLOAD:
1724     return legalizeLoad(Helper, MI);
1725   case TargetOpcode::G_FMAD:
1726     return legalizeFMad(MI, MRI, B);
1727   case TargetOpcode::G_FDIV:
1728     return legalizeFDIV(MI, MRI, B);
1729   case TargetOpcode::G_UDIV:
1730   case TargetOpcode::G_UREM:
1731   case TargetOpcode::G_UDIVREM:
1732     return legalizeUnsignedDIV_REM(MI, MRI, B);
1733   case TargetOpcode::G_SDIV:
1734   case TargetOpcode::G_SREM:
1735   case TargetOpcode::G_SDIVREM:
1736     return legalizeSignedDIV_REM(MI, MRI, B);
1737   case TargetOpcode::G_ATOMIC_CMPXCHG:
1738     return legalizeAtomicCmpXChg(MI, MRI, B);
1739   case TargetOpcode::G_FLOG:
1740     return legalizeFlog(MI, B, numbers::ln2f);
1741   case TargetOpcode::G_FLOG10:
1742     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1743   case TargetOpcode::G_FEXP:
1744     return legalizeFExp(MI, B);
1745   case TargetOpcode::G_FPOW:
1746     return legalizeFPow(MI, B);
1747   case TargetOpcode::G_FFLOOR:
1748     return legalizeFFloor(MI, MRI, B);
1749   case TargetOpcode::G_BUILD_VECTOR:
1750     return legalizeBuildVector(MI, MRI, B);
1751   default:
1752     return false;
1753   }
1754 
1755   llvm_unreachable("expected switch to return");
1756 }
1757 
1758 Register AMDGPULegalizerInfo::getSegmentAperture(
1759   unsigned AS,
1760   MachineRegisterInfo &MRI,
1761   MachineIRBuilder &B) const {
1762   MachineFunction &MF = B.getMF();
1763   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1764   const LLT S32 = LLT::scalar(32);
1765 
1766   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1767 
1768   if (ST.hasApertureRegs()) {
1769     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1770     // getreg.
1771     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1772         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1773         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1774     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1775         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1776         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1777     unsigned Encoding =
1778         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1779         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1780         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1781 
1782     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1783 
1784     B.buildInstr(AMDGPU::S_GETREG_B32)
1785       .addDef(GetReg)
1786       .addImm(Encoding);
1787     MRI.setType(GetReg, S32);
1788 
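    // The aperture base is returned in a 16-bit hwreg field; shift it left by
    // the field width (WidthM1 + 1 == 16) to reconstruct the 32-bit value used
    // as the high half of a flat pointer.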
1789     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1790     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1791   }
1792 
1793   Register QueuePtr = MRI.createGenericVirtualRegister(
1794     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1795 
1796   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1797     return Register();
1798 
1799   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1800   // private_segment_aperture_base_hi.
1801   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1802 
1803   // TODO: can we be smarter about machine pointer info?
1804   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1805   MachineMemOperand *MMO = MF.getMachineMemOperand(
1806       PtrInfo,
1807       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1808           MachineMemOperand::MOInvariant,
1809       4, commonAlignment(Align(64), StructOffset));
1810 
1811   Register LoadAddr;
1812 
1813   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1814   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1815 }
1816 
1817 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1818   MachineInstr &MI, MachineRegisterInfo &MRI,
1819   MachineIRBuilder &B) const {
1820   MachineFunction &MF = B.getMF();
1821 
1822   const LLT S32 = LLT::scalar(32);
1823   Register Dst = MI.getOperand(0).getReg();
1824   Register Src = MI.getOperand(1).getReg();
1825 
1826   LLT DstTy = MRI.getType(Dst);
1827   LLT SrcTy = MRI.getType(Src);
1828   unsigned DestAS = DstTy.getAddressSpace();
1829   unsigned SrcAS = SrcTy.getAddressSpace();
1830 
1831   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1832   // vector element.
1833   assert(!DstTy.isVector());
1834 
1835   const AMDGPUTargetMachine &TM
1836     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1837 
1838   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1839     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1840     return true;
1841   }
1842 
1843   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1844     // Truncate.
1845     B.buildExtract(Dst, Src, 0);
1846     MI.eraseFromParent();
1847     return true;
1848   }
1849 
1850   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1851     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1852     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1853 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1857     auto HighAddr = B.buildConstant(
1858       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1859     B.buildMerge(Dst, {Src, HighAddr});
1860     MI.eraseFromParent();
1861     return true;
1862   }
1863 
1864   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1865     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1866            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1867     unsigned NullVal = TM.getNullPointerValue(DestAS);
1868 
1869     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1870     auto FlatNull = B.buildConstant(SrcTy, 0);
1871 
1872     // Extract low 32-bits of the pointer.
1873     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1874 
1875     auto CmpRes =
1876         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1877     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1878 
1879     MI.eraseFromParent();
1880     return true;
1881   }
1882 
1883   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1884     return false;
1885 
1886   if (!ST.hasFlatAddressSpace())
1887     return false;
1888 
1889   auto SegmentNull =
1890       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1891   auto FlatNull =
1892       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1893 
1894   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1895   if (!ApertureReg.isValid())
1896     return false;
1897 
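  // local/private -> flat: build the 64-bit flat pointer as {32-bit segment
  // offset, aperture base}, and map the segment null value to the flat null
  // value.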
1898   auto CmpRes =
1899       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1900 
1901   // Coerce the type of the low half of the result so we can use merge_values.
1902   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1903 
1904   // TODO: Should we allow mismatched types but matching sizes in merges to
1905   // avoid the ptrtoint?
1906   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1907   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1908 
1909   MI.eraseFromParent();
1910   return true;
1911 }
1912 
1913 bool AMDGPULegalizerInfo::legalizeFrint(
1914   MachineInstr &MI, MachineRegisterInfo &MRI,
1915   MachineIRBuilder &B) const {
1916   Register Src = MI.getOperand(1).getReg();
1917   LLT Ty = MRI.getType(Src);
1918   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1919 
1920   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1921   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
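  // Adding and subtracting copysign(2^52, src) rounds src to the nearest
  // integer, since doubles with magnitude >= 2^52 have no fractional bits. C2
  // is the largest double below 2^52; anything with a larger magnitude is
  // already integral and is passed through by the final select.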
1922 
1923   auto C1 = B.buildFConstant(Ty, C1Val);
1924   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1925 
1926   // TODO: Should this propagate fast-math-flags?
1927   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1928   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1929 
1930   auto C2 = B.buildFConstant(Ty, C2Val);
1931   auto Fabs = B.buildFAbs(Ty, Src);
1932 
1933   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1934   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1935   MI.eraseFromParent();
1936   return true;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeFceil(
1940   MachineInstr &MI, MachineRegisterInfo &MRI,
1941   MachineIRBuilder &B) const {
1942 
1943   const LLT S1 = LLT::scalar(1);
1944   const LLT S64 = LLT::scalar(64);
1945 
1946   Register Src = MI.getOperand(1).getReg();
1947   assert(MRI.getType(Src) == S64);
1948 
1949   // result = trunc(src)
1950   // if (src > 0.0 && src != result)
1951   //   result += 1.0
1952 
1953   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1954 
1955   const auto Zero = B.buildFConstant(S64, 0.0);
1956   const auto One = B.buildFConstant(S64, 1.0);
1957   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1958   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1959   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1960   auto Add = B.buildSelect(S64, And, One, Zero);
1961 
1962   // TODO: Should this propagate fast-math-flags?
1963   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1965 }
1966 
1967 bool AMDGPULegalizerInfo::legalizeFrem(
1968   MachineInstr &MI, MachineRegisterInfo &MRI,
1969   MachineIRBuilder &B) const {
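  // frem(x, y) == x - trunc(x / y) * y, computed below as
  // fma(-trunc(x / y), y, x).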
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1982 }
1983 
1984 static MachineInstrBuilder extractF64Exponent(Register Hi,
1985                                               MachineIRBuilder &B) {
1986   const unsigned FractBits = 52;
1987   const unsigned ExpBits = 11;
1988   LLT S32 = LLT::scalar(32);
1989 
1990   auto Const0 = B.buildConstant(S32, FractBits - 32);
1991   auto Const1 = B.buildConstant(S32, ExpBits);
1992 
1993   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1994     .addUse(Hi)
1995     .addUse(Const0.getReg(0))
1996     .addUse(Const1.getReg(0));
1997 
1998   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1999 }
2000 
2001 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2002   MachineInstr &MI, MachineRegisterInfo &MRI,
2003   MachineIRBuilder &B) const {
2004   const LLT S1 = LLT::scalar(1);
2005   const LLT S32 = LLT::scalar(32);
2006   const LLT S64 = LLT::scalar(64);
2007 
2008   Register Src = MI.getOperand(1).getReg();
2009   assert(MRI.getType(Src) == S64);
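  // For a finite double with unbiased exponent Exp, the fractional part
  // occupies the low (52 - Exp) fraction bits, so clearing those bits
  // truncates toward zero. Exp < 0 means |src| < 1 and the result is a signed
  // zero; Exp > 51 means src is already an integer and is returned unchanged.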
2010 
2011   // TODO: Should this use extract since the low half is unused?
2012   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2013   Register Hi = Unmerge.getReg(1);
2014 
2015   // Extract the upper half, since this is where we will find the sign and
2016   // exponent.
2017   auto Exp = extractF64Exponent(Hi, B);
2018 
2019   const unsigned FractBits = 52;
2020 
2021   // Extract the sign bit.
2022   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2023   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2024 
2025   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2026 
2027   const auto Zero32 = B.buildConstant(S32, 0);
2028 
2029   // Extend back to 64-bits.
2030   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2031 
2032   auto Shr = B.buildAShr(S64, FractMask, Exp);
2033   auto Not = B.buildNot(S64, Shr);
2034   auto Tmp0 = B.buildAnd(S64, Src, Not);
2035   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2036 
2037   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2038   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2039 
2040   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2041   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2042   MI.eraseFromParent();
2043   return true;
2044 }
2045 
2046 bool AMDGPULegalizerInfo::legalizeITOFP(
2047   MachineInstr &MI, MachineRegisterInfo &MRI,
2048   MachineIRBuilder &B, bool Signed) const {
2049 
2050   Register Dst = MI.getOperand(0).getReg();
2051   Register Src = MI.getOperand(1).getReg();
2052 
2053   const LLT S64 = LLT::scalar(64);
2054   const LLT S32 = LLT::scalar(32);
2055 
2056   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
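  // Convert the two 32-bit halves separately: src == Hi * 2^32 + (unsigned)Lo,
  // so the result is ldexp(cvt(Hi), 32) + uitofp(Lo). Only the high half
  // carries the sign in the signed case.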
2057 
2058   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2059 
2060   auto CvtHi = Signed ?
2061     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2062     B.buildUITOFP(S64, Unmerge.getReg(1));
2063 
2064   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2065 
2066   auto ThirtyTwo = B.buildConstant(S32, 32);
2067   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2068     .addUse(CvtHi.getReg(0))
2069     .addUse(ThirtyTwo.getReg(0));
2070 
2071   // TODO: Should this propagate fast-math-flags?
2072   B.buildFAdd(Dst, LdExp, CvtLo);
2073   MI.eraseFromParent();
2074   return true;
2075 }
2076 
2077 // TODO: Copied from DAG implementation. Verify logic and document how this
2078 // actually works.
2079 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2080                                         MachineRegisterInfo &MRI,
2081                                         MachineIRBuilder &B,
2082                                         bool Signed) const {
2083 
2084   Register Dst = MI.getOperand(0).getReg();
2085   Register Src = MI.getOperand(1).getReg();
2086 
2087   const LLT S64 = LLT::scalar(64);
2088   const LLT S32 = LLT::scalar(32);
2089 
2090   const LLT SrcLT = MRI.getType(Src);
2091   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2092 
2093   unsigned Flags = MI.getFlags();
2094 
2095   // The basic idea of converting a floating point number into a pair of 32-bit
2096   // integers is illustrated as follows:
2097   //
2098   //     tf := trunc(val);
2099   //    hif := floor(tf * 2^-32);
2100   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2101   //     hi := fptoi(hif);
2102   //     lo := fptoi(lof);
2103   //
2104   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2105   MachineInstrBuilder Sign;
2106   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2112     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2113     Trunc = B.buildFAbs(S32, Trunc, Flags);
2114   }
2115   MachineInstrBuilder K0, K1;
2116   if (SrcLT == S64) {
2117     K0 = B.buildFConstant(S64,
2118                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2119     K1 = B.buildFConstant(S64,
2120                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2121   } else {
2122     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2123     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2124   }
2125 
2126   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2127   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2128   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2129 
2130   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2131                                      : B.buildFPTOUI(S32, FloorMul);
2132   auto Lo = B.buildFPTOUI(S32, Fma);
2133 
2134   if (Signed && SrcLT == S32) {
2135     // Flip the result based on the signedness, which is either all 0s or 1s.
2136     Sign = B.buildMerge(S64, {Sign, Sign});
2137     // r := xor({lo, hi}, sign) - sign;
2138     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2139   } else
2140     B.buildMerge(Dst, {Lo, Hi});
2141   MI.eraseFromParent();
2142 
2143   return true;
2144 }
2145 
2146 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2147                                                MachineInstr &MI) const {
2148   MachineFunction &MF = Helper.MIRBuilder.getMF();
2149   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2150 
2151   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2152                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2153 
2154   // With ieee_mode disabled, the instructions have the correct behavior
2155   // already for G_FMINNUM/G_FMAXNUM
2156   if (!MFI->getMode().IEEE)
2157     return !IsIEEEOp;
2158 
2159   if (IsIEEEOp)
2160     return true;
2161 
2162   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2163 }
2164 
2165 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2166   MachineInstr &MI, MachineRegisterInfo &MRI,
2167   MachineIRBuilder &B) const {
2168   // TODO: Should move some of this into LegalizerHelper.
2169 
2170   // TODO: Promote dynamic indexing of s16 to s32
2171 
2172   // FIXME: Artifact combiner probably should have replaced the truncated
2173   // constant before this, so we shouldn't need
2174   // getConstantVRegValWithLookThrough.
2175   Optional<ValueAndVReg> MaybeIdxVal =
2176       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2177   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2178     return true;
2179   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2180 
2181   Register Dst = MI.getOperand(0).getReg();
2182   Register Vec = MI.getOperand(1).getReg();
2183 
2184   LLT VecTy = MRI.getType(Vec);
2185   LLT EltTy = VecTy.getElementType();
2186   assert(EltTy == MRI.getType(Dst));
2187 
2188   if (IdxVal < VecTy.getNumElements())
2189     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2190   else
2191     B.buildUndef(Dst);
2192 
2193   MI.eraseFromParent();
2194   return true;
2195 }
2196 
2197 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2198   MachineInstr &MI, MachineRegisterInfo &MRI,
2199   MachineIRBuilder &B) const {
2200   // TODO: Should move some of this into LegalizerHelper.
2201 
2202   // TODO: Promote dynamic indexing of s16 to s32
2203 
2204   // FIXME: Artifact combiner probably should have replaced the truncated
2205   // constant before this, so we shouldn't need
2206   // getConstantVRegValWithLookThrough.
2207   Optional<ValueAndVReg> MaybeIdxVal =
2208       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2209   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2210     return true;
2211 
2212   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2213   Register Dst = MI.getOperand(0).getReg();
2214   Register Vec = MI.getOperand(1).getReg();
2215   Register Ins = MI.getOperand(2).getReg();
2216 
2217   LLT VecTy = MRI.getType(Vec);
2218   LLT EltTy = VecTy.getElementType();
2219   assert(EltTy == MRI.getType(Ins));
2220 
2221   if (IdxVal < VecTy.getNumElements())
2222     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2223   else
2224     B.buildUndef(Dst);
2225 
2226   MI.eraseFromParent();
2227   return true;
2228 }
2229 
2230 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2231   MachineInstr &MI, MachineRegisterInfo &MRI,
2232   MachineIRBuilder &B) const {
2233   const LLT V2S16 = LLT::fixed_vector(2, 16);
2234 
2235   Register Dst = MI.getOperand(0).getReg();
2236   Register Src0 = MI.getOperand(1).getReg();
2237   LLT DstTy = MRI.getType(Dst);
2238   LLT SrcTy = MRI.getType(Src0);
2239 
2240   if (SrcTy == V2S16 && DstTy == V2S16 &&
2241       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2242     return true;
2243 
2244   MachineIRBuilder HelperBuilder(MI);
2245   GISelObserverWrapper DummyObserver;
2246   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2247   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2248 }
2249 
2250 bool AMDGPULegalizerInfo::legalizeSinCos(
2251   MachineInstr &MI, MachineRegisterInfo &MRI,
2252   MachineIRBuilder &B) const {
2253 
2254   Register DstReg = MI.getOperand(0).getReg();
2255   Register SrcReg = MI.getOperand(1).getReg();
2256   LLT Ty = MRI.getType(DstReg);
2257   unsigned Flags = MI.getFlags();
2258 
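  // The amdgcn.sin/cos intrinsics take their input in units of full turns, so
  // scale by 1/(2*pi) first. On subtargets with a reduced valid input range,
  // take the fraction of the scaled value to bring it into [0, 1).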
2259   Register TrigVal;
2260   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2261   if (ST.hasTrigReducedRange()) {
2262     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2263     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2264       .addUse(MulVal.getReg(0))
2265       .setMIFlags(Flags).getReg(0);
2266   } else
2267     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2268 
2269   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2270     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2271   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2272     .addUse(TrigVal)
2273     .setMIFlags(Flags);
2274   MI.eraseFromParent();
2275   return true;
2276 }
2277 
2278 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2279                                                   MachineIRBuilder &B,
2280                                                   const GlobalValue *GV,
2281                                                   int64_t Offset,
2282                                                   unsigned GAFlags) const {
2283   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2284   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2285   // to the following code sequence:
2286   //
2287   // For constant address space:
2288   //   s_getpc_b64 s[0:1]
2289   //   s_add_u32 s0, s0, $symbol
2290   //   s_addc_u32 s1, s1, 0
2291   //
2292   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2293   //   a fixup or relocation is emitted to replace $symbol with a literal
2294   //   constant, which is a pc-relative offset from the encoding of the $symbol
2295   //   operand to the global variable.
2296   //
2297   // For global address space:
2298   //   s_getpc_b64 s[0:1]
2299   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2300   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2301   //
2302   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2303   //   fixups or relocations are emitted to replace $symbol@*@lo and
2304   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2305   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2306   //   operand to the global variable.
2307   //
2308   // What we want here is an offset from the value returned by s_getpc
2309   // (which is the address of the s_add_u32 instruction) to the global
2310   // variable, but since the encoding of $symbol starts 4 bytes after the start
2311   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2312   // small. This requires us to add 4 to the global variable offset in order to
2313   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2314   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2315   // instruction.
2316 
2317   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2318 
2319   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2320     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2321 
2322   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2323     .addDef(PCReg);
2324 
2325   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2326   if (GAFlags == SIInstrInfo::MO_NONE)
2327     MIB.addImm(0);
2328   else
2329     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2330 
2331   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2332 
2333   if (PtrTy.getSizeInBits() == 32)
2334     B.buildExtract(DstReg, PCReg, 0);
2335   return true;
}
2337 
2338 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2339   MachineInstr &MI, MachineRegisterInfo &MRI,
2340   MachineIRBuilder &B) const {
2341   Register DstReg = MI.getOperand(0).getReg();
2342   LLT Ty = MRI.getType(DstReg);
2343   unsigned AS = Ty.getAddressSpace();
2344 
2345   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2346   MachineFunction &MF = B.getMF();
2347   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2348 
2349   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2350     if (!MFI->isModuleEntryFunction() &&
2351         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2352       const Function &Fn = MF.getFunction();
2353       DiagnosticInfoUnsupported BadLDSDecl(
2354         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2355         DS_Warning);
2356       Fn.getContext().diagnose(BadLDSDecl);
2357 
2358       // We currently don't have a way to correctly allocate LDS objects that
2359       // aren't directly associated with a kernel. We do force inlining of
2360       // functions that use local objects. However, if these dead functions are
2361       // not eliminated, we don't want a compile time error. Just emit a warning
2362       // and a trap, since there should be no callable path here.
2363       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2364       B.buildUndef(DstReg);
2365       MI.eraseFromParent();
2366       return true;
2367     }
2368 
2369     // TODO: We could emit code to handle the initialization somewhere.
2370     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2371       const SITargetLowering *TLI = ST.getTargetLowering();
2372       if (!TLI->shouldUseLDSConstAddress(GV)) {
2373         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2375       }
2376 
2377       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2378         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It will be allocated by the
        // runtime and placed directly after the statically allocated ones.
        // All such variables share the same offset.
2384         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2385           // Adjust alignment for that dynamic shared memory array.
2386           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2387           LLT S32 = LLT::scalar(32);
2388           auto Sz =
2389               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2390           B.buildIntToPtr(DstReg, Sz);
2391           MI.eraseFromParent();
2392           return true;
2393         }
2394       }
2395 
2396       B.buildConstant(
2397           DstReg,
2398           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2399       MI.eraseFromParent();
2400       return true;
2401     }
2402 
2403     const Function &Fn = MF.getFunction();
2404     DiagnosticInfoUnsupported BadInit(
2405       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2406     Fn.getContext().diagnose(BadInit);
2407     return true;
2408   }
2409 
2410   const SITargetLowering *TLI = ST.getTargetLowering();
2411 
2412   if (TLI->shouldEmitFixup(GV)) {
2413     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2414     MI.eraseFromParent();
2415     return true;
2416   }
2417 
2418   if (TLI->shouldEmitPCReloc(GV)) {
2419     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2420     MI.eraseFromParent();
2421     return true;
2422   }
2423 
2424   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2425   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2426 
2427   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2428       MachinePointerInfo::getGOT(MF),
2429       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2430           MachineMemOperand::MOInvariant,
2431       8 /*Size*/, Align(8));
2432 
2433   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2434 
2435   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2437     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2438     B.buildExtract(DstReg, Load, 0);
2439   } else
2440     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2441 
2442   MI.eraseFromParent();
2443   return true;
2444 }
2445 
2446 static LLT widenToNextPowerOf2(LLT Ty) {
2447   if (Ty.isVector())
2448     return Ty.changeElementCount(
2449         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2450   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2451 }
2452 
2453 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2454                                        MachineInstr &MI) const {
2455   MachineIRBuilder &B = Helper.MIRBuilder;
2456   MachineRegisterInfo &MRI = *B.getMRI();
2457   GISelChangeObserver &Observer = Helper.Observer;
2458 
2459   Register PtrReg = MI.getOperand(1).getReg();
2460   LLT PtrTy = MRI.getType(PtrReg);
2461   unsigned AddrSpace = PtrTy.getAddressSpace();
2462 
2463   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2464     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2465     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2466     Observer.changingInstr(MI);
2467     MI.getOperand(1).setReg(Cast.getReg(0));
2468     Observer.changedInstr(MI);
2469     return true;
2470   }
2471 
2472   if (MI.getOpcode() != AMDGPU::G_LOAD)
2473     return false;
2474 
2475   Register ValReg = MI.getOperand(0).getReg();
2476   LLT ValTy = MRI.getType(ValReg);
2477 
2478   MachineMemOperand *MMO = *MI.memoperands_begin();
2479   const unsigned ValSize = ValTy.getSizeInBits();
2480   const unsigned MemSize = 8 * MMO->getSize();
2481   const Align MemAlign = MMO->getAlign();
2482   const unsigned AlignInBits = 8 * MemAlign.value();
2483 
2484   // Widen non-power-of-2 loads to the alignment if needed
2485   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2486     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2487 
2488     // This was already the correct extending load result type, so just adjust
2489     // the memory type.
2490     if (WideMemSize == ValSize) {
2491       MachineFunction &MF = B.getMF();
2492 
2493       MachineMemOperand *WideMMO =
2494           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2495       Observer.changingInstr(MI);
2496       MI.setMemRefs(MF, {WideMMO});
2497       Observer.changedInstr(MI);
2498       return true;
2499     }
2500 
2501     // Don't bother handling edge case that should probably never be produced.
2502     if (ValSize > WideMemSize)
2503       return false;
2504 
2505     LLT WideTy = widenToNextPowerOf2(ValTy);
2506 
2507     Register WideLoad;
2508     if (!WideTy.isVector()) {
2509       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2510       B.buildTrunc(ValReg, WideLoad).getReg(0);
2511     } else {
2512       // Extract the subvector.
2513 
2514       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2516         // (e.g. <3 x s32> -> <4 x s32>)
2517         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2518         B.buildExtract(ValReg, WideLoad, 0);
2519       } else {
2520         // For cases where the widened type isn't a nice register value, unmerge
2521         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2522         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2523         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2524         B.setInsertPt(B.getMBB(), MI.getIterator());
2525         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2526       }
2527     }
2528 
2529     MI.eraseFromParent();
2530     return true;
2531   }
2532 
2533   return false;
2534 }
2535 
2536 bool AMDGPULegalizerInfo::legalizeFMad(
2537   MachineInstr &MI, MachineRegisterInfo &MRI,
2538   MachineIRBuilder &B) const {
2539   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2540   assert(Ty.isScalar());
2541 
2542   MachineFunction &MF = B.getMF();
2543   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2544 
2545   // TODO: Always legal with future ftz flag.
2546   // FIXME: Do we need just output?
2547   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2548     return true;
2549   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2550     return true;
2551 
2552   MachineIRBuilder HelperBuilder(MI);
2553   GISelObserverWrapper DummyObserver;
2554   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2555   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2556 }
2557 
2558 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2559   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2560   Register DstReg = MI.getOperand(0).getReg();
2561   Register PtrReg = MI.getOperand(1).getReg();
2562   Register CmpVal = MI.getOperand(2).getReg();
2563   Register NewVal = MI.getOperand(3).getReg();
2564 
2565   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2566          "this should not have been custom lowered");
2567 
2568   LLT ValTy = MRI.getType(CmpVal);
2569   LLT VecTy = LLT::fixed_vector(2, ValTy);
2570 
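  // The target cmpxchg pseudo takes the new value and the compare value packed
  // into a single vector operand, with the new value in element 0.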
2571   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2572 
2573   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2574     .addDef(DstReg)
2575     .addUse(PtrReg)
2576     .addUse(PackedVal)
2577     .setMemRefs(MI.memoperands());
2578 
2579   MI.eraseFromParent();
2580   return true;
2581 }
2582 
2583 bool AMDGPULegalizerInfo::legalizeFlog(
2584   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2585   Register Dst = MI.getOperand(0).getReg();
2586   Register Src = MI.getOperand(1).getReg();
2587   LLT Ty = B.getMRI()->getType(Dst);
2588   unsigned Flags = MI.getFlags();
2589 
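  // log_b(x) == log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG and
  // ln(2) / ln(10) for G_FLOG10 as Log2BaseInverted.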
2590   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2591   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2592 
2593   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2594   MI.eraseFromParent();
2595   return true;
2596 }
2597 
2598 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2599                                        MachineIRBuilder &B) const {
2600   Register Dst = MI.getOperand(0).getReg();
2601   Register Src = MI.getOperand(1).getReg();
2602   unsigned Flags = MI.getFlags();
2603   LLT Ty = B.getMRI()->getType(Dst);
2604 
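  // exp(x) == exp2(x * log2(e)).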
2605   auto K = B.buildFConstant(Ty, numbers::log2e);
2606   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2607   B.buildFExp2(Dst, Mul, Flags);
2608   MI.eraseFromParent();
2609   return true;
2610 }
2611 
2612 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2613                                        MachineIRBuilder &B) const {
2614   Register Dst = MI.getOperand(0).getReg();
2615   Register Src0 = MI.getOperand(1).getReg();
2616   Register Src1 = MI.getOperand(2).getReg();
2617   unsigned Flags = MI.getFlags();
2618   LLT Ty = B.getMRI()->getType(Dst);
2619   const LLT S16 = LLT::scalar(16);
2620   const LLT S32 = LLT::scalar(32);
2621 
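  // Expand as pow(x, y) == exp2(y * log2(x)). The legacy multiply is used,
  // which treats 0 * anything (including inf/nan) as 0.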
2622   if (Ty == S32) {
2623     auto Log = B.buildFLog2(S32, Src0, Flags);
2624     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2625       .addUse(Log.getReg(0))
2626       .addUse(Src1)
2627       .setMIFlags(Flags);
2628     B.buildFExp2(Dst, Mul, Flags);
2629   } else if (Ty == S16) {
2630     // There's no f16 fmul_legacy, so we need to convert for it.
2631     auto Log = B.buildFLog2(S16, Src0, Flags);
2632     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2633     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2634     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2635       .addUse(Ext0.getReg(0))
2636       .addUse(Ext1.getReg(0))
2637       .setMIFlags(Flags);
2638 
2639     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2640   } else
2641     return false;
2642 
2643   MI.eraseFromParent();
2644   return true;
2645 }
2646 
2647 // Find a source register, ignoring any possible source modifiers.
2648 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2649   Register ModSrc = OrigSrc;
2650   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2651     ModSrc = SrcFNeg->getOperand(1).getReg();
2652     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2653       ModSrc = SrcFAbs->getOperand(1).getReg();
2654   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2655     ModSrc = SrcFAbs->getOperand(1).getReg();
2656   return ModSrc;
2657 }
2658 
2659 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2660                                          MachineRegisterInfo &MRI,
2661                                          MachineIRBuilder &B) const {
2662 
2663   const LLT S1 = LLT::scalar(1);
2664   const LLT S64 = LLT::scalar(64);
2665   Register Dst = MI.getOperand(0).getReg();
2666   Register OrigSrc = MI.getOperand(1).getReg();
2667   unsigned Flags = MI.getFlags();
2668   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2669          "this should not have been custom lowered");
2670 
2671   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2672   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2673   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2674   // V_FRACT bug is:
2675   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2676   //
2677   // Convert floor(x) to (x - fract(x))
2678 
2679   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2680     .addUse(OrigSrc)
2681     .setMIFlags(Flags);
2682 
2683   // Give source modifier matching some assistance before obscuring a foldable
2684   // pattern.
2685 
2686   // TODO: We can avoid the neg on the fract? The input sign to fract
2687   // shouldn't matter?
2688   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2689 
2690   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2691 
2692   Register Min = MRI.createGenericVirtualRegister(S64);
2693 
2694   // We don't need to concern ourselves with the snan handling difference, so
2695   // use the one which will directly select.
2696   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2697   if (MFI->getMode().IEEE)
2698     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2699   else
2700     B.buildFMinNum(Min, Fract, Const, Flags);
2701 
2702   Register CorrectedFract = Min;
2703   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2705     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2706   }
2707 
2708   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2709   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2710 
2711   MI.eraseFromParent();
2712   return true;
2713 }
2714 
2715 // Turn an illegal packed v2s16 build vector into bit operations.
2716 // TODO: This should probably be a bitcast action in LegalizerHelper.
2717 bool AMDGPULegalizerInfo::legalizeBuildVector(
2718   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2719   Register Dst = MI.getOperand(0).getReg();
2720   const LLT S32 = LLT::scalar(32);
2721   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2722 
2723   Register Src0 = MI.getOperand(1).getReg();
2724   Register Src1 = MI.getOperand(2).getReg();
2725   assert(MRI.getType(Src0) == LLT::scalar(16));
2726 
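  // (build_vector $lo, $hi) -> bitcast (merge_values $lo, $hi), i.e. Src0 ends
  // up in the low 16 bits of the packed 32-bit value.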
2727   auto Merge = B.buildMerge(S32, {Src0, Src1});
2728   B.buildBitcast(Dst, Merge);
2729 
2730   MI.eraseFromParent();
2731   return true;
2732 }
2733 
2734 // Check that this is a G_XOR x, -1
2735 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2736   if (MI.getOpcode() != TargetOpcode::G_XOR)
2737     return false;
2738   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2739   return ConstVal && *ConstVal == -1;
2740 }
2741 
2742 // Return the use branch instruction, otherwise null if the usage is invalid.
2743 static MachineInstr *
2744 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2745                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2746   Register CondDef = MI.getOperand(0).getReg();
2747   if (!MRI.hasOneNonDBGUse(CondDef))
2748     return nullptr;
2749 
2750   MachineBasicBlock *Parent = MI.getParent();
2751   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2752 
2753   if (isNot(MRI, *UseMI)) {
2754     Register NegatedCond = UseMI->getOperand(0).getReg();
2755     if (!MRI.hasOneNonDBGUse(NegatedCond))
2756       return nullptr;
2757 
2758     // We're deleting the def of this value, so we need to remove it.
2759     UseMI->eraseFromParent();
2760 
2761     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2762     Negated = true;
2763   }
2764 
2765   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2766     return nullptr;
2767 
2768   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2769   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2770   if (Next == Parent->end()) {
2771     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2772     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2773       return nullptr;
2774     UncondBrTarget = &*NextMBB;
2775   } else {
2776     if (Next->getOpcode() != AMDGPU::G_BR)
2777       return nullptr;
2778     Br = &*Next;
2779     UncondBrTarget = Br->getOperand(0).getMBB();
2780   }
2781 
2782   return UseMI;
2783 }
2784 
2785 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2786                                          const ArgDescriptor *Arg,
2787                                          const TargetRegisterClass *ArgRC,
2788                                          LLT ArgTy) const {
2789   MCRegister SrcReg = Arg->getRegister();
2790   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2791   assert(DstReg.isVirtual() && "Virtual register expected");
2792 
2793   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2794                                              ArgTy);
2795   if (Arg->isMasked()) {
2796     // TODO: Should we try to emit this once in the entry block?
2797     const LLT S32 = LLT::scalar(32);
2798     const unsigned Mask = Arg->getMask();
2799     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2800 
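    // E.g. for a hypothetical mask of 0x3f00, Shift is 8 and this emits
    // (LiveIn >> 8) & 0x3f.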
2801     Register AndMaskSrc = LiveIn;
2802 
2803     if (Shift != 0) {
2804       auto ShiftAmt = B.buildConstant(S32, Shift);
2805       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2806     }
2807 
2808     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2809   } else {
2810     B.buildCopy(DstReg, LiveIn);
2811   }
2812 
2813   return true;
2814 }
2815 
2816 bool AMDGPULegalizerInfo::loadInputValue(
2817     Register DstReg, MachineIRBuilder &B,
2818     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2819   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2820   const ArgDescriptor *Arg;
2821   const TargetRegisterClass *ArgRC;
2822   LLT ArgTy;
2823   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2824 
2825   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2826     return false; // TODO: Handle these
2827   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2831     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2832     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2833   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2834     return false;
2835 
2836   MI.eraseFromParent();
2837   return true;
2838 }
2839 
2840 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2841                                        MachineRegisterInfo &MRI,
2842                                        MachineIRBuilder &B) const {
2843   Register Dst = MI.getOperand(0).getReg();
2844   LLT DstTy = MRI.getType(Dst);
2845   LLT S16 = LLT::scalar(16);
2846   LLT S32 = LLT::scalar(32);
2847   LLT S64 = LLT::scalar(64);
2848 
2849   if (DstTy == S16)
2850     return legalizeFDIV16(MI, MRI, B);
2851   if (DstTy == S32)
2852     return legalizeFDIV32(MI, MRI, B);
2853   if (DstTy == S64)
2854     return legalizeFDIV64(MI, MRI, B);
2855 
2856   return false;
2857 }
2858 
2859 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2860                                                         Register DstDivReg,
2861                                                         Register DstRemReg,
2862                                                         Register X,
2863                                                         Register Y) const {
2864   const LLT S1 = LLT::scalar(1);
2865   const LLT S32 = LLT::scalar(32);
2866 
2867   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2868   // algorithm used here.
2869 
2870   // Initial estimate of inv(y).
2871   auto FloatY = B.buildUITOFP(S32, Y);
2872   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2873   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2874   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2875   auto Z = B.buildFPTOUI(S32, ScaledY);
2876 
2877   // One round of UNR (unsigned Newton-Raphson) refinement.
2878   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2879   auto NegYZ = B.buildMul(S32, NegY, Z);
2880   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2881 
2882   // Quotient/remainder estimate.
2883   auto Q = B.buildUMulH(S32, X, Z);
2884   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2885 
2886   // First quotient/remainder refinement.
2887   auto One = B.buildConstant(S32, 1);
2888   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2889   if (DstDivReg)
2890     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2891   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2892 
2893   // Second quotient/remainder refinement.
2894   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2895   if (DstDivReg)
2896     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2897 
2898   if (DstRemReg)
2899     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2900 }
2901 
2902 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2903 //
2904 // Return lo, hi of result
2905 //
2906 // %cvt.lo = G_UITOFP Val.lo
2907 // %cvt.hi = G_UITOFP Val.hi
2908 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2909 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2910 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2911 // %mul2 = G_FMUL %mul1, 2**(-32)
2912 // %trunc = G_INTRINSIC_TRUNC %mul2
2913 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2914 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2915 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2916                                                        Register Val) {
2917   const LLT S32 = LLT::scalar(32);
2918   auto Unmerge = B.buildUnmerge(S32, Val);
2919 
2920   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2921   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2922 
2923   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2924                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2925 
2926   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2927   auto Mul1 =
2928       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2929 
2930   // 2**(-32)
2931   auto Mul2 =
2932       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2933   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2934 
2935   // -(2**32)
2936   auto Mad2 = B.buildFMAD(S32, Trunc,
2937                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2938 
2939   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2940   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2941 
2942   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2943 }
2944 
2945 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2946                                                         Register DstDivReg,
2947                                                         Register DstRemReg,
2948                                                         Register Numer,
2949                                                         Register Denom) const {
2950   const LLT S32 = LLT::scalar(32);
2951   const LLT S64 = LLT::scalar(64);
2952   const LLT S1 = LLT::scalar(1);
2953   Register RcpLo, RcpHi;
2954 
2955   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2956 
2957   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2958 
2959   auto Zero64 = B.buildConstant(S64, 0);
2960   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2961 
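       // One Newton-Raphson style refinement round: the correction term is
       // umulh(Rcp, Rcp * -Denom), which is added onto Rcp below in 32-bit
       // halves so the carry can be tracked explicitly.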
2962   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2963   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2964 
2965   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2966   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2967   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2968 
2969   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2970   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2971   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2972   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2973 
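       // Second refinement round, starting from the improved estimate Add1.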
2974   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2975   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2976   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2977   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2978   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2979 
2980   auto Zero32 = B.buildConstant(S32, 0);
2981   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2982   auto Add2_HiC =
2983       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2984   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2985   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2986 
2987   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2988   Register NumerLo = UnmergeNumer.getReg(0);
2989   Register NumerHi = UnmergeNumer.getReg(1);
2990 
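       // Quotient estimate: the high 64 bits of Numer * Add2 approximate
       // Numer / Denom; Sub1 below is the matching remainder candidate.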
2991   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2992   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2993   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2994   Register Mul3_Lo = UnmergeMul3.getReg(0);
2995   Register Mul3_Hi = UnmergeMul3.getReg(1);
2996   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2997   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2998   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2999   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3000 
3001   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3002   Register DenomLo = UnmergeDenom.getReg(0);
3003   Register DenomHi = UnmergeDenom.getReg(1);
3004 
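       // Unsigned 64-bit compare of Sub1 against Denom done on the 32-bit
       // halves: C3 is all ones when Sub1 >= Denom, i.e. the quotient estimate
       // is too small by at least one.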
3005   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3006   auto C1 = B.buildSExt(S32, CmpHi);
3007 
3008   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3009   auto C2 = B.buildSExt(S32, CmpLo);
3010 
3011   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3012   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3013 
3014   // TODO: Here and below, portions of the code can be enclosed in if/endif.
3015   // Currently control flow is unconditional and we have 4 selects after the
3016   // potential endif to substitute PHIs.
3017 
3018   // if C3 != 0 ...
3019   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3020   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3021   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3022   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3023 
3024   auto One64 = B.buildConstant(S64, 1);
3025   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3026 
3027   auto C4 =
3028       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3029   auto C5 =
3030       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3031   auto C6 = B.buildSelect(
3032       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3033 
3034   // if (C6 != 0)
3035   auto Add4 = B.buildAdd(S64, Add3, One64);
3036   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3037 
3038   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3039   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3040   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3041 
3042   // endif C6
3043   // endif C3
3044 
3045   if (DstDivReg) {
3046     auto Sel1 = B.buildSelect(
3047         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3048     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3049                   Sel1, MulHi3);
3050   }
3051 
3052   if (DstRemReg) {
3053     auto Sel2 = B.buildSelect(
3054         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3055     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3056                   Sel2, Sub1);
3057   }
3058 }
3059 
3060 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3061                                                   MachineRegisterInfo &MRI,
3062                                                   MachineIRBuilder &B) const {
3063   Register DstDivReg, DstRemReg;
3064   switch (MI.getOpcode()) {
3065   default:
3066     llvm_unreachable("Unexpected opcode!");
3067   case AMDGPU::G_UDIV: {
3068     DstDivReg = MI.getOperand(0).getReg();
3069     break;
3070   }
3071   case AMDGPU::G_UREM: {
3072     DstRemReg = MI.getOperand(0).getReg();
3073     break;
3074   }
3075   case AMDGPU::G_UDIVREM: {
3076     DstDivReg = MI.getOperand(0).getReg();
3077     DstRemReg = MI.getOperand(1).getReg();
3078     break;
3079   }
3080   }
3081 
3082   const LLT S64 = LLT::scalar(64);
3083   const LLT S32 = LLT::scalar(32);
3084   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3085   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3086   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3087   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3088 
3089   if (Ty == S32)
3090     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3091   else if (Ty == S64)
3092     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3093   else
3094     return false;
3095 
3096   MI.eraseFromParent();
3097   return true;
3098 }
3099 
3100 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3101                                                 MachineRegisterInfo &MRI,
3102                                                 MachineIRBuilder &B) const {
3103   const LLT S64 = LLT::scalar(64);
3104   const LLT S32 = LLT::scalar(32);
3105 
3106   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3107   if (Ty != S32 && Ty != S64)
3108     return false;
3109 
3110   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3111   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3112   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3113 
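       // Take absolute values via (x + sign) ^ sign, where sign is x shifted
       // arithmetically right by (bits - 1); the sign values are kept to fix up
       // the quotient and remainder signs afterwards.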
3114   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3115   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3116   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3117 
3118   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3119   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3120 
3121   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3122   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3123 
3124   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3125   switch (MI.getOpcode()) {
3126   default:
3127     llvm_unreachable("Unexpected opcode!");
3128   case AMDGPU::G_SDIV: {
3129     DstDivReg = MI.getOperand(0).getReg();
3130     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3131     break;
3132   }
3133   case AMDGPU::G_SREM: {
3134     DstRemReg = MI.getOperand(0).getReg();
3135     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3136     break;
3137   }
3138   case AMDGPU::G_SDIVREM: {
3139     DstDivReg = MI.getOperand(0).getReg();
3140     DstRemReg = MI.getOperand(1).getReg();
3141     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3142     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3143     break;
3144   }
3145   }
3146 
3147   if (Ty == S32)
3148     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3149   else
3150     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3151 
3152   if (DstDivReg) {
3153     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3154     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3155     B.buildSub(DstDivReg, SignXor, Sign);
3156   }
3157 
3158   if (DstRemReg) {
3159     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3160     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3161     B.buildSub(DstRemReg, SignXor, Sign);
3162   }
3163 
3164   MI.eraseFromParent();
3165   return true;
3166 }
3167 
3168 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3169                                                  MachineRegisterInfo &MRI,
3170                                                  MachineIRBuilder &B) const {
3171   Register Res = MI.getOperand(0).getReg();
3172   Register LHS = MI.getOperand(1).getReg();
3173   Register RHS = MI.getOperand(2).getReg();
3174   uint16_t Flags = MI.getFlags();
3175   LLT ResTy = MRI.getType(Res);
3176 
3177   const MachineFunction &MF = B.getMF();
3178   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3179                             MI.getFlag(MachineInstr::FmAfn);
3180 
3181   if (!AllowInaccurateRcp)
3182     return false;
3183 
3184   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3185     // 1 / x -> RCP(x)
3186     if (CLHS->isExactlyValue(1.0)) {
3187       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3188         .addUse(RHS)
3189         .setMIFlags(Flags);
3190 
3191       MI.eraseFromParent();
3192       return true;
3193     }
3194 
3195     // -1 / x -> RCP( FNEG(x) )
3196     if (CLHS->isExactlyValue(-1.0)) {
3197       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3198       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3199         .addUse(FNeg.getReg(0))
3200         .setMIFlags(Flags);
3201 
3202       MI.eraseFromParent();
3203       return true;
3204     }
3205   }
3206 
3207   // x / y -> x * (1.0 / y)
3208   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3209     .addUse(RHS)
3210     .setMIFlags(Flags);
3211   B.buildFMul(Res, LHS, RCP, Flags);
3212 
3213   MI.eraseFromParent();
3214   return true;
3215 }
3216 
3217 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3218                                                    MachineRegisterInfo &MRI,
3219                                                    MachineIRBuilder &B) const {
3220   Register Res = MI.getOperand(0).getReg();
3221   Register X = MI.getOperand(1).getReg();
3222   Register Y = MI.getOperand(2).getReg();
3223   uint16_t Flags = MI.getFlags();
3224   LLT ResTy = MRI.getType(Res);
3225 
3226   const MachineFunction &MF = B.getMF();
3227   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3228                             MI.getFlag(MachineInstr::FmAfn);
3229 
3230   if (!AllowInaccurateRcp)
3231     return false;
3232 
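       // Compute r ~= 1/y with rcp, refine it with two Newton-Raphson steps,
       // form q = x * r, then apply one final fma-based correction to q.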
3233   auto NegY = B.buildFNeg(ResTy, Y);
3234   auto One = B.buildFConstant(ResTy, 1.0);
3235 
3236   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3237     .addUse(Y)
3238     .setMIFlags(Flags);
3239 
3240   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3241   R = B.buildFMA(ResTy, Tmp0, R, R);
3242 
3243   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3244   R = B.buildFMA(ResTy, Tmp1, R, R);
3245 
3246   auto Ret = B.buildFMul(ResTy, X, R);
3247   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3248 
3249   B.buildFMA(Res, Tmp2, R, Ret);
3250   MI.eraseFromParent();
3251   return true;
3252 }
3253 
3254 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3255                                          MachineRegisterInfo &MRI,
3256                                          MachineIRBuilder &B) const {
3257   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3258     return true;
3259 
3260   Register Res = MI.getOperand(0).getReg();
3261   Register LHS = MI.getOperand(1).getReg();
3262   Register RHS = MI.getOperand(2).getReg();
3263 
3264   uint16_t Flags = MI.getFlags();
3265 
3266   LLT S16 = LLT::scalar(16);
3267   LLT S32 = LLT::scalar(32);
3268 
3269   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3270   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3271 
3272   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3273     .addUse(RHSExt.getReg(0))
3274     .setMIFlags(Flags);
3275 
3276   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3277   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3278 
3279   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3280     .addUse(RDst.getReg(0))
3281     .addUse(RHS)
3282     .addUse(LHS)
3283     .setMIFlags(Flags);
3284 
3285   MI.eraseFromParent();
3286   return true;
3287 }
3288 
3289 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3290 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3291 static void toggleSPDenormMode(bool Enable,
3292                                MachineIRBuilder &B,
3293                                const GCNSubtarget &ST,
3294                                AMDGPU::SIModeRegisterDefaults Mode) {
3295   // Set SP denorm mode to this value.
3296   unsigned SPDenormMode =
3297     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3298 
3299   if (ST.hasDenormModeInst()) {
3300     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3301     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3302 
3303     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3304     B.buildInstr(AMDGPU::S_DENORM_MODE)
3305       .addImm(NewDenormModeValue);
3306 
3307   } else {
3308     // Select FP32 bit field in mode register.
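         // This encodes hwreg(MODE, offset 4, width 2), the two FP32 denorm bits.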
3309     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3310                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3311                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3312 
3313     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3314       .addImm(SPDenormMode)
3315       .addImm(SPDenormModeBitField);
3316   }
3317 }
3318 
3319 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3320                                          MachineRegisterInfo &MRI,
3321                                          MachineIRBuilder &B) const {
3322   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3323     return true;
3324 
3325   Register Res = MI.getOperand(0).getReg();
3326   Register LHS = MI.getOperand(1).getReg();
3327   Register RHS = MI.getOperand(2).getReg();
3328   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3329   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3330 
3331   uint16_t Flags = MI.getFlags();
3332 
3333   LLT S32 = LLT::scalar(32);
3334   LLT S1 = LLT::scalar(1);
3335 
3336   auto One = B.buildFConstant(S32, 1.0f);
3337 
3338   auto DenominatorScaled =
3339     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3340       .addUse(LHS)
3341       .addUse(RHS)
3342       .addImm(0)
3343       .setMIFlags(Flags);
3344   auto NumeratorScaled =
3345     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3346       .addUse(LHS)
3347       .addUse(RHS)
3348       .addImm(1)
3349       .setMIFlags(Flags);
3350 
3351   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3352     .addUse(DenominatorScaled.getReg(0))
3353     .setMIFlags(Flags);
3354   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3355 
3356   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3357   // aren't modeled as reading it.
3358   if (!Mode.allFP32Denormals())
3359     toggleSPDenormMode(true, B, ST, Mode);
3360 
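       // One Newton-Raphson step on the reciprocal, a scaled quotient estimate
       // with one fma-based refinement, and a final residual that feeds the
       // div_fmas/div_fixup sequence below.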
3361   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3362   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3363   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3364   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3365   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3366   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3367 
3368   if (!Mode.allFP32Denormals())
3369     toggleSPDenormMode(false, B, ST, Mode);
3370 
3371   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3372     .addUse(Fma4.getReg(0))
3373     .addUse(Fma1.getReg(0))
3374     .addUse(Fma3.getReg(0))
3375     .addUse(NumeratorScaled.getReg(1))
3376     .setMIFlags(Flags);
3377 
3378   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3379     .addUse(Fmas.getReg(0))
3380     .addUse(RHS)
3381     .addUse(LHS)
3382     .setMIFlags(Flags);
3383 
3384   MI.eraseFromParent();
3385   return true;
3386 }
3387 
3388 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3389                                          MachineRegisterInfo &MRI,
3390                                          MachineIRBuilder &B) const {
3391   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3392     return true;
3393 
3394   Register Res = MI.getOperand(0).getReg();
3395   Register LHS = MI.getOperand(1).getReg();
3396   Register RHS = MI.getOperand(2).getReg();
3397 
3398   uint16_t Flags = MI.getFlags();
3399 
3400   LLT S64 = LLT::scalar(64);
3401   LLT S1 = LLT::scalar(1);
3402 
3403   auto One = B.buildFConstant(S64, 1.0);
3404 
3405   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3406     .addUse(LHS)
3407     .addUse(RHS)
3408     .addImm(0)
3409     .setMIFlags(Flags);
3410 
3411   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3412 
3413   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3414     .addUse(DivScale0.getReg(0))
3415     .setMIFlags(Flags);
3416 
3417   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3418   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3419   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3420 
3421   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3422     .addUse(LHS)
3423     .addUse(RHS)
3424     .addImm(1)
3425     .setMIFlags(Flags);
3426 
3427   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3428   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3429   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3430 
3431   Register Scale;
3432   if (!ST.hasUsableDivScaleConditionOutput()) {
3433     // Work around a hardware bug on SI where the condition output from
3434     // div_scale is not usable.
3435 
3436     LLT S32 = LLT::scalar(32);
3437 
3438     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3439     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3440     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3441     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3442 
3443     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3444                               Scale1Unmerge.getReg(1));
3445     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3446                               Scale0Unmerge.getReg(1));
3447     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3448   } else {
3449     Scale = DivScale1.getReg(1);
3450   }
3451 
3452   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3453     .addUse(Fma4.getReg(0))
3454     .addUse(Fma3.getReg(0))
3455     .addUse(Mul.getReg(0))
3456     .addUse(Scale)
3457     .setMIFlags(Flags);
3458 
3459   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3460     .addUse(Fmas.getReg(0))
3461     .addUse(RHS)
3462     .addUse(LHS)
3463     .setMIFlags(Flags);
3464 
3465   MI.eraseFromParent();
3466   return true;
3467 }
3468 
3469 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3470                                                  MachineRegisterInfo &MRI,
3471                                                  MachineIRBuilder &B) const {
3472   Register Res = MI.getOperand(0).getReg();
3473   Register LHS = MI.getOperand(2).getReg();
3474   Register RHS = MI.getOperand(3).getReg();
3475   uint16_t Flags = MI.getFlags();
3476 
3477   LLT S32 = LLT::scalar(32);
3478   LLT S1 = LLT::scalar(1);
3479 
3480   auto Abs = B.buildFAbs(S32, RHS, Flags);
3481   const APFloat C0Val(1.0f);
3482 
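       // If |RHS| is very large (> 0x6f800000, i.e. 2^96), pre-scale it by
       // 2^-32 (0x2f800000) before the rcp and apply the same scale factor to
       // the final product so the reciprocal stays in range.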
3483   auto C0 = B.buildConstant(S32, 0x6f800000);
3484   auto C1 = B.buildConstant(S32, 0x2f800000);
3485   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3486 
3487   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3488   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3489 
3490   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3491 
3492   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3493     .addUse(Mul0.getReg(0))
3494     .setMIFlags(Flags);
3495 
3496   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3497 
3498   B.buildFMul(Res, Sel, Mul1, Flags);
3499 
3500   MI.eraseFromParent();
3501   return true;
3502 }
3503 
3504 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3505 // FIXME: Why do we handle this one but not other removed instructions?
3506 //
3507 // Reciprocal square root.  The clamp prevents infinite results, clamping
3508 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3509 // +-max_float.
3510 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3511                                                     MachineRegisterInfo &MRI,
3512                                                     MachineIRBuilder &B) const {
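       // Targets before Volcanic Islands still have the rsq_clamp instruction,
       // so the intrinsic can be selected directly and needs no expansion.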
3513   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3514     return true;
3515 
3516   Register Dst = MI.getOperand(0).getReg();
3517   Register Src = MI.getOperand(2).getReg();
3518   auto Flags = MI.getFlags();
3519 
3520   LLT Ty = MRI.getType(Dst);
3521 
3522   const fltSemantics *FltSemantics;
3523   if (Ty == LLT::scalar(32))
3524     FltSemantics = &APFloat::IEEEsingle();
3525   else if (Ty == LLT::scalar(64))
3526     FltSemantics = &APFloat::IEEEdouble();
3527   else
3528     return false;
3529 
3530   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3531     .addUse(Src)
3532     .setMIFlags(Flags);
3533 
3534   // We don't need to concern ourselves with the snan handling difference, since
3535   // the rsq already quieted (or not) the input; use the form that selects directly.
3536   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3537   const bool UseIEEE = MFI->getMode().IEEE;
3538 
3539   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3540   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3541                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3542 
3543   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3544 
3545   if (UseIEEE)
3546     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3547   else
3548     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3549   MI.eraseFromParent();
3550   return true;
3551 }
3552 
3553 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3554   switch (IID) {
3555   case Intrinsic::amdgcn_ds_fadd:
3556     return AMDGPU::G_ATOMICRMW_FADD;
3557   case Intrinsic::amdgcn_ds_fmin:
3558     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3559   case Intrinsic::amdgcn_ds_fmax:
3560     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3561   default:
3562     llvm_unreachable("not a DS FP intrinsic");
3563   }
3564 }
3565 
3566 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3567                                                       MachineInstr &MI,
3568                                                       Intrinsic::ID IID) const {
3569   GISelChangeObserver &Observer = Helper.Observer;
3570   Observer.changingInstr(MI);
3571 
3572   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3573 
3574   // The remaining operands were used to set fields in the MemOperand on
3575   // construction.
3576   for (int I = 6; I > 3; --I)
3577     MI.RemoveOperand(I);
3578 
3579   MI.RemoveOperand(1); // Remove the intrinsic ID.
3580   Observer.changedInstr(MI);
3581   return true;
3582 }
3583 
3584 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3585                                             MachineRegisterInfo &MRI,
3586                                             MachineIRBuilder &B) const {
3587   uint64_t Offset =
3588     ST.getTargetLowering()->getImplicitParameterOffset(
3589       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3590   LLT DstTy = MRI.getType(DstReg);
3591   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3592 
3593   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3594   if (!loadInputValue(KernargPtrReg, B,
3595                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3596     return false;
3597 
3598   // FIXME: This should be nuw
3599   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3600   return true;
3601 }
3602 
3603 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3604                                                  MachineRegisterInfo &MRI,
3605                                                  MachineIRBuilder &B) const {
3606   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3607   if (!MFI->isEntryFunction()) {
3608     return legalizePreloadedArgIntrin(MI, MRI, B,
3609                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3610   }
3611 
3612   Register DstReg = MI.getOperand(0).getReg();
3613   if (!getImplicitArgPtr(DstReg, MRI, B))
3614     return false;
3615 
3616   MI.eraseFromParent();
3617   return true;
3618 }
3619 
3620 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3621                                               MachineRegisterInfo &MRI,
3622                                               MachineIRBuilder &B,
3623                                               unsigned AddrSpace) const {
3624   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3625   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3626   Register Hi32 = Unmerge.getReg(1);
3627 
3628   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3629   MI.eraseFromParent();
3630   return true;
3631 }
3632 
3633 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3634 // offset (the offset that is included in bounds checking and swizzling, to be
3635 // split between the instruction's voffset and immoffset fields) and soffset
3636 // (the offset that is excluded from bounds checking and swizzling, to go in
3637 // the instruction's soffset field).  This function takes the first kind of
3638 // offset and figures out how to split it between voffset and immoffset.
3639 std::tuple<Register, unsigned, unsigned>
3640 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3641                                         Register OrigOffset) const {
3642   const unsigned MaxImm = 4095;
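       // MaxImm is the largest offset that fits in the buffer instruction's
       // 12-bit immediate offset field.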
3643   Register BaseReg;
3644   unsigned TotalConstOffset;
3645   const LLT S32 = LLT::scalar(32);
3646   MachineRegisterInfo &MRI = *B.getMRI();
3647 
3648   std::tie(BaseReg, TotalConstOffset) =
3649       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3650 
3651   unsigned ImmOffset = TotalConstOffset;
3652 
3653   // If BaseReg is a pointer, convert it to int.
3654   if (MRI.getType(BaseReg).isPointer())
3655     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3656 
3657   // If the immediate value is too big for the immoffset field, put the value
3658   // and -4096 into the immoffset field so that the value that is copied/added
3659   // for the voffset field is a multiple of 4096, and it stands more chance
3660   // of being CSEd with the copy/add for another similar load/store.
3661   // However, do not do that rounding down to a multiple of 4096 if that is a
3662   // negative number, as it appears to be illegal to have a negative offset
3663   // in the vgpr, even if adding the immediate offset makes it positive.
3664   unsigned Overflow = ImmOffset & ~MaxImm;
3665   ImmOffset -= Overflow;
3666   if ((int32_t)Overflow < 0) {
3667     Overflow += ImmOffset;
3668     ImmOffset = 0;
3669   }
3670 
3671   if (Overflow != 0) {
3672     if (!BaseReg) {
3673       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3674     } else {
3675       auto OverflowVal = B.buildConstant(S32, Overflow);
3676       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3677     }
3678   }
3679 
3680   if (!BaseReg)
3681     BaseReg = B.buildConstant(S32, 0).getReg(0);
3682 
3683   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3684 }
3685 
3686 /// Handle register layout difference for f16 images for some subtargets.
3687 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3688                                              MachineRegisterInfo &MRI,
3689                                              Register Reg,
3690                                              bool ImageStore) const {
3691   const LLT S16 = LLT::scalar(16);
3692   const LLT S32 = LLT::scalar(32);
3693   LLT StoreVT = MRI.getType(Reg);
3694   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3695 
3696   if (ST.hasUnpackedD16VMem()) {
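         // On unpacked subtargets each 16-bit element lives in the low half of
         // its own 32-bit register; any-extend the pieces and rebuild them as a
         // vector of s32.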
3697     auto Unmerge = B.buildUnmerge(S16, Reg);
3698 
3699     SmallVector<Register, 4> WideRegs;
3700     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3701       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3702 
3703     int NumElts = StoreVT.getNumElements();
3704 
3705     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3706         .getReg(0);
3707   }
3708 
3709   if (ImageStore && ST.hasImageStoreD16Bug()) {
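         // Pad the packed d16 data out with undef elements to the wider
         // register layout these subtargets require for image stores.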
3710     if (StoreVT.getNumElements() == 2) {
3711       SmallVector<Register, 4> PackedRegs;
3712       Reg = B.buildBitcast(S32, Reg).getReg(0);
3713       PackedRegs.push_back(Reg);
3714       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3715       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3716           .getReg(0);
3717     }
3718 
3719     if (StoreVT.getNumElements() == 3) {
3720       SmallVector<Register, 4> PackedRegs;
3721       auto Unmerge = B.buildUnmerge(S16, Reg);
3722       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3723         PackedRegs.push_back(Unmerge.getReg(I));
3724       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3725       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3726       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3727     }
3728 
3729     if (StoreVT.getNumElements() == 4) {
3730       SmallVector<Register, 4> PackedRegs;
3731       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3732       auto Unmerge = B.buildUnmerge(S32, Reg);
3733       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3734         PackedRegs.push_back(Unmerge.getReg(I));
3735       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3736       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3737           .getReg(0);
3738     }
3739 
3740     llvm_unreachable("invalid data type");
3741   }
3742 
3743   return Reg;
3744 }
3745 
3746 Register AMDGPULegalizerInfo::fixStoreSourceType(
3747   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3748   MachineRegisterInfo *MRI = B.getMRI();
3749   LLT Ty = MRI->getType(VData);
3750 
3751   const LLT S16 = LLT::scalar(16);
3752 
3753   // Fixup illegal register types for i8 stores.
3754   if (Ty == LLT::scalar(8) || Ty == S16) {
3755     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3756     return AnyExt;
3757   }
3758 
3759   if (Ty.isVector()) {
3760     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3761       if (IsFormat)
3762         return handleD16VData(B, *MRI, VData);
3763     }
3764   }
3765 
3766   return VData;
3767 }
3768 
3769 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3770                                               MachineRegisterInfo &MRI,
3771                                               MachineIRBuilder &B,
3772                                               bool IsTyped,
3773                                               bool IsFormat) const {
3774   Register VData = MI.getOperand(1).getReg();
3775   LLT Ty = MRI.getType(VData);
3776   LLT EltTy = Ty.getScalarType();
3777   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3778   const LLT S32 = LLT::scalar(32);
3779 
3780   VData = fixStoreSourceType(B, VData, IsFormat);
3781   Register RSrc = MI.getOperand(2).getReg();
3782 
3783   MachineMemOperand *MMO = *MI.memoperands_begin();
3784   const int MemSize = MMO->getSize();
3785 
3786   unsigned ImmOffset;
3787   unsigned TotalOffset;
3788 
3789   // The typed intrinsics add an immediate after the registers.
3790   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3791 
3792   // The struct intrinsic variants add one additional operand over raw.
3793   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3794   Register VIndex;
3795   int OpOffset = 0;
3796   if (HasVIndex) {
3797     VIndex = MI.getOperand(3).getReg();
3798     OpOffset = 1;
3799   }
3800 
3801   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3802   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3803 
3804   unsigned Format = 0;
3805   if (IsTyped) {
3806     Format = MI.getOperand(5 + OpOffset).getImm();
3807     ++OpOffset;
3808   }
3809 
3810   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3811 
3812   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3813   if (TotalOffset != 0)
3814     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3815 
3816   unsigned Opc;
3817   if (IsTyped) {
3818     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3819                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3820   } else if (IsFormat) {
3821     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3822                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3823   } else {
3824     switch (MemSize) {
3825     case 1:
3826       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3827       break;
3828     case 2:
3829       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3830       break;
3831     default:
3832       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3833       break;
3834     }
3835   }
3836 
3837   if (!VIndex)
3838     VIndex = B.buildConstant(S32, 0).getReg(0);
3839 
3840   auto MIB = B.buildInstr(Opc)
3841     .addUse(VData)              // vdata
3842     .addUse(RSrc)               // rsrc
3843     .addUse(VIndex)             // vindex
3844     .addUse(VOffset)            // voffset
3845     .addUse(SOffset)            // soffset
3846     .addImm(ImmOffset);         // offset(imm)
3847 
3848   if (IsTyped)
3849     MIB.addImm(Format);
3850 
3851   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3852      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3853      .addMemOperand(MMO);
3854 
3855   MI.eraseFromParent();
3856   return true;
3857 }
3858 
3859 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3860                                              MachineRegisterInfo &MRI,
3861                                              MachineIRBuilder &B,
3862                                              bool IsFormat,
3863                                              bool IsTyped) const {
3864   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3865   MachineMemOperand *MMO = *MI.memoperands_begin();
3866   const int MemSize = MMO->getSize();
3867   const LLT S32 = LLT::scalar(32);
3868 
3869   Register Dst = MI.getOperand(0).getReg();
3870   Register RSrc = MI.getOperand(2).getReg();
3871 
3872   // The typed intrinsics add an immediate after the registers.
3873   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3874 
3875   // The struct intrinsic variants add one additional operand over raw.
3876   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3877   Register VIndex;
3878   int OpOffset = 0;
3879   if (HasVIndex) {
3880     VIndex = MI.getOperand(3).getReg();
3881     OpOffset = 1;
3882   }
3883 
3884   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3885   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3886 
3887   unsigned Format = 0;
3888   if (IsTyped) {
3889     Format = MI.getOperand(5 + OpOffset).getImm();
3890     ++OpOffset;
3891   }
3892 
3893   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3894   unsigned ImmOffset;
3895   unsigned TotalOffset;
3896 
3897   LLT Ty = MRI.getType(Dst);
3898   LLT EltTy = Ty.getScalarType();
3899   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3900   const bool Unpacked = ST.hasUnpackedD16VMem();
3901 
3902   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3903   if (TotalOffset != 0)
3904     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3905 
3906   unsigned Opc;
3907 
3908   if (IsTyped) {
3909     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3910                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3911   } else if (IsFormat) {
3912     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3913                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3914   } else {
3915     switch (MemSize) {
3916     case 1:
3917       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3918       break;
3919     case 2:
3920       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3921       break;
3922     default:
3923       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3924       break;
3925     }
3926   }
3927 
3928   Register LoadDstReg;
3929 
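       // Sub-dword loads (and scalar d16 loads) are widened to 32 bits and
       // truncated afterwards; packed d16 vector results on unpacked
       // subtargets use a 32-bit-per-element temporary and are repacked below.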
3930   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3931   LLT UnpackedTy = Ty.changeElementSize(32);
3932 
3933   if (IsExtLoad)
3934     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3935   else if (Unpacked && IsD16 && Ty.isVector())
3936     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3937   else
3938     LoadDstReg = Dst;
3939 
3940   if (!VIndex)
3941     VIndex = B.buildConstant(S32, 0).getReg(0);
3942 
3943   auto MIB = B.buildInstr(Opc)
3944     .addDef(LoadDstReg)         // vdata
3945     .addUse(RSrc)               // rsrc
3946     .addUse(VIndex)             // vindex
3947     .addUse(VOffset)            // voffset
3948     .addUse(SOffset)            // soffset
3949     .addImm(ImmOffset);         // offset(imm)
3950 
3951   if (IsTyped)
3952     MIB.addImm(Format);
3953 
3954   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3955      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3956      .addMemOperand(MMO);
3957 
3958   if (LoadDstReg != Dst) {
3959     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3960 
3961     // The result was widened for the extending load; truncate it back to Dst.
3962     if (IsExtLoad)
3963       B.buildTrunc(Dst, LoadDstReg);
3964     else {
3965       // Repack to original 16-bit vector result
3966       // FIXME: G_TRUNC should work, but legalization currently fails
3967       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3968       SmallVector<Register, 4> Repack;
3969       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3970         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3971       B.buildMerge(Dst, Repack);
3972     }
3973   }
3974 
3975   MI.eraseFromParent();
3976   return true;
3977 }
3978 
3979 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3980                                                MachineIRBuilder &B,
3981                                                bool IsInc) const {
3982   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3983                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
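       // Operand 1 is the intrinsic ID; operands 2 and 3 carry the pointer and
       // data operands of the atomic.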
3984   B.buildInstr(Opc)
3985     .addDef(MI.getOperand(0).getReg())
3986     .addUse(MI.getOperand(2).getReg())
3987     .addUse(MI.getOperand(3).getReg())
3988     .cloneMemRefs(MI);
3989   MI.eraseFromParent();
3990   return true;
3991 }
3992 
3993 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3994   switch (IntrID) {
3995   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3996   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3997     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3998   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3999   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4000     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4001   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4002   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4003     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4004   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4005   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4006     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4007   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4008   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4009     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4010   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4011   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4012     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4013   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4014   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4015     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4016   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4017   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4018     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4019   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4020   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4021     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4022   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4023   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4024     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4025   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4027     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4028   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4029   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4030     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4031   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4032   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4033     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4034   case Intrinsic::amdgcn_buffer_atomic_fadd:
4035   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4037     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4038   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4039   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4040     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4041   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4042   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4043     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4044   default:
4045     llvm_unreachable("unhandled atomic opcode");
4046   }
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4050                                                MachineIRBuilder &B,
4051                                                Intrinsic::ID IID) const {
4052   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4053                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4054   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4055 
4056   Register Dst;
4057 
4058   int OpOffset = 0;
4059   if (HasReturn) {
4060     // A few FP atomics do not support return values.
4061     Dst = MI.getOperand(0).getReg();
4062   } else {
4063     OpOffset = -1;
4064   }
4065 
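       // Operand indices below assume the intrinsic has a return value;
       // OpOffset compensates when it does not.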
4066   Register VData = MI.getOperand(2 + OpOffset).getReg();
4067   Register CmpVal;
4068 
4069   if (IsCmpSwap) {
4070     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4071     ++OpOffset;
4072   }
4073 
4074   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4075   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4076 
4077   // The struct intrinsic variants add one additional operand over raw.
4078   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4079   Register VIndex;
4080   if (HasVIndex) {
4081     VIndex = MI.getOperand(4 + OpOffset).getReg();
4082     ++OpOffset;
4083   }
4084 
4085   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4086   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4087   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4088 
4089   MachineMemOperand *MMO = *MI.memoperands_begin();
4090 
4091   unsigned ImmOffset;
4092   unsigned TotalOffset;
4093   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4094   if (TotalOffset != 0)
4095     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4096 
4097   if (!VIndex)
4098     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4099 
4100   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4101 
4102   if (HasReturn)
4103     MIB.addDef(Dst);
4104 
4105   MIB.addUse(VData); // vdata
4106 
4107   if (IsCmpSwap)
4108     MIB.addReg(CmpVal);
4109 
4110   MIB.addUse(RSrc)               // rsrc
4111      .addUse(VIndex)             // vindex
4112      .addUse(VOffset)            // voffset
4113      .addUse(SOffset)            // soffset
4114      .addImm(ImmOffset)          // offset(imm)
4115      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4116      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4117      .addMemOperand(MMO);
4118 
4119   MI.eraseFromParent();
4120   return true;
4121 }
4122 
4123 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4124 /// vector with s16 typed elements.
4125 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4126                                       SmallVectorImpl<Register> &PackedAddrs,
4127                                       unsigned ArgOffset,
4128                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4129                                       bool IsA16, bool IsG16) {
4130   const LLT S16 = LLT::scalar(16);
4131   const LLT V2S16 = LLT::fixed_vector(2, 16);
4132   auto EndIdx = Intr->VAddrEnd;
4133 
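       // Operands before the gradients remain single dwords and are just
       // reinterpreted as v2s16; gradient and coordinate operands are packed
       // two at a time below, depending on IsG16/IsA16.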
4134   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4135     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4136     if (!SrcOp.isReg())
4137       continue; // _L to _LZ may have eliminated this.
4138 
4139     Register AddrReg = SrcOp.getReg();
4140 
4141     if (I < Intr->GradientStart) {
4142       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4143       PackedAddrs.push_back(AddrReg);
4144     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4145                (I >= Intr->CoordStart && !IsA16)) {
4146       // Handle any gradient or coordinate operands that should not be packed
4147       PackedAddrs.push_back(AddrReg);
4148     } else {
4149       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4150       // derivatives dx/dh and dx/dv are packed with undef.
4151       if (((I + 1) >= EndIdx) ||
4152           ((Intr->NumGradients / 2) % 2 == 1 &&
4153            (I == static_cast<unsigned>(Intr->GradientStart +
4154                                        (Intr->NumGradients / 2) - 1) ||
4155             I == static_cast<unsigned>(Intr->GradientStart +
4156                                        Intr->NumGradients - 1))) ||
4157           // Check for _L to _LZ optimization
4158           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4159         PackedAddrs.push_back(
4160             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4161                 .getReg(0));
4162       } else {
4163         PackedAddrs.push_back(
4164             B.buildBuildVector(
4165                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4166                 .getReg(0));
4167         ++I;
4168       }
4169     }
4170   }
4171 }
4172 
4173 /// Convert from separate vaddr components to a single vector address register,
4174 /// and replace the remaining operands with $noreg.
4175 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4176                                      int DimIdx, int NumVAddrs) {
4177   const LLT S32 = LLT::scalar(32);
4178 
4179   SmallVector<Register, 8> AddrRegs;
4180   for (int I = 0; I != NumVAddrs; ++I) {
4181     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4182     if (SrcOp.isReg()) {
4183       AddrRegs.push_back(SrcOp.getReg());
4184       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4185     }
4186   }
4187 
4188   int NumAddrRegs = AddrRegs.size();
4189   if (NumAddrRegs != 1) {
4190     // Round up to 8 elements for v5-v7
4191     // FIXME: Missing intermediate sized register classes and instructions.
4192     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4193       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4194       auto Undef = B.buildUndef(S32);
4195       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4196       NumAddrRegs = RoundedNumRegs;
4197     }
4198 
4199     auto VAddr =
4200         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4201     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4202   }
4203 
4204   for (int I = 1; I != NumVAddrs; ++I) {
4205     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4206     if (SrcOp.isReg())
4207       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4208   }
4209 }
4210 
4211 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4212 ///
4213 /// Depending on the subtarget, load/store with 16-bit element data need to be
4214 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4215 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4216 /// registers.
4217 ///
4218 /// We don't want to directly select image instructions just yet, but also want
4219 /// to expose all register repacking to the legalizer/combiners. We also don't
4220 /// want a selected instruction entering RegBankSelect. In order to avoid
4221 /// defining a multitude of intermediate image instructions, directly hack on
4222 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4223 /// now unnecessary arguments with $noreg.
4224 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4225     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4226     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4227 
4228   const unsigned NumDefs = MI.getNumExplicitDefs();
4229   const unsigned ArgOffset = NumDefs + 1;
4230   bool IsTFE = NumDefs == 2;
4231   // We are only processing the operands of d16 image operations on subtargets
4232   // that use the unpacked register layout, or need to repack the TFE result.
4233 
4234   // TODO: Do we need to guard against already legalized intrinsics?
4235   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4236       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4237 
4238   MachineRegisterInfo *MRI = B.getMRI();
4239   const LLT S32 = LLT::scalar(32);
4240   const LLT S16 = LLT::scalar(16);
4241   const LLT V2S16 = LLT::fixed_vector(2, 16);
4242 
4243   unsigned DMask = 0;
4244 
4245   // Check for 16-bit addresses and pack them if so.
4246   LLT GradTy =
4247       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4248   LLT AddrTy =
4249       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4250   const bool IsG16 = GradTy == S16;
4251   const bool IsA16 = AddrTy == S16;
4252 
4253   int DMaskLanes = 0;
4254   if (!BaseOpcode->Atomic) {
4255     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4256     if (BaseOpcode->Gather4) {
4257       DMaskLanes = 4;
4258     } else if (DMask != 0) {
4259       DMaskLanes = countPopulation(DMask);
4260     } else if (!IsTFE && !BaseOpcode->Store) {
4261       // If dmask is 0, this is a no-op load. This can be eliminated.
4262       B.buildUndef(MI.getOperand(0));
4263       MI.eraseFromParent();
4264       return true;
4265     }
4266   }
4267 
4268   Observer.changingInstr(MI);
4269   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4270 
4271   unsigned NewOpcode = NumDefs == 0 ?
4272     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4273 
4274   // Track that we legalized this
4275   MI.setDesc(B.getTII().get(NewOpcode));
4276 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4279   if (IsTFE && DMask == 0) {
4280     DMask = 0x1;
4281     DMaskLanes = 1;
4282     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4283   }
4284 
4285   if (BaseOpcode->Atomic) {
4286     Register VData0 = MI.getOperand(2).getReg();
4287     LLT Ty = MRI->getType(VData0);
4288 
4289     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4290     if (Ty.isVector())
4291       return false;
4292 
4293     if (BaseOpcode->AtomicX2) {
4294       Register VData1 = MI.getOperand(3).getReg();
4295       // The two values are packed in one register.
4296       LLT PackedTy = LLT::fixed_vector(2, Ty);
4297       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4298       MI.getOperand(2).setReg(Concat.getReg(0));
4299       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4300     }
4301   }
4302 
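  // Track how many address operands will remain after the LOD/mip folding
  // below; this count feeds the NSA encoding decision.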
4303   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4304 
4305   // Optimize _L to _LZ when _L is zero
4306   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4307           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4308     const ConstantFP *ConstantLod;
4309 
4310     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4311                  m_GFCst(ConstantLod))) {
4312       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4313         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4314         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4315             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4316                                                       Intr->Dim);
4317 
4318         // The starting indexes should remain in the same place.
4319         --CorrectedNumVAddrs;
4320 
4321         MI.getOperand(MI.getNumExplicitDefs())
4322             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4323         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4324         Intr = NewImageDimIntr;
4325       }
4326     }
4327   }
4328 
  // Optimize _mip away when 'lod' is zero
4330   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4331     int64_t ConstantLod;
4332     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4333                  m_ICst(ConstantLod))) {
4334       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4337         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4338         --CorrectedNumVAddrs;
4339       }
4340     }
4341   }
4342 
4343   // Rewrite the addressing register layout before doing anything else.
4344   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
4347     return false;
4348   }
4349 
4350   if (IsA16 && !ST.hasA16()) {
4351     // A16 not supported
4352     return false;
4353   }
4354 
4355   if (IsA16 || IsG16) {
4356     if (Intr->NumVAddrs > 1) {
4357       SmallVector<Register, 4> PackedRegs;
4358 
4359       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4360                                 IsG16);
4361 
4362       // See also below in the non-a16 branch
4363       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4364 
4365       if (!UseNSA && PackedRegs.size() > 1) {
4366         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4367         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4368         PackedRegs[0] = Concat.getReg(0);
4369         PackedRegs.resize(1);
4370       }
4371 
4372       const unsigned NumPacked = PackedRegs.size();
4373       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4374         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4375         if (!SrcOp.isReg()) {
4376           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4377           continue;
4378         }
4379 
4380         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4381 
4382         if (I - Intr->VAddrStart < NumPacked)
4383           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4384         else
4385           SrcOp.setReg(AMDGPU::NoRegister);
4386       }
4387     }
4388   } else {
4389     // If the register allocator cannot place the address registers contiguously
4390     // without introducing moves, then using the non-sequential address encoding
4391     // is always preferable, since it saves VALU instructions and is usually a
4392     // wash in terms of code size or even better.
4393     //
4394     // However, we currently have no way of hinting to the register allocator
4395     // that MIMG addresses should be placed contiguously when it is possible to
4396     // do so, so force non-NSA for the common 2-address case as a heuristic.
4397     //
4398     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4399     // allocation when possible.
4400     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4401 
4402     if (!UseNSA && Intr->NumVAddrs > 1)
4403       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4404                                Intr->NumVAddrs);
4405   }
4406 
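  // Record whether 16-bit addresses (A16) and 16-bit gradients (G16) are in
  // use as an immediate flag operand: bit 0 = A16, bit 1 = G16.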
4407   int Flags = 0;
4408   if (IsA16)
4409     Flags |= 1;
4410   if (IsG16)
4411     Flags |= 2;
4412   MI.addOperand(MachineOperand::CreateImm(Flags));
4413 
4414   if (BaseOpcode->Store) { // No TFE for stores?
4415     // TODO: Handle dmask trim
4416     Register VData = MI.getOperand(1).getReg();
4417     LLT Ty = MRI->getType(VData);
4418     if (!Ty.isVector() || Ty.getElementType() != S16)
4419       return true;
4420 
4421     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4422     if (RepackedReg != VData) {
4423       MI.getOperand(1).setReg(RepackedReg);
4424     }
4425 
4426     return true;
4427   }
4428 
4429   Register DstReg = MI.getOperand(0).getReg();
4430   LLT Ty = MRI->getType(DstReg);
4431   const LLT EltTy = Ty.getScalarType();
4432   const bool IsD16 = Ty.getScalarType() == S16;
4433   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4434 
4435   // Confirm that the return type is large enough for the dmask specified
4436   if (NumElts < DMaskLanes)
4437     return false;
4438 
4439   if (NumElts > 4 || DMaskLanes > 4)
4440     return false;
4441 
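  // Adjust the result type to cover only the components enabled by the dmask.
  // Atomics leave DMaskLanes at 0 but still produce a single element.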
4442   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4443   const LLT AdjustedTy =
4444       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4445 
4446   // The raw dword aligned data component of the load. The only legal cases
4447   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4449   LLT RoundedTy;
4450 
  // S32 vector to cover all data, plus the TFE result element.
4452   LLT TFETy;
4453 
4454   // Register type to use for each loaded component. Will be S32 or V2S16.
4455   LLT RegTy;
4456 
4457   if (IsD16 && ST.hasUnpackedD16VMem()) {
4458     RoundedTy =
4459         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4460     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4461     RegTy = S32;
4462   } else {
4463     unsigned EltSize = EltTy.getSizeInBits();
4464     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4465     unsigned RoundedSize = 32 * RoundedElts;
4466     RoundedTy = LLT::scalarOrVector(
4467         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4468     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4469     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4470   }
4471 
4472   // The return type does not need adjustment.
4473   // TODO: Should we change s16 case to s32 or <2 x s16>?
4474   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4475     return true;
4476 
4477   Register Dst1Reg;
4478 
4479   // Insert after the instruction.
4480   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4481 
4482   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4483   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4484   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4485   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4486 
4487   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4488 
4489   MI.getOperand(0).setReg(NewResultReg);
4490 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4495 
4496   if (IsTFE) {
4497     Dst1Reg = MI.getOperand(1).getReg();
4498     if (MRI->getType(Dst1Reg) != S32)
4499       return false;
4500 
4501     // TODO: Make sure the TFE operand bit is set.
4502     MI.RemoveOperand(1);
4503 
4504     // Handle the easy case that requires no repack instructions.
4505     if (Ty == S32) {
4506       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4507       return true;
4508     }
4509   }
4510 
4511   // Now figure out how to copy the new result register back into the old
4512   // result.
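  // Seed every slot with Dst1Reg so that, for TFE, the trailing status dword
  // is unmerged directly into its final destination below.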
4513   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4514 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4516 
4517   if (ResultNumRegs == 1) {
4518     assert(!IsTFE);
4519     ResultRegs[0] = NewResultReg;
4520   } else {
4521     // We have to repack into a new vector of some kind.
4522     for (int I = 0; I != NumDataRegs; ++I)
4523       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4524     B.buildUnmerge(ResultRegs, NewResultReg);
4525 
4526     // Drop the final TFE element to get the data part. The TFE result is
4527     // directly written to the right place already.
4528     if (IsTFE)
4529       ResultRegs.resize(NumDataRegs);
4530   }
4531 
4532   // For an s16 scalar result, we form an s32 result with a truncate regardless
4533   // of packed vs. unpacked.
4534   if (IsD16 && !Ty.isVector()) {
4535     B.buildTrunc(DstReg, ResultRegs[0]);
4536     return true;
4537   }
4538 
4539   // Avoid a build/concat_vector of 1 entry.
4540   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4541     B.buildBitcast(DstReg, ResultRegs[0]);
4542     return true;
4543   }
4544 
4545   assert(Ty.isVector());
4546 
4547   if (IsD16) {
4548     // For packed D16 results with TFE enabled, all the data components are
4549     // S32. Cast back to the expected type.
4550     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4553     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4554       for (Register &Reg : ResultRegs)
4555         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4556     } else if (ST.hasUnpackedD16VMem()) {
4557       for (Register &Reg : ResultRegs)
4558         Reg = B.buildTrunc(S16, Reg).getReg(0);
4559     }
4560   }
4561 
4562   auto padWithUndef = [&](LLT Ty, int NumElts) {
4563     if (NumElts == 0)
4564       return;
4565     Register Undef = B.buildUndef(Ty).getReg(0);
4566     for (int I = 0; I != NumElts; ++I)
4567       ResultRegs.push_back(Undef);
4568   };
4569 
4570   // Pad out any elements eliminated due to the dmask.
4571   LLT ResTy = MRI->getType(ResultRegs[0]);
4572   if (!ResTy.isVector()) {
4573     padWithUndef(ResTy, NumElts - ResultRegs.size());
4574     B.buildBuildVector(DstReg, ResultRegs);
4575     return true;
4576   }
4577 
4578   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4579   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4580 
4581   // Deal with the one annoying legal case.
4582   const LLT V3S16 = LLT::fixed_vector(3, 16);
4583   if (Ty == V3S16) {
4584     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4585     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4586     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4587     return true;
4588   }
4589 
4590   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4591   B.buildConcatVectors(DstReg, ResultRegs);
4592   return true;
4593 }
4594 
4595 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4596   LegalizerHelper &Helper, MachineInstr &MI) const {
4597   MachineIRBuilder &B = Helper.MIRBuilder;
4598   GISelChangeObserver &Observer = Helper.Observer;
4599 
4600   Register Dst = MI.getOperand(0).getReg();
4601   LLT Ty = B.getMRI()->getType(Dst);
4602   unsigned Size = Ty.getSizeInBits();
4603   MachineFunction &MF = B.getMF();
4604 
4605   Observer.changingInstr(MI);
4606 
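  // If the result type isn't one the selection patterns handle directly,
  // bitcast the destination to an equivalently sized register type first.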
4607   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4608     Ty = getBitcastRegisterType(Ty);
4609     Helper.bitcastDst(MI, Ty, 0);
4610     Dst = MI.getOperand(0).getReg();
4611     B.setInsertPt(B.getMBB(), MI);
4612   }
4613 
4614   // FIXME: We don't really need this intermediate instruction. The intrinsic
4615   // should be fixed to have a memory operand. Since it's readnone, we're not
4616   // allowed to add one.
4617   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4618   MI.RemoveOperand(1); // Remove intrinsic ID
4619 
4620   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4621   // TODO: Should this use datalayout alignment?
4622   const unsigned MemSize = (Size + 7) / 8;
4623   const Align MemAlign(4);
4624   MachineMemOperand *MMO = MF.getMachineMemOperand(
4625       MachinePointerInfo(),
4626       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4627           MachineMemOperand::MOInvariant,
4628       MemSize, MemAlign);
4629   MI.addMemOperand(MF, MMO);
4630 
4631   // There are no 96-bit result scalar loads, but widening to 128-bit should
4632   // always be legal. We may need to restore this to a 96-bit result if it turns
4633   // out this needs to be converted to a vector load during RegBankSelect.
4634   if (!isPowerOf2_32(Size)) {
4635     if (Ty.isVector())
4636       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4637     else
4638       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4639   }
4640 
4641   Observer.changedInstr(MI);
4642   return true;
4643 }
4644 
4645 // TODO: Move to selection
4646 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4647                                                 MachineRegisterInfo &MRI,
4648                                                 MachineIRBuilder &B) const {
4649   if (!ST.isTrapHandlerEnabled() ||
4650       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4651     return legalizeTrapEndpgm(MI, MRI, B);
4652 
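  // Select the trap lowering based on the HSA code object ABI version: V2 and
  // V3 pass the queue pointer to the trap handler, while V4 only needs it when
  // the subtarget cannot provide the doorbell ID itself.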
4653   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4654     switch (*HsaAbiVer) {
4655     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4656     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4657       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4658     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4659       return ST.supportsGetDoorbellID() ?
4660           legalizeTrapHsa(MI, MRI, B) :
4661           legalizeTrapHsaQueuePtr(MI, MRI, B);
4662     }
4663   }
4664 
4665   llvm_unreachable("Unknown trap handler");
4666 }
4667 
4668 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4669     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4670   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4671   MI.eraseFromParent();
4672   return true;
4673 }
4674 
4675 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4676     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4677   // Pass queue pointer to trap handler as input, and insert trap instruction
4678   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4679   Register LiveIn =
4680     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4681   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4682     return false;
4683 
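  // The trap handler expects the queue pointer in SGPR0/SGPR1; copy it there
  // and mark it as an implicit use so the copy is not treated as dead.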
4684   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4685   B.buildCopy(SGPR01, LiveIn);
4686   B.buildInstr(AMDGPU::S_TRAP)
4687       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4688       .addReg(SGPR01, RegState::Implicit);
4689 
4690   MI.eraseFromParent();
4691   return true;
4692 }
4693 
4694 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4695     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4696   B.buildInstr(AMDGPU::S_TRAP)
4697       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4698   MI.eraseFromParent();
4699   return true;
4700 }
4701 
4702 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4703     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is the non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4706   if (!ST.isTrapHandlerEnabled() ||
4707       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4708     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4709                                      "debugtrap handler not supported",
4710                                      MI.getDebugLoc(), DS_Warning);
4711     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4712     Ctx.diagnose(NoTrap);
4713   } else {
4714     // Insert debug-trap instruction
4715     B.buildInstr(AMDGPU::S_TRAP)
4716         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4717   }
4718 
4719   MI.eraseFromParent();
4720   return true;
4721 }
4722 
4723 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4724                                                MachineIRBuilder &B) const {
4725   MachineRegisterInfo &MRI = *B.getMRI();
4726   const LLT S16 = LLT::scalar(16);
4727   const LLT S32 = LLT::scalar(32);
4728 
4729   Register DstReg = MI.getOperand(0).getReg();
4730   Register NodePtr = MI.getOperand(2).getReg();
4731   Register RayExtent = MI.getOperand(3).getReg();
4732   Register RayOrigin = MI.getOperand(4).getReg();
4733   Register RayDir = MI.getOperand(5).getReg();
4734   Register RayInvDir = MI.getOperand(6).getReg();
4735   Register TDescr = MI.getOperand(7).getReg();
4736 
4737   if (!ST.hasGFX10_AEncoding()) {
4738     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4739                                         "intrinsic not supported on subtarget",
4740                                         MI.getDebugLoc());
4741     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4742     return false;
4743   }
4744 
4745   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4747   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4748                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4749                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4750                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4751 
4752   SmallVector<Register, 12> Ops;
4753   if (Is64) {
4754     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4755     Ops.push_back(Unmerge.getReg(0));
4756     Ops.push_back(Unmerge.getReg(1));
4757   } else {
4758     Ops.push_back(NodePtr);
4759   }
4760   Ops.push_back(RayExtent);
4761 
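  // Pushes the first three 32-bit lanes of a four-element vector operand; the
  // fourth lane is not used by the intrinsic.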
4762   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4763     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4764     Ops.push_back(Unmerge.getReg(0));
4765     Ops.push_back(Unmerge.getReg(1));
4766     Ops.push_back(Unmerge.getReg(2));
4767   };
4768 
4769   packLanes(RayOrigin);
4770   if (IsA16) {
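    // With 16-bit components, pack the three ray direction and three inverse
    // direction values into three dwords:
    //   (dir.x, dir.y), (dir.z, inv_dir.x), (inv_dir.y, inv_dir.z)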
4771     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4772     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4773     Register R1 = MRI.createGenericVirtualRegister(S32);
4774     Register R2 = MRI.createGenericVirtualRegister(S32);
4775     Register R3 = MRI.createGenericVirtualRegister(S32);
4776     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4777     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4778     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4779     Ops.push_back(R1);
4780     Ops.push_back(R2);
4781     Ops.push_back(R3);
4782   } else {
4783     packLanes(RayDir);
4784     packLanes(RayInvDir);
4785   }
4786 
4787   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4788     .addDef(DstReg)
4789     .addImm(Opcode);
4790 
4791   for (Register R : Ops) {
4792     MIB.addUse(R);
4793   }
4794 
4795   MIB.addUse(TDescr)
4796      .addImm(IsA16 ? 1 : 0)
4797      .cloneMemRefs(MI);
4798 
4799   MI.eraseFromParent();
4800   return true;
4801 }
4802 
4803 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4804                                             MachineInstr &MI) const {
4805   MachineIRBuilder &B = Helper.MIRBuilder;
4806   MachineRegisterInfo &MRI = *B.getMRI();
4807 
  // Replace the G_BRCOND that uses these intrinsics with the exec-manipulating
  // branch pseudos.
4809   auto IntrID = MI.getIntrinsicID();
4810   switch (IntrID) {
4811   case Intrinsic::amdgcn_if:
4812   case Intrinsic::amdgcn_else: {
4813     MachineInstr *Br = nullptr;
4814     MachineBasicBlock *UncondBrTarget = nullptr;
4815     bool Negated = false;
4816     if (MachineInstr *BrCond =
4817             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4818       const SIRegisterInfo *TRI
4819         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4820 
4821       Register Def = MI.getOperand(1).getReg();
4822       Register Use = MI.getOperand(3).getReg();
4823 
4824       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4825 
4826       if (Negated)
4827         std::swap(CondBrTarget, UncondBrTarget);
4828 
4829       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4830       if (IntrID == Intrinsic::amdgcn_if) {
4831         B.buildInstr(AMDGPU::SI_IF)
4832           .addDef(Def)
4833           .addUse(Use)
4834           .addMBB(UncondBrTarget);
4835       } else {
4836         B.buildInstr(AMDGPU::SI_ELSE)
4837             .addDef(Def)
4838             .addUse(Use)
4839             .addMBB(UncondBrTarget);
4840       }
4841 
4842       if (Br) {
4843         Br->getOperand(0).setMBB(CondBrTarget);
4844       } else {
4845         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4846         // since we're swapping branch targets it needs to be reinserted.
4847         // FIXME: IRTranslator should probably not do this
4848         B.buildBr(*CondBrTarget);
4849       }
4850 
4851       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4852       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4853       MI.eraseFromParent();
4854       BrCond->eraseFromParent();
4855       return true;
4856     }
4857 
4858     return false;
4859   }
4860   case Intrinsic::amdgcn_loop: {
4861     MachineInstr *Br = nullptr;
4862     MachineBasicBlock *UncondBrTarget = nullptr;
4863     bool Negated = false;
4864     if (MachineInstr *BrCond =
4865             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4866       const SIRegisterInfo *TRI
4867         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4868 
4869       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4870       Register Reg = MI.getOperand(2).getReg();
4871 
4872       if (Negated)
4873         std::swap(CondBrTarget, UncondBrTarget);
4874 
4875       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4876       B.buildInstr(AMDGPU::SI_LOOP)
4877         .addUse(Reg)
4878         .addMBB(UncondBrTarget);
4879 
4880       if (Br)
4881         Br->getOperand(0).setMBB(CondBrTarget);
4882       else
4883         B.buildBr(*CondBrTarget);
4884 
4885       MI.eraseFromParent();
4886       BrCond->eraseFromParent();
4887       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4888       return true;
4889     }
4890 
4891     return false;
4892   }
4893   case Intrinsic::amdgcn_kernarg_segment_ptr:
4894     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4895       // This only makes sense to call in a kernel, so just lower to null.
4896       B.buildConstant(MI.getOperand(0).getReg(), 0);
4897       MI.eraseFromParent();
4898       return true;
4899     }
4900 
4901     return legalizePreloadedArgIntrin(
4902       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4903   case Intrinsic::amdgcn_implicitarg_ptr:
4904     return legalizeImplicitArgPtr(MI, MRI, B);
4905   case Intrinsic::amdgcn_workitem_id_x:
4906     return legalizePreloadedArgIntrin(MI, MRI, B,
4907                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4908   case Intrinsic::amdgcn_workitem_id_y:
4909     return legalizePreloadedArgIntrin(MI, MRI, B,
4910                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4911   case Intrinsic::amdgcn_workitem_id_z:
4912     return legalizePreloadedArgIntrin(MI, MRI, B,
4913                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4914   case Intrinsic::amdgcn_workgroup_id_x:
4915     return legalizePreloadedArgIntrin(MI, MRI, B,
4916                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4917   case Intrinsic::amdgcn_workgroup_id_y:
4918     return legalizePreloadedArgIntrin(MI, MRI, B,
4919                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4920   case Intrinsic::amdgcn_workgroup_id_z:
4921     return legalizePreloadedArgIntrin(MI, MRI, B,
4922                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4923   case Intrinsic::amdgcn_dispatch_ptr:
4924     return legalizePreloadedArgIntrin(MI, MRI, B,
4925                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4926   case Intrinsic::amdgcn_queue_ptr:
4927     return legalizePreloadedArgIntrin(MI, MRI, B,
4928                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4929   case Intrinsic::amdgcn_implicit_buffer_ptr:
4930     return legalizePreloadedArgIntrin(
4931       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4932   case Intrinsic::amdgcn_dispatch_id:
4933     return legalizePreloadedArgIntrin(MI, MRI, B,
4934                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4935   case Intrinsic::amdgcn_fdiv_fast:
4936     return legalizeFDIVFastIntrin(MI, MRI, B);
4937   case Intrinsic::amdgcn_is_shared:
4938     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4939   case Intrinsic::amdgcn_is_private:
4940     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4941   case Intrinsic::amdgcn_wavefrontsize: {
4942     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4943     MI.eraseFromParent();
4944     return true;
4945   }
4946   case Intrinsic::amdgcn_s_buffer_load:
4947     return legalizeSBufferLoad(Helper, MI);
4948   case Intrinsic::amdgcn_raw_buffer_store:
4949   case Intrinsic::amdgcn_struct_buffer_store:
4950     return legalizeBufferStore(MI, MRI, B, false, false);
4951   case Intrinsic::amdgcn_raw_buffer_store_format:
4952   case Intrinsic::amdgcn_struct_buffer_store_format:
4953     return legalizeBufferStore(MI, MRI, B, false, true);
4954   case Intrinsic::amdgcn_raw_tbuffer_store:
4955   case Intrinsic::amdgcn_struct_tbuffer_store:
4956     return legalizeBufferStore(MI, MRI, B, true, true);
4957   case Intrinsic::amdgcn_raw_buffer_load:
4958   case Intrinsic::amdgcn_struct_buffer_load:
4959     return legalizeBufferLoad(MI, MRI, B, false, false);
4960   case Intrinsic::amdgcn_raw_buffer_load_format:
4961   case Intrinsic::amdgcn_struct_buffer_load_format:
4962     return legalizeBufferLoad(MI, MRI, B, true, false);
4963   case Intrinsic::amdgcn_raw_tbuffer_load:
4964   case Intrinsic::amdgcn_struct_tbuffer_load:
4965     return legalizeBufferLoad(MI, MRI, B, true, true);
4966   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4967   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4968   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4969   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4970   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4971   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4972   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4973   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4974   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4975   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4976   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4977   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4978   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4979   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4980   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4981   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4982   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4983   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4984   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4985   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4986   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4987   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4988   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4989   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4990   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4991   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4992   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4993   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4994   case Intrinsic::amdgcn_buffer_atomic_fadd:
4995   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4996   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4997   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4998   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4999     return legalizeBufferAtomic(MI, B, IntrID);
5000   case Intrinsic::amdgcn_atomic_inc:
5001     return legalizeAtomicIncDec(MI, B, true);
5002   case Intrinsic::amdgcn_atomic_dec:
5003     return legalizeAtomicIncDec(MI, B, false);
5004   case Intrinsic::trap:
5005     return legalizeTrapIntrinsic(MI, MRI, B);
5006   case Intrinsic::debugtrap:
5007     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5008   case Intrinsic::amdgcn_rsq_clamp:
5009     return legalizeRsqClampIntrinsic(MI, MRI, B);
5010   case Intrinsic::amdgcn_ds_fadd:
5011   case Intrinsic::amdgcn_ds_fmin:
5012   case Intrinsic::amdgcn_ds_fmax:
5013     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5014   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5015     return legalizeBVHIntrinsic(MI, B);
5016   default: {
5017     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5018             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5019       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5020     return true;
5021   }
5022   }
5023 
5024   return true;
5025 }
5026