1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
54 
// Round the scalar size up to the next power-of-two number of bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
61 
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
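// Match types whose total size in bits is a whole multiple of 32.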
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
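// Match vectors of 16-bit elements with more than two elements, i.e. anything
// wider than v2s16.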
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
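// Mutation that pads a vector with one additional element of the same type,
// e.g. <3 x s16> -> <4 x s16>.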
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
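// Mutation that breaks a wide vector into pieces of at most 64 bits each,
// e.g. <4 x s32> -> <2 x s32>, for use with fewerElementsIf.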
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
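// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits).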
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
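// Pick the type used to bitcast an illegal value into register-sized pieces:
// a single scalar for sizes of 32 bits or less, otherwise 32-bit elements.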
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
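// Bitcast the type to a vector of 32-bit elements with the same total size;
// the size must already be a multiple of 32 bits.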
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
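// 16-bit elements are allowed since a pair packs into a 32-bit register; any
// other element type must be a whole number of 32-bit registers.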
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
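// Vector types that map cleanly onto registers: 32, 64, 128 or 256-bit
// elements, or an even number of 16-bit elements.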
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
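// Match vectors whose element type is s16 or at least 32 bits wide; used
// below to decide when it is safe to scalarize a G_TRUNC.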
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
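// Match truncating stores with a scalar source wider than 32 bits; these
// get narrowed to a 32-bit source value first.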
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
260 
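// Check that the memory size, extload combination, per-address-space size
// limit and alignment of a load/store are all something we can handle.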
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query) {
263   const LLT Ty = Query.Types[0];
264 
265   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
266   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
267 
268   unsigned RegSize = Ty.getSizeInBits();
269   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
270   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
271   unsigned AS = Query.Types[1].getAddressSpace();
272 
273   // All of these need to be custom lowered to cast the pointer operand.
274   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
275     return false;
276 
277   // TODO: We should be able to widen loads if the alignment is high enough, but
278   // we also need to modify the memory access size.
279 #if 0
280   // Accept widening loads based on alignment.
281   if (IsLoad && MemSize < Size)
282     MemSize = std::max(MemSize, Align);
283 #endif
284 
285   // Only 1-byte and 2-byte to 32-bit extloads are valid.
286   if (MemSize != RegSize && RegSize != 32)
287     return false;
288 
289   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
290     return false;
291 
292   switch (MemSize) {
293   case 8:
294   case 16:
295   case 32:
296   case 64:
297   case 128:
298     break;
299   case 96:
300     if (!ST.hasDwordx3LoadStores())
301       return false;
302     break;
303   case 256:
304   case 512:
305     // These may contextually need to be broken down.
306     break;
307   default:
308     return false;
309   }
310 
311   assert(RegSize >= MemSize);
312 
313   if (AlignBits < MemSize) {
314     const SITargetLowering *TLI = ST.getTargetLowering();
315     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
316                                                  Align(AlignBits / 8)))
317       return false;
318   }
319 
320   return true;
321 }
322 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
327 static bool loadStoreBitcastWorkaround(const LLT Ty) {
328   if (EnableNewLegality)
329     return false;
330 
331   const unsigned Size = Ty.getSizeInBits();
332   if (Size <= 64)
333     return false;
334   if (!Ty.isVector())
335     return true;
336 
337   LLT EltTy = Ty.getElementType();
338   if (EltTy.isPointer())
339     return true;
340 
341   unsigned EltSize = EltTy.getSizeInBits();
342   return EltSize != 32 && EltSize != 64;
343 }
344 
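// A load/store is legal if the value type is a register type, the size and
// alignment checks pass, and it does not need the bitcast workaround above.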
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
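// Query-based form of the above; atomic accesses are never widened.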
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder(
583       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
584       .customFor({S32, S64})
585       .clampScalar(0, S32, S64)
586       .widenScalarToNextPow2(0, 32)
587       .scalarize(0);
588 
589   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
590                    .legalFor({S32})
591                    .maxScalarOrElt(0, S32);
592 
593   if (ST.hasVOP3PInsts()) {
594     Mulh
595       .clampMaxNumElements(0, S8, 2)
596       .lowerFor({V2S8});
597   }
598 
599   Mulh
600     .scalarize(0)
601     .lower();
602 
603   // Report legal for any types we can handle anywhere. For the cases only legal
604   // on the SALU, RegBankSelect will be able to re-legalize.
605   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
606     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
607     .clampScalar(0, S32, S64)
608     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
609     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
610     .widenScalarToNextPow2(0)
611     .scalarize(0);
612 
613   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
614                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
615     .legalFor({{S32, S1}, {S32, S32}})
616     .minScalar(0, S32)
617     // TODO: .scalarize(0)
618     .lower();
619 
620   getActionDefinitionsBuilder(G_BITCAST)
621     // Don't worry about the size constraint.
622     .legalIf(all(isRegisterType(0), isRegisterType(1)))
623     .lower();
624 
625 
626   getActionDefinitionsBuilder(G_CONSTANT)
627     .legalFor({S1, S32, S64, S16, GlobalPtr,
628                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
629     .legalIf(isPointer(0))
630     .clampScalar(0, S32, S64)
631     .widenScalarToNextPow2(0);
632 
633   getActionDefinitionsBuilder(G_FCONSTANT)
634     .legalFor({S32, S64, S16})
635     .clampScalar(0, S16, S64);
636 
637   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
638       .legalIf(isRegisterType(0))
639       // s1 and s16 are special cases because they have legal operations on
640       // them, but don't really occupy registers in the normal way.
641       .legalFor({S1, S16})
642       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
643       .clampScalarOrElt(0, S32, MaxScalar)
644       .widenScalarToNextPow2(0, 32)
645       .clampMaxNumElements(0, S32, 16);
646 
647   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
648 
649   // If the amount is divergent, we have to do a wave reduction to get the
650   // maximum value, so this is expanded during RegBankSelect.
651   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
652     .legalFor({{PrivatePtr, S32}});
653 
654   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
655     .customIf(typeIsNot(0, PrivatePtr));
656 
657   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
658 
659   auto &FPOpActions = getActionDefinitionsBuilder(
660     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
661     .legalFor({S32, S64});
662   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
663     .customFor({S32, S64});
664   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
665     .customFor({S32, S64});
666 
667   if (ST.has16BitInsts()) {
668     if (ST.hasVOP3PInsts())
669       FPOpActions.legalFor({S16, V2S16});
670     else
671       FPOpActions.legalFor({S16});
672 
673     TrigActions.customFor({S16});
674     FDIVActions.customFor({S16});
675   }
676 
677   auto &MinNumMaxNum = getActionDefinitionsBuilder({
678       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
679 
680   if (ST.hasVOP3PInsts()) {
681     MinNumMaxNum.customFor(FPTypesPK16)
682       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
683       .clampMaxNumElements(0, S16, 2)
684       .clampScalar(0, S16, S64)
685       .scalarize(0);
686   } else if (ST.has16BitInsts()) {
687     MinNumMaxNum.customFor(FPTypes16)
688       .clampScalar(0, S16, S64)
689       .scalarize(0);
690   } else {
691     MinNumMaxNum.customFor(FPTypesBase)
692       .clampScalar(0, S32, S64)
693       .scalarize(0);
694   }
695 
696   if (ST.hasVOP3PInsts())
697     FPOpActions.clampMaxNumElements(0, S16, 2);
698 
699   FPOpActions
700     .scalarize(0)
701     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
702 
703   TrigActions
704     .scalarize(0)
705     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
706 
707   FDIVActions
708     .scalarize(0)
709     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
710 
711   getActionDefinitionsBuilder({G_FNEG, G_FABS})
712     .legalFor(FPTypesPK16)
713     .clampMaxNumElements(0, S16, 2)
714     .scalarize(0)
715     .clampScalar(0, S16, S64);
716 
717   if (ST.has16BitInsts()) {
718     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
719       .legalFor({S32, S64, S16})
720       .scalarize(0)
721       .clampScalar(0, S16, S64);
722   } else {
723     getActionDefinitionsBuilder(G_FSQRT)
724       .legalFor({S32, S64})
725       .scalarize(0)
726       .clampScalar(0, S32, S64);
727 
728     if (ST.hasFractBug()) {
729       getActionDefinitionsBuilder(G_FFLOOR)
730         .customFor({S64})
731         .legalFor({S32, S64})
732         .scalarize(0)
733         .clampScalar(0, S32, S64);
734     } else {
735       getActionDefinitionsBuilder(G_FFLOOR)
736         .legalFor({S32, S64})
737         .scalarize(0)
738         .clampScalar(0, S32, S64);
739     }
740   }
741 
742   getActionDefinitionsBuilder(G_FPTRUNC)
743     .legalFor({{S32, S64}, {S16, S32}})
744     .scalarize(0)
745     .lower();
746 
747   getActionDefinitionsBuilder(G_FPEXT)
748     .legalFor({{S64, S32}, {S32, S16}})
749     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
750     .scalarize(0);
751 
752   getActionDefinitionsBuilder(G_FSUB)
753       // Use actual fsub instruction
754       .legalFor({S32})
755       // Must use fadd + fneg
756       .lowerFor({S64, S16, V2S16})
757       .scalarize(0)
758       .clampScalar(0, S32, S64);
759 
760   // Whether this is legal depends on the floating point mode for the function.
761   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
762   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
763     FMad.customFor({S32, S16});
764   else if (ST.hasMadMacF32Insts())
765     FMad.customFor({S32});
766   else if (ST.hasMadF16())
767     FMad.customFor({S16});
768   FMad.scalarize(0)
769       .lower();
770 
771   auto &FRem = getActionDefinitionsBuilder(G_FREM);
772   if (ST.has16BitInsts()) {
773     FRem.customFor({S16, S32, S64});
774   } else {
775     FRem.minScalar(0, S32)
776         .customFor({S32, S64});
777   }
778   FRem.scalarize(0);
779 
780   // TODO: Do we need to clamp maximum bitwidth?
781   getActionDefinitionsBuilder(G_TRUNC)
782     .legalIf(isScalar(0))
783     .legalFor({{V2S16, V2S32}})
784     .clampMaxNumElements(0, S16, 2)
785     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
786     // situations (like an invalid implicit use), we don't want to infinite loop
787     // in the legalizer.
788     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
789     .alwaysLegal();
790 
791   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
792     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
793                {S32, S1}, {S64, S1}, {S16, S1}})
794     .scalarize(0)
795     .clampScalar(0, S32, S64)
796     .widenScalarToNextPow2(1, 32);
797 
798   // TODO: Split s1->s64 during regbankselect for VALU.
799   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
800     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
801     .lowerFor({{S32, S64}})
802     .lowerIf(typeIs(1, S1))
803     .customFor({{S64, S64}});
804   if (ST.has16BitInsts())
805     IToFP.legalFor({{S16, S16}});
806   IToFP.clampScalar(1, S32, S64)
807        .minScalar(0, S32)
808        .scalarize(0)
809        .widenScalarToNextPow2(1);
810 
811   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
812     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
813     .customFor({{S64, S64}})
814     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
815   if (ST.has16BitInsts())
816     FPToI.legalFor({{S16, S16}});
817   else
818     FPToI.minScalar(1, S32);
819 
820   FPToI.minScalar(0, S32)
821        .widenScalarToNextPow2(0, 32)
822        .scalarize(0)
823        .lower();
824 
825   // Lower roundeven into G_FRINT
826   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
827     .scalarize(0)
828     .lower();
829 
830   if (ST.has16BitInsts()) {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S16, S32, S64})
833       .clampScalar(0, S16, S64)
834       .scalarize(0);
835   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
836     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
837       .legalFor({S32, S64})
838       .clampScalar(0, S32, S64)
839       .scalarize(0);
840   } else {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S32})
843       .customFor({S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   }
847 
848   getActionDefinitionsBuilder(G_PTR_ADD)
849     .legalIf(all(isPointer(0), sameSize(0, 1)))
850     .scalarize(0)
851     .scalarSameSizeAs(1, 0);
852 
853   getActionDefinitionsBuilder(G_PTRMASK)
854     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
855     .scalarSameSizeAs(1, 0)
856     .scalarize(0);
857 
858   auto &CmpBuilder =
859     getActionDefinitionsBuilder(G_ICMP)
860     // The compare output type differs based on the register bank of the output,
861     // so make both s1 and s32 legal.
862     //
863     // Scalar compares producing output in scc will be promoted to s32, as that
864     // is the allocatable register type that will be needed for the copy from
865     // scc. This will be promoted during RegBankSelect, and we assume something
866     // before that won't try to use s32 result types.
867     //
868     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
869     // bank.
870     .legalForCartesianProduct(
871       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
872     .legalForCartesianProduct(
873       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
874   if (ST.has16BitInsts()) {
875     CmpBuilder.legalFor({{S1, S16}});
876   }
877 
878   CmpBuilder
879     .widenScalarToNextPow2(1)
880     .clampScalar(1, S32, S64)
881     .scalarize(0)
882     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
883 
884   getActionDefinitionsBuilder(G_FCMP)
885     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
886     .widenScalarToNextPow2(1)
887     .clampScalar(1, S32, S64)
888     .scalarize(0);
889 
890   // FIXME: fpow has a selection pattern that should move to custom lowering.
891   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
892   if (ST.has16BitInsts())
893     Exp2Ops.legalFor({S32, S16});
894   else
895     Exp2Ops.legalFor({S32});
896   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
897   Exp2Ops.scalarize(0);
898 
899   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
900   if (ST.has16BitInsts())
901     ExpOps.customFor({{S32}, {S16}});
902   else
903     ExpOps.customFor({S32});
904   ExpOps.clampScalar(0, MinScalarFPTy, S32)
905         .scalarize(0);
906 
907   getActionDefinitionsBuilder(G_FPOWI)
908     .clampScalar(0, MinScalarFPTy, S32)
909     .lower();
910 
911   // The 64-bit versions produce 32-bit results, but only on the SALU.
912   getActionDefinitionsBuilder(G_CTPOP)
913     .legalFor({{S32, S32}, {S32, S64}})
914     .clampScalar(0, S32, S32)
915     .clampScalar(1, S32, S64)
916     .scalarize(0)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32);
919 
920   // The hardware instructions return a different result on 0 than the generic
921   // instructions expect. The hardware produces -1, but these produce the
922   // bitwidth.
923   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
924     .scalarize(0)
925     .clampScalar(0, S32, S32)
926     .clampScalar(1, S32, S64)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32)
929     .lower();
930 
931   // The 64-bit versions produce 32-bit results, but only on the SALU.
932   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
933     .legalFor({{S32, S32}, {S32, S64}})
934     .clampScalar(0, S32, S32)
935     .clampScalar(1, S32, S64)
936     .scalarize(0)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32);
939 
940   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
941   // RegBankSelect.
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32, S64})
944     .clampScalar(0, S32, S64)
945     .scalarize(0)
946     .widenScalarToNextPow2(0);
947 
948   if (ST.has16BitInsts()) {
949     getActionDefinitionsBuilder(G_BSWAP)
950       .legalFor({S16, S32, V2S16})
951       .clampMaxNumElements(0, S16, 2)
952       // FIXME: Fixing non-power-of-2 before clamp is workaround for
953       // narrowScalar limitation.
954       .widenScalarToNextPow2(0)
955       .clampScalar(0, S16, S32)
956       .scalarize(0);
957 
958     if (ST.hasVOP3PInsts()) {
959       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
960         .legalFor({S32, S16, V2S16})
961         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
962         .clampMaxNumElements(0, S16, 2)
963         .minScalar(0, S16)
964         .widenScalarToNextPow2(0)
965         .scalarize(0)
966         .lower();
967     } else {
968       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969         .legalFor({S32, S16})
970         .widenScalarToNextPow2(0)
971         .minScalar(0, S16)
972         .scalarize(0)
973         .lower();
974     }
975   } else {
976     // TODO: Should have same legality without v_perm_b32
977     getActionDefinitionsBuilder(G_BSWAP)
978       .legalFor({S32})
979       .lowerIf(scalarNarrowerThan(0, 32))
980       // FIXME: Fixing non-power-of-2 before clamp is workaround for
981       // narrowScalar limitation.
982       .widenScalarToNextPow2(0)
983       .maxScalar(0, S32)
984       .scalarize(0)
985       .lower();
986 
987     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
988       .legalFor({S32})
989       .minScalar(0, S32)
990       .widenScalarToNextPow2(0)
991       .scalarize(0)
992       .lower();
993   }
994 
995   getActionDefinitionsBuilder(G_INTTOPTR)
996     // List the common cases
997     .legalForCartesianProduct(AddrSpaces64, {S64})
998     .legalForCartesianProduct(AddrSpaces32, {S32})
999     .scalarize(0)
1000     // Accept any address space as long as the size matches
1001     .legalIf(sameSize(0, 1))
1002     .widenScalarIf(smallerThan(1, 0),
1003       [](const LegalityQuery &Query) {
1004         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1005       })
1006     .narrowScalarIf(largerThan(1, 0),
1007       [](const LegalityQuery &Query) {
1008         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1009       });
1010 
1011   getActionDefinitionsBuilder(G_PTRTOINT)
1012     // List the common cases
1013     .legalForCartesianProduct(AddrSpaces64, {S64})
1014     .legalForCartesianProduct(AddrSpaces32, {S32})
1015     .scalarize(0)
1016     // Accept any address space as long as the size matches
1017     .legalIf(sameSize(0, 1))
1018     .widenScalarIf(smallerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       })
1022     .narrowScalarIf(
1023       largerThan(0, 1),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1029     .scalarize(0)
1030     .custom();
1031 
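  // Return true if a load/store must be split into multiple memory
  // operations: vector extloads, accesses larger than the address space
  // allows, odd sizes that don't fit the access sizes, or under-aligned
  // accesses the target doesn't permit.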
1032   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1033                                     bool IsLoad) -> bool {
1034     const LLT DstTy = Query.Types[0];
1035 
1036     // Split vector extloads.
1037     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1038     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1039 
1040     if (MemSize < DstTy.getSizeInBits())
1041       MemSize = std::max(MemSize, AlignBits);
1042 
1043     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1044       return true;
1045 
1046     const LLT PtrTy = Query.Types[1];
1047     unsigned AS = PtrTy.getAddressSpace();
1048     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1049       return true;
1050 
1051     // Catch weird sized loads that don't evenly divide into the access sizes
1052     // TODO: May be able to widen depending on alignment etc.
1053     unsigned NumRegs = (MemSize + 31) / 32;
1054     if (NumRegs == 3) {
1055       if (!ST.hasDwordx3LoadStores())
1056         return true;
1057     } else {
1058       // If the alignment allows, these should have been widened.
1059       if (!isPowerOf2_32(NumRegs))
1060         return true;
1061     }
1062 
1063     if (AlignBits < MemSize) {
1064       const SITargetLowering *TLI = ST.getTargetLowering();
1065       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1066                                                       Align(AlignBits / 8));
1067     }
1068 
1069     return false;
1070   };
1071 
1072   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1073   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1074   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1075 
1076   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1077   // LDS
1078   // TODO: Unsupported flat for SI.
1079 
1080   for (unsigned Op : {G_LOAD, G_STORE}) {
1081     const bool IsStore = Op == G_STORE;
1082 
1083     auto &Actions = getActionDefinitionsBuilder(Op);
1084     // Explicitly list some common cases.
1085     // TODO: Does this help compile time at all?
1086     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1087                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1088                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1089                                       {S64, GlobalPtr, 64, GlobalAlign32},
1090                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1091                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1092                                       {S32, GlobalPtr, 8, GlobalAlign8},
1093                                       {S32, GlobalPtr, 16, GlobalAlign16},
1094 
1095                                       {S32, LocalPtr, 32, 32},
1096                                       {S64, LocalPtr, 64, 32},
1097                                       {V2S32, LocalPtr, 64, 32},
1098                                       {S32, LocalPtr, 8, 8},
1099                                       {S32, LocalPtr, 16, 16},
1100                                       {V2S16, LocalPtr, 32, 32},
1101 
1102                                       {S32, PrivatePtr, 32, 32},
1103                                       {S32, PrivatePtr, 8, 8},
1104                                       {S32, PrivatePtr, 16, 16},
1105                                       {V2S16, PrivatePtr, 32, 32},
1106 
1107                                       {S32, ConstantPtr, 32, GlobalAlign32},
1108                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1109                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1110                                       {S64, ConstantPtr, 64, GlobalAlign32},
1111                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1112     Actions.legalIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return isLoadStoreLegal(ST, Query);
1115       });
1116 
1117     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1118     // 64-bits.
1119     //
1120     // TODO: Should generalize bitcast action into coerce, which will also cover
1121     // inserting addrspacecasts.
1122     Actions.customIf(typeIs(1, Constant32Ptr));
1123 
1124     // Turn any illegal element vectors into something easier to deal
1125     // with. These will ultimately produce 32-bit scalar shifts to extract the
1126     // parts anyway.
1127     //
1128     // For odd 16-bit element vectors, prefer to split those into pieces with
1129     // 16-bit vector parts.
1130     Actions.bitcastIf(
1131       [=](const LegalityQuery &Query) -> bool {
1132         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1133                                           Query.MMODescrs[0].SizeInBits);
1134       }, bitcastToRegisterType(0));
1135 
1136     if (!IsStore) {
1137       // Widen suitably aligned loads by loading extra bytes. The standard
1138       // legalization actions can't properly express widening memory operands.
1139       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1140         return shouldWidenLoad(ST, Query, G_LOAD);
1141       });
1142     }
1143 
1144     // FIXME: load/store narrowing should be moved to lower action
1145     Actions
1146         .narrowScalarIf(
1147             [=](const LegalityQuery &Query) -> bool {
1148               return !Query.Types[0].isVector() &&
1149                      needToSplitMemOp(Query, Op == G_LOAD);
1150             },
1151             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1152               const LLT DstTy = Query.Types[0];
1153               const LLT PtrTy = Query.Types[1];
1154 
1155               const unsigned DstSize = DstTy.getSizeInBits();
1156               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1157 
1158               // Split extloads.
1159               if (DstSize > MemSize)
1160                 return std::make_pair(0, LLT::scalar(MemSize));
1161 
1162               if (!isPowerOf2_32(DstSize)) {
1163                 // We're probably decomposing an odd sized store. Try to split
1164                 // to the widest type. TODO: Account for alignment. As-is it
1165                 // should be OK, since the new parts will be further legalized.
1166                 unsigned FloorSize = PowerOf2Floor(DstSize);
1167                 return std::make_pair(0, LLT::scalar(FloorSize));
1168               }
1169 
1170               if (DstSize > 32 && (DstSize % 32 != 0)) {
1171                 // FIXME: Need a way to specify non-extload of larger size if
1172                 // suitably aligned.
1173                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174               }
1175 
1176               unsigned MaxSize = maxSizeForAddrSpace(ST,
1177                                                      PtrTy.getAddressSpace(),
1178                                                      Op == G_LOAD);
1179               if (MemSize > MaxSize)
1180                 return std::make_pair(0, LLT::scalar(MaxSize));
1181 
1182               unsigned Align = Query.MMODescrs[0].AlignInBits;
1183               return std::make_pair(0, LLT::scalar(Align));
1184             })
1185         .fewerElementsIf(
1186             [=](const LegalityQuery &Query) -> bool {
1187               return Query.Types[0].isVector() &&
1188                      needToSplitMemOp(Query, Op == G_LOAD);
1189             },
1190             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1191               const LLT DstTy = Query.Types[0];
1192               const LLT PtrTy = Query.Types[1];
1193 
1194               LLT EltTy = DstTy.getElementType();
1195               unsigned MaxSize = maxSizeForAddrSpace(ST,
1196                                                      PtrTy.getAddressSpace(),
1197                                                      Op == G_LOAD);
1198 
1199               // FIXME: Handle widened to power of 2 results better. This ends
1200               // up scalarizing.
1201               // FIXME: 3 element stores scalarized on SI
1202 
1203               // Split if it's too large for the address space.
1204               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1211                 }
1212 
1213                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1214 
1215                 // FIXME: Refine when odd breakdowns handled
1216                 // The scalars will need to be re-legalized.
1217                 if (NumPieces == 1 || NumPieces >= NumElts ||
1218                     NumElts % NumPieces != 0)
1219                   return std::make_pair(0, EltTy);
1220 
1221                 return std::make_pair(0,
1222                                       LLT::vector(NumElts / NumPieces, EltTy));
1223               }
1224 
1225               // FIXME: We could probably handle weird extending loads better.
1226               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1239               }
1240 
1241               // Need to split because of alignment.
1242               unsigned Align = Query.MMODescrs[0].AlignInBits;
1243               if (EltSize > Align &&
1244                   (EltSize / Align < DstTy.getNumElements())) {
1245                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1246               }
1247 
1248               // May need relegalization for the scalars.
1249               return std::make_pair(0, EltTy);
1250             })
1251     .lowerIfMemSizeNotPow2()
1252     .minScalar(0, S32);
1253 
1254     if (IsStore)
1255       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1256 
1257     Actions
1258         .widenScalarToNextPow2(0)
1259         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1260         .lower();
1261   }
1262 
1263   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1264                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1265                                                   {S32, GlobalPtr, 16, 2 * 8},
1266                                                   {S32, LocalPtr, 8, 8},
1267                                                   {S32, LocalPtr, 16, 16},
1268                                                   {S32, PrivatePtr, 8, 8},
1269                                                   {S32, PrivatePtr, 16, 16},
1270                                                   {S32, ConstantPtr, 8, 8},
1271                                                   {S32, ConstantPtr, 16, 2 * 8}});
1272   if (ST.hasFlatAddressSpace()) {
1273     ExtLoads.legalForTypesWithMemDesc(
1274         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1275   }
1276 
1277   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1278   // 64-bits.
1279   //
1280   // TODO: Should generalize bitcast action into coerce, which will also cover
1281   // inserting addrspacecasts.
1282   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1283 
1284   ExtLoads.clampScalar(0, S32, S32)
1285           .widenScalarToNextPow2(0)
1286           .unsupportedIfMemSizeNotPow2()
1287           .lower();
1288 
1289   auto &Atomics = getActionDefinitionsBuilder(
1290     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1291      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1292      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1293      G_ATOMICRMW_UMIN})
1294     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1295                {S64, GlobalPtr}, {S64, LocalPtr},
1296                {S32, RegionPtr}, {S64, RegionPtr}});
1297   if (ST.hasFlatAddressSpace()) {
1298     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1299   }
1300 
1301   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1302   if (ST.hasLDSFPAtomics()) {
1303     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1304     if (ST.hasGFX90AInsts())
1305       Atomic.legalFor({{S64, LocalPtr}});
1306   }
1307   if (ST.hasAtomicFaddInsts())
1308     Atomic.legalFor({{S32, GlobalPtr}});
1309 
1310   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1311   // demarshalling
1312   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1313     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1314                 {S32, FlatPtr}, {S64, FlatPtr}})
1315     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1316                {S32, RegionPtr}, {S64, RegionPtr}});
1317   // TODO: Pointer types, any 32-bit or 64-bit vector
1318 
1319   // Condition should be s32 for scalar, s1 for vector.
1320   getActionDefinitionsBuilder(G_SELECT)
1321     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1322           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1323           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1324     .clampScalar(0, S16, S64)
1325     .scalarize(1)
1326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1327     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1328     .clampMaxNumElements(0, S32, 2)
1329     .clampMaxNumElements(0, LocalPtr, 2)
1330     .clampMaxNumElements(0, PrivatePtr, 2)
1331     .scalarize(0)
1332     .widenScalarToNextPow2(0)
1333     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1334 
1335   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1336   // be more flexible with the shift amount type.
1337   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1338     .legalFor({{S32, S32}, {S64, S32}});
1339   if (ST.has16BitInsts()) {
1340     if (ST.hasVOP3PInsts()) {
1341       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1342             .clampMaxNumElements(0, S16, 2);
1343     } else
1344       Shifts.legalFor({{S16, S16}});
1345 
1346     // TODO: Support 16-bit shift amounts for all types
1347     Shifts.widenScalarIf(
1348       [=](const LegalityQuery &Query) {
1349         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1350         // 32-bit amount.
1351         const LLT ValTy = Query.Types[0];
1352         const LLT AmountTy = Query.Types[1];
1353         return ValTy.getSizeInBits() <= 16 &&
1354                AmountTy.getSizeInBits() < 16;
1355       }, changeTo(1, S16));
1356     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1357     Shifts.clampScalar(1, S32, S32);
1358     Shifts.clampScalar(0, S16, S64);
1359     Shifts.widenScalarToNextPow2(0, 16);
1360 
1361     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362       .minScalar(0, S16)
1363       .scalarize(0)
1364       .lower();
1365   } else {
1366     // Make sure we legalize the shift amount type first, as the general
1367     // expansion for the shifted type will produce much worse code if it hasn't
1368     // been truncated already.
1369     Shifts.clampScalar(1, S32, S32);
1370     Shifts.clampScalar(0, S32, S64);
1371     Shifts.widenScalarToNextPow2(0, 32);
1372 
1373     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1374       .minScalar(0, S32)
1375       .scalarize(0)
1376       .lower();
1377   }
1378   Shifts.scalarize(0);
1379 
1380   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1381     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1382     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1383     unsigned IdxTypeIdx = 2;
1384 
1385     getActionDefinitionsBuilder(Op)
1386       .customIf([=](const LegalityQuery &Query) {
1387           const LLT EltTy = Query.Types[EltTypeIdx];
1388           const LLT VecTy = Query.Types[VecTypeIdx];
1389           const LLT IdxTy = Query.Types[IdxTypeIdx];
1390           const unsigned EltSize = EltTy.getSizeInBits();
1391           return (EltSize == 32 || EltSize == 64) &&
1392                   VecTy.getSizeInBits() % 32 == 0 &&
1393                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1394                   IdxTy.getSizeInBits() == 32;
1395         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1397                  bitcastToVectorElement32(VecTypeIdx))
1398       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1399       .bitcastIf(
1400         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1401         [=](const LegalityQuery &Query) {
1402           // For > 64-bit element types, try to turn this into a 64-bit
1403           // element vector since we may be able to do better indexing
1404           // if this is scalar. If not, fall back to 32.
1405           const LLT EltTy = Query.Types[EltTypeIdx];
1406           const LLT VecTy = Query.Types[VecTypeIdx];
1407           const unsigned DstEltSize = EltTy.getSizeInBits();
1408           const unsigned VecSize = VecTy.getSizeInBits();
1409 
1410           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1411           return std::make_pair(
1412             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1413         })
1414       .clampScalar(EltTypeIdx, S32, S64)
1415       .clampScalar(VecTypeIdx, S32, S64)
1416       .clampScalar(IdxTypeIdx, S32, S32)
1417       .clampMaxNumElements(VecTypeIdx, S32, 32)
1418       // TODO: Clamp elements for 64-bit vectors?
1419       // It should only be necessary with variable indexes.
1420       // As a last resort, lower to the stack
1421       .lower();
1422   }
1423 
1424   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1425     .unsupportedIf([=](const LegalityQuery &Query) {
1426         const LLT &EltTy = Query.Types[1].getElementType();
1427         return Query.Types[0] != EltTy;
1428       });
1429 
1430   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1431     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1432     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1433 
1434     // FIXME: Doesn't handle extract of illegal sizes.
1435     getActionDefinitionsBuilder(Op)
1436       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1437       // FIXME: Multiples of 16 should not be legal.
1438       .legalIf([=](const LegalityQuery &Query) {
1439           const LLT BigTy = Query.Types[BigTyIdx];
1440           const LLT LitTy = Query.Types[LitTyIdx];
1441           return (BigTy.getSizeInBits() % 32 == 0) &&
1442                  (LitTy.getSizeInBits() % 16 == 0);
1443         })
1444       .widenScalarIf(
1445         [=](const LegalityQuery &Query) {
1446           const LLT BigTy = Query.Types[BigTyIdx];
1447           return (BigTy.getScalarSizeInBits() < 16);
1448         },
1449         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1450       .widenScalarIf(
1451         [=](const LegalityQuery &Query) {
1452           const LLT LitTy = Query.Types[LitTyIdx];
1453           return (LitTy.getScalarSizeInBits() < 16);
1454         },
1455         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1456       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1457       .widenScalarToNextPow2(BigTyIdx, 32);
1458 
1459   }
1460 
1461   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1462     .legalForCartesianProduct(AllS32Vectors, {S32})
1463     .legalForCartesianProduct(AllS64Vectors, {S64})
1464     .clampNumElements(0, V16S32, V32S32)
1465     .clampNumElements(0, V2S64, V16S64)
1466     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1467 
1468   if (ST.hasScalarPackInsts()) {
1469     BuildVector
1470       // FIXME: Should probably widen s1 vectors straight to s32
1471       .minScalarOrElt(0, S16)
1472       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1473       .minScalar(1, S32);
1474 
1475     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1476       .legalFor({V2S16, S32})
1477       .lower();
1478     BuildVector.minScalarOrElt(0, S32);
1479   } else {
1480     BuildVector.customFor({V2S16, S16});
1481     BuildVector.minScalarOrElt(0, S32);
1482 
1483     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1484       .customFor({V2S16, S32})
1485       .lower();
1486   }
1487 
1488   BuildVector.legalIf(isRegisterType(0));
1489 
1490   // FIXME: Clamp maximum size
1491   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1492     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1493     .clampMaxNumElements(0, S32, 32)
1494     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1495     .clampMaxNumElements(0, S16, 64);
1496 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1499   if (ST.hasVOP3PInsts()) {
1500     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1501       .customFor({V2S16, V2S16})
1502       .lower();
1503   } else
1504     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1505 
1506   // Merge/Unmerge
1507   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1508     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1509     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1510 
1511     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1512       const LLT Ty = Query.Types[TypeIdx];
1513       if (Ty.isVector()) {
1514         const LLT &EltTy = Ty.getElementType();
1515         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1516           return true;
1517         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1518           return true;
1519       }
1520       return false;
1521     };
1522 
1523     auto &Builder = getActionDefinitionsBuilder(Op)
1524       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1525       .lowerFor({{S16, V2S16}})
1526       .lowerIf([=](const LegalityQuery &Query) {
1527           const LLT BigTy = Query.Types[BigTyIdx];
1528           return BigTy.getSizeInBits() == 32;
1529         })
1530       // Try to widen to s16 first for small types.
1531       // TODO: Only do this on targets with legal s16 shifts
1532       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1533       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1534       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1535       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1536                            elementTypeIs(1, S16)),
1537                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1541       .clampScalar(LitTyIdx, S32, S512)
1542       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1543       // Break up vectors with weird elements into scalars
1544       .fewerElementsIf(
1545         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1546         scalarize(0))
1547       .fewerElementsIf(
1548         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1549         scalarize(1))
1550       .clampScalar(BigTyIdx, S32, MaxScalar);
1551 
1552     if (Op == G_MERGE_VALUES) {
1553       Builder.widenScalarIf(
1554         // TODO: Use 16-bit shifts if legal for 8-bit values?
1555         [=](const LegalityQuery &Query) {
1556           const LLT Ty = Query.Types[LitTyIdx];
1557           return Ty.getSizeInBits() < 32;
1558         },
1559         changeTo(LitTyIdx, S32));
1560     }
1561 
1562     Builder.widenScalarIf(
1563       [=](const LegalityQuery &Query) {
1564         const LLT Ty = Query.Types[BigTyIdx];
1565         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1566           Ty.getSizeInBits() % 16 != 0;
1567       },
1568       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 for sizes over 128,
        // whichever is smaller.
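        // For example, this mutation takes s65 to s128, and s300 to s320
        // rather than s512.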
1571         const LLT &Ty = Query.Types[BigTyIdx];
1572         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1573         if (NewSizeInBits >= 256) {
1574           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1575           if (RoundedTo < NewSizeInBits)
1576             NewSizeInBits = RoundedTo;
1577         }
1578         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1579       })
1580       // Any vectors left are the wrong size. Scalarize them.
1581       .scalarize(0)
1582       .scalarize(1);
1583   }
1584 
1585   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1586   // RegBankSelect.
1587   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1588     .legalFor({{S32}, {S64}});
1589 
1590   if (ST.hasVOP3PInsts()) {
1591     SextInReg.lowerFor({{V2S16}})
1592       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1593       // get more vector shift opportunities, since we'll get those when
1594       // expanded.
1595       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1596   } else if (ST.has16BitInsts()) {
1597     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1598   } else {
1599     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1601     SextInReg.lowerFor({{S32}, {S64}});
1602   }
1603 
1604   SextInReg
1605     .scalarize(0)
1606     .clampScalar(0, S32, S64)
1607     .lower();
1608 
  // TODO: Only try to form v2s16 with legal packed instructions.
1610   getActionDefinitionsBuilder(G_FSHR)
1611     .legalFor({{S32, S32}})
1612     .lowerFor({{V2S16, V2S16}})
1613     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1614     .scalarize(0)
1615     .lower();
1616 
1617   if (ST.hasVOP3PInsts()) {
1618     getActionDefinitionsBuilder(G_FSHL)
1619       .lowerFor({{V2S16, V2S16}})
1620       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1621       .scalarize(0)
1622       .lower();
1623   } else {
1624     getActionDefinitionsBuilder(G_FSHL)
1625       .scalarize(0)
1626       .lower();
1627   }
1628 
1629   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1630     .legalFor({S64});
1631 
1632   getActionDefinitionsBuilder(G_FENCE)
1633     .alwaysLegal();
1634 
1635   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1636       .scalarize(0)
1637       .minScalar(0, S32)
1638       .lower();
1639 
1640   getActionDefinitionsBuilder({
1641       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1642       G_FCOPYSIGN,
1643 
1644       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1645       G_ATOMICRMW_NAND,
1646       G_ATOMICRMW_FSUB,
1647       G_READ_REGISTER,
1648       G_WRITE_REGISTER,
1649 
1650       G_SADDO, G_SSUBO,
1651 
      // TODO: Implement
1653       G_FMINIMUM, G_FMAXIMUM}).lower();
1654 
1655   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1656         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1657         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1658     .unsupported();
1659 
1660   computeTables();
1661   verify(*ST.getInstrInfo());
1662 }
1663 
1664 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1665                                          MachineInstr &MI) const {
1666   MachineIRBuilder &B = Helper.MIRBuilder;
1667   MachineRegisterInfo &MRI = *B.getMRI();
1668 
1669   switch (MI.getOpcode()) {
1670   case TargetOpcode::G_ADDRSPACE_CAST:
1671     return legalizeAddrSpaceCast(MI, MRI, B);
1672   case TargetOpcode::G_FRINT:
1673     return legalizeFrint(MI, MRI, B);
1674   case TargetOpcode::G_FCEIL:
1675     return legalizeFceil(MI, MRI, B);
1676   case TargetOpcode::G_FREM:
1677     return legalizeFrem(MI, MRI, B);
1678   case TargetOpcode::G_INTRINSIC_TRUNC:
1679     return legalizeIntrinsicTrunc(MI, MRI, B);
1680   case TargetOpcode::G_SITOFP:
1681     return legalizeITOFP(MI, MRI, B, true);
1682   case TargetOpcode::G_UITOFP:
1683     return legalizeITOFP(MI, MRI, B, false);
1684   case TargetOpcode::G_FPTOSI:
1685     return legalizeFPTOI(MI, MRI, B, true);
1686   case TargetOpcode::G_FPTOUI:
1687     return legalizeFPTOI(MI, MRI, B, false);
1688   case TargetOpcode::G_FMINNUM:
1689   case TargetOpcode::G_FMAXNUM:
1690   case TargetOpcode::G_FMINNUM_IEEE:
1691   case TargetOpcode::G_FMAXNUM_IEEE:
1692     return legalizeMinNumMaxNum(Helper, MI);
1693   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1694     return legalizeExtractVectorElt(MI, MRI, B);
1695   case TargetOpcode::G_INSERT_VECTOR_ELT:
1696     return legalizeInsertVectorElt(MI, MRI, B);
1697   case TargetOpcode::G_SHUFFLE_VECTOR:
1698     return legalizeShuffleVector(MI, MRI, B);
1699   case TargetOpcode::G_FSIN:
1700   case TargetOpcode::G_FCOS:
1701     return legalizeSinCos(MI, MRI, B);
1702   case TargetOpcode::G_GLOBAL_VALUE:
1703     return legalizeGlobalValue(MI, MRI, B);
1704   case TargetOpcode::G_LOAD:
1705   case TargetOpcode::G_SEXTLOAD:
1706   case TargetOpcode::G_ZEXTLOAD:
1707     return legalizeLoad(Helper, MI);
1708   case TargetOpcode::G_FMAD:
1709     return legalizeFMad(MI, MRI, B);
1710   case TargetOpcode::G_FDIV:
1711     return legalizeFDIV(MI, MRI, B);
1712   case TargetOpcode::G_UDIV:
1713   case TargetOpcode::G_UREM:
1714   case TargetOpcode::G_UDIVREM:
1715     return legalizeUnsignedDIV_REM(MI, MRI, B);
1716   case TargetOpcode::G_SDIV:
1717   case TargetOpcode::G_SREM:
1718   case TargetOpcode::G_SDIVREM:
1719     return legalizeSignedDIV_REM(MI, MRI, B);
1720   case TargetOpcode::G_ATOMIC_CMPXCHG:
1721     return legalizeAtomicCmpXChg(MI, MRI, B);
1722   case TargetOpcode::G_FLOG:
1723     return legalizeFlog(MI, B, numbers::ln2f);
1724   case TargetOpcode::G_FLOG10:
1725     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1726   case TargetOpcode::G_FEXP:
1727     return legalizeFExp(MI, B);
1728   case TargetOpcode::G_FPOW:
1729     return legalizeFPow(MI, B);
1730   case TargetOpcode::G_FFLOOR:
1731     return legalizeFFloor(MI, MRI, B);
1732   case TargetOpcode::G_BUILD_VECTOR:
1733     return legalizeBuildVector(MI, MRI, B);
1734   default:
1735     return false;
1736   }
1737 
1738   llvm_unreachable("expected switch to return");
1739 }
1740 
1741 Register AMDGPULegalizerInfo::getSegmentAperture(
1742   unsigned AS,
1743   MachineRegisterInfo &MRI,
1744   MachineIRBuilder &B) const {
1745   MachineFunction &MF = B.getMF();
1746   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1747   const LLT S32 = LLT::scalar(32);
1748 
1749   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1750 
1751   if (ST.hasApertureRegs()) {
1752     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1753     // getreg.
1754     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1755         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1756         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1757     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1758         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1759         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1760     unsigned Encoding =
1761         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1762         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1763         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1764 
1765     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1766 
1767     B.buildInstr(AMDGPU::S_GETREG_B32)
1768       .addDef(GetReg)
1769       .addImm(Encoding);
1770     MRI.setType(GetReg, S32);
1771 
1772     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1773     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1774   }
1775 
1776   Register QueuePtr = MRI.createGenericVirtualRegister(
1777     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1778 
1779   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1780     return Register();
1781 
1782   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1783   // private_segment_aperture_base_hi.
1784   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1785 
1786   // TODO: can we be smarter about machine pointer info?
1787   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1788   MachineMemOperand *MMO = MF.getMachineMemOperand(
1789       PtrInfo,
1790       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1791           MachineMemOperand::MOInvariant,
1792       4, commonAlignment(Align(64), StructOffset));
1793 
1794   Register LoadAddr;
1795 
1796   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1797   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1798 }
1799 
1800 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1801   MachineInstr &MI, MachineRegisterInfo &MRI,
1802   MachineIRBuilder &B) const {
1803   MachineFunction &MF = B.getMF();
1804 
1805   const LLT S32 = LLT::scalar(32);
1806   Register Dst = MI.getOperand(0).getReg();
1807   Register Src = MI.getOperand(1).getReg();
1808 
1809   LLT DstTy = MRI.getType(Dst);
1810   LLT SrcTy = MRI.getType(Src);
1811   unsigned DestAS = DstTy.getAddressSpace();
1812   unsigned SrcAS = SrcTy.getAddressSpace();
1813 
1814   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1815   // vector element.
1816   assert(!DstTy.isVector());
1817 
1818   const AMDGPUTargetMachine &TM
1819     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1820 
1821   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1822     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1823     return true;
1824   }
1825 
1826   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1827     // Truncate.
1828     B.buildExtract(Dst, Src, 0);
1829     MI.eraseFromParent();
1830     return true;
1831   }
1832 
1833   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1834     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1835     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1836 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1840     auto HighAddr = B.buildConstant(
1841       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1842     B.buildMerge(Dst, {Src, HighAddr});
1843     MI.eraseFromParent();
1844     return true;
1845   }
1846 
1847   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1848     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1849            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1850     unsigned NullVal = TM.getNullPointerValue(DestAS);
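    // The cast is expanded to a null check plus a truncation of the pointer:
    //   dst = (src != flat_null) ? lo32(src) : segment_null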
1851 
1852     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1853     auto FlatNull = B.buildConstant(SrcTy, 0);
1854 
1855     // Extract low 32-bits of the pointer.
1856     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1857 
1858     auto CmpRes =
1859         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1860     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1861 
1862     MI.eraseFromParent();
1863     return true;
1864   }
1865 
1866   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1867     return false;
1868 
1869   if (!ST.hasFlatAddressSpace())
1870     return false;
1871 
1872   auto SegmentNull =
1873       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1874   auto FlatNull =
1875       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1876 
1877   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1878   if (!ApertureReg.isValid())
1879     return false;
1880 
1881   auto CmpRes =
1882       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1883 
1884   // Coerce the type of the low half of the result so we can use merge_values.
1885   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1886 
1887   // TODO: Should we allow mismatched types but matching sizes in merges to
1888   // avoid the ptrtoint?
1889   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1890   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1891 
1892   MI.eraseFromParent();
1893   return true;
1894 }
1895 
1896 bool AMDGPULegalizerInfo::legalizeFrint(
1897   MachineInstr &MI, MachineRegisterInfo &MRI,
1898   MachineIRBuilder &B) const {
1899   Register Src = MI.getOperand(1).getReg();
1900   LLT Ty = MRI.getType(Src);
1901   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1902 
1903   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1904   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
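  // Adding and then subtracting copysign(2^52, x) rounds x to an integer,
  // since doubles with magnitude >= 2^52 have no fractional bits. Inputs with
  // |x| > 0x1.fffffffffffffp+51 are already integral and are selected through
  // unchanged below.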
1905 
1906   auto C1 = B.buildFConstant(Ty, C1Val);
1907   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1908 
1909   // TODO: Should this propagate fast-math-flags?
1910   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1911   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1912 
1913   auto C2 = B.buildFConstant(Ty, C2Val);
1914   auto Fabs = B.buildFAbs(Ty, Src);
1915 
1916   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1917   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1918   MI.eraseFromParent();
1919   return true;
1920 }
1921 
1922 bool AMDGPULegalizerInfo::legalizeFceil(
1923   MachineInstr &MI, MachineRegisterInfo &MRI,
1924   MachineIRBuilder &B) const {
1925 
1926   const LLT S1 = LLT::scalar(1);
1927   const LLT S64 = LLT::scalar(64);
1928 
1929   Register Src = MI.getOperand(1).getReg();
1930   assert(MRI.getType(Src) == S64);
1931 
1932   // result = trunc(src)
1933   // if (src > 0.0 && src != result)
1934   //   result += 1.0
1935 
1936   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1937 
1938   const auto Zero = B.buildFConstant(S64, 0.0);
1939   const auto One = B.buildFConstant(S64, 1.0);
1940   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1941   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1942   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1943   auto Add = B.buildSelect(S64, And, One, Zero);
1944 
1945   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1948 }
1949 
1950 bool AMDGPULegalizerInfo::legalizeFrem(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1965 }
1966 
1967 static MachineInstrBuilder extractF64Exponent(Register Hi,
1968                                               MachineIRBuilder &B) {
1969   const unsigned FractBits = 52;
1970   const unsigned ExpBits = 11;
1971   LLT S32 = LLT::scalar(32);
1972 
1973   auto Const0 = B.buildConstant(S32, FractBits - 32);
1974   auto Const1 = B.buildConstant(S32, ExpBits);
1975 
1976   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1977     .addUse(Hi)
1978     .addUse(Const0.getReg(0))
1979     .addUse(Const1.getReg(0));
1980 
1981   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1982 }
1983 
1984 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1985   MachineInstr &MI, MachineRegisterInfo &MRI,
1986   MachineIRBuilder &B) const {
1987   const LLT S1 = LLT::scalar(1);
1988   const LLT S32 = LLT::scalar(32);
1989   const LLT S64 = LLT::scalar(64);
1990 
1991   Register Src = MI.getOperand(1).getReg();
1992   assert(MRI.getType(Src) == S64);
1993 
1994   // TODO: Should this use extract since the low half is unused?
1995   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1996   Register Hi = Unmerge.getReg(1);
1997 
1998   // Extract the upper half, since this is where we will find the sign and
1999   // exponent.
2000   auto Exp = extractF64Exponent(Hi, B);
2001 
2002   const unsigned FractBits = 52;
2003 
2004   // Extract the sign bit.
2005   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2006   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2007 
2008   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2009 
2010   const auto Zero32 = B.buildConstant(S32, 0);
2011 
2012   // Extend back to 64-bits.
2013   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2014 
2015   auto Shr = B.buildAShr(S64, FractMask, Exp);
2016   auto Not = B.buildNot(S64, Shr);
2017   auto Tmp0 = B.buildAnd(S64, Src, Not);
2018   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2019 
2020   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2021   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2022 
2023   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2024   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2025   MI.eraseFromParent();
2026   return true;
2027 }
2028 
2029 bool AMDGPULegalizerInfo::legalizeITOFP(
2030   MachineInstr &MI, MachineRegisterInfo &MRI,
2031   MachineIRBuilder &B, bool Signed) const {
2032 
2033   Register Dst = MI.getOperand(0).getReg();
2034   Register Src = MI.getOperand(1).getReg();
2035 
2036   const LLT S64 = LLT::scalar(64);
2037   const LLT S32 = LLT::scalar(32);
2038 
2039   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
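  // Convert the two 32-bit halves separately and recombine:
  //   result = (sitofp|uitofp)(hi) * 2^32 + uitofp(lo)
  // with the 2^32 scale applied via the amdgcn_ldexp intrinsic.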
2040 
2041   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2042 
2043   auto CvtHi = Signed ?
2044     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2045     B.buildUITOFP(S64, Unmerge.getReg(1));
2046 
2047   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2048 
2049   auto ThirtyTwo = B.buildConstant(S32, 32);
2050   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2051     .addUse(CvtHi.getReg(0))
2052     .addUse(ThirtyTwo.getReg(0));
2053 
2054   // TODO: Should this propagate fast-math-flags?
2055   B.buildFAdd(Dst, LdExp, CvtLo);
2056   MI.eraseFromParent();
2057   return true;
2058 }
2059 
2060 // TODO: Copied from DAG implementation. Verify logic and document how this
2061 // actually works.
2062 bool AMDGPULegalizerInfo::legalizeFPTOI(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B, bool Signed) const {
2065 
2066   Register Dst = MI.getOperand(0).getReg();
2067   Register Src = MI.getOperand(1).getReg();
2068 
2069   const LLT S64 = LLT::scalar(64);
2070   const LLT S32 = LLT::scalar(32);
2071 
2072   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2073 
2074   unsigned Flags = MI.getFlags();
2075 
2076   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2077   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2078   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
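  // K0 is 2^-32 and K1 is -(2^32). FloorMul below becomes the high 32 bits of
  // the result; the fma with K1 recovers the remaining low 32 bits of
  // trunc(x).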
2079 
2080   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2081   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2082   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2083 
2084   auto Hi = Signed ?
2085     B.buildFPTOSI(S32, FloorMul) :
2086     B.buildFPTOUI(S32, FloorMul);
2087   auto Lo = B.buildFPTOUI(S32, Fma);
2088 
2089   B.buildMerge(Dst, { Lo, Hi });
2090   MI.eraseFromParent();
2091 
2092   return true;
2093 }
2094 
2095 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2096                                                MachineInstr &MI) const {
2097   MachineFunction &MF = Helper.MIRBuilder.getMF();
2098   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2099 
2100   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2101                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2102 
  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
2105   if (!MFI->getMode().IEEE)
2106     return !IsIEEEOp;
2107 
2108   if (IsIEEEOp)
2109     return true;
2110 
2111   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2112 }
2113 
2114 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2115   MachineInstr &MI, MachineRegisterInfo &MRI,
2116   MachineIRBuilder &B) const {
2117   // TODO: Should move some of this into LegalizerHelper.
2118 
2119   // TODO: Promote dynamic indexing of s16 to s32
2120 
2121   // FIXME: Artifact combiner probably should have replaced the truncated
2122   // constant before this, so we shouldn't need
2123   // getConstantVRegValWithLookThrough.
2124   Optional<ValueAndVReg> MaybeIdxVal =
2125       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2126   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2127     return true;
2128   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2129 
2130   Register Dst = MI.getOperand(0).getReg();
2131   Register Vec = MI.getOperand(1).getReg();
2132 
2133   LLT VecTy = MRI.getType(Vec);
2134   LLT EltTy = VecTy.getElementType();
2135   assert(EltTy == MRI.getType(Dst));
2136 
2137   if (IdxVal < VecTy.getNumElements())
2138     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2139   else
2140     B.buildUndef(Dst);
2141 
2142   MI.eraseFromParent();
2143   return true;
2144 }
2145 
2146 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2147   MachineInstr &MI, MachineRegisterInfo &MRI,
2148   MachineIRBuilder &B) const {
2149   // TODO: Should move some of this into LegalizerHelper.
2150 
2151   // TODO: Promote dynamic indexing of s16 to s32
2152 
2153   // FIXME: Artifact combiner probably should have replaced the truncated
2154   // constant before this, so we shouldn't need
2155   // getConstantVRegValWithLookThrough.
2156   Optional<ValueAndVReg> MaybeIdxVal =
2157       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2158   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2159     return true;
2160 
2161   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2162   Register Dst = MI.getOperand(0).getReg();
2163   Register Vec = MI.getOperand(1).getReg();
2164   Register Ins = MI.getOperand(2).getReg();
2165 
2166   LLT VecTy = MRI.getType(Vec);
2167   LLT EltTy = VecTy.getElementType();
2168   assert(EltTy == MRI.getType(Ins));
2169 
2170   if (IdxVal < VecTy.getNumElements())
2171     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2172   else
2173     B.buildUndef(Dst);
2174 
2175   MI.eraseFromParent();
2176   return true;
2177 }
2178 
2179 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2180   MachineInstr &MI, MachineRegisterInfo &MRI,
2181   MachineIRBuilder &B) const {
2182   const LLT V2S16 = LLT::vector(2, 16);
2183 
2184   Register Dst = MI.getOperand(0).getReg();
2185   Register Src0 = MI.getOperand(1).getReg();
2186   LLT DstTy = MRI.getType(Dst);
2187   LLT SrcTy = MRI.getType(Src0);
2188 
2189   if (SrcTy == V2S16 && DstTy == V2S16 &&
2190       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2191     return true;
2192 
2193   MachineIRBuilder HelperBuilder(MI);
2194   GISelObserverWrapper DummyObserver;
2195   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2196   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2197 }
2198 
2199 bool AMDGPULegalizerInfo::legalizeSinCos(
2200   MachineInstr &MI, MachineRegisterInfo &MRI,
2201   MachineIRBuilder &B) const {
2202 
2203   Register DstReg = MI.getOperand(0).getReg();
2204   Register SrcReg = MI.getOperand(1).getReg();
2205   LLT Ty = MRI.getType(DstReg);
2206   unsigned Flags = MI.getFlags();
2207 
2208   Register TrigVal;
2209   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2210   if (ST.hasTrigReducedRange()) {
2211     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2212     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2213       .addUse(MulVal.getReg(0))
2214       .setMIFlags(Flags).getReg(0);
2215   } else
2216     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2217 
2218   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2219     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2220   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2221     .addUse(TrigVal)
2222     .setMIFlags(Flags);
2223   MI.eraseFromParent();
2224   return true;
2225 }
2226 
2227 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2228                                                   MachineIRBuilder &B,
2229                                                   const GlobalValue *GV,
2230                                                   int64_t Offset,
2231                                                   unsigned GAFlags) const {
2232   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2233   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2234   // to the following code sequence:
2235   //
2236   // For constant address space:
2237   //   s_getpc_b64 s[0:1]
2238   //   s_add_u32 s0, s0, $symbol
2239   //   s_addc_u32 s1, s1, 0
2240   //
2241   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242   //   a fixup or relocation is emitted to replace $symbol with a literal
2243   //   constant, which is a pc-relative offset from the encoding of the $symbol
2244   //   operand to the global variable.
2245   //
2246   // For global address space:
2247   //   s_getpc_b64 s[0:1]
2248   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2249   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2250   //
2251   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2252   //   fixups or relocations are emitted to replace $symbol@*@lo and
2253   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2254   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2255   //   operand to the global variable.
2256   //
2257   // What we want here is an offset from the value returned by s_getpc
2258   // (which is the address of the s_add_u32 instruction) to the global
2259   // variable, but since the encoding of $symbol starts 4 bytes after the start
2260   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2261   // small. This requires us to add 4 to the global variable offset in order to
2262   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2263   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2264   // instruction.
2265 
2266   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2267 
2268   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2269     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2270 
2271   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2272     .addDef(PCReg);
2273 
2274   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2275   if (GAFlags == SIInstrInfo::MO_NONE)
2276     MIB.addImm(0);
2277   else
2278     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2279 
2280   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2281 
2282   if (PtrTy.getSizeInBits() == 32)
2283     B.buildExtract(DstReg, PCReg, 0);
2284   return true;
2285  }
2286 
2287 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2288   MachineInstr &MI, MachineRegisterInfo &MRI,
2289   MachineIRBuilder &B) const {
2290   Register DstReg = MI.getOperand(0).getReg();
2291   LLT Ty = MRI.getType(DstReg);
2292   unsigned AS = Ty.getAddressSpace();
2293 
2294   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2295   MachineFunction &MF = B.getMF();
2296   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2297 
2298   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2299     if (!MFI->isModuleEntryFunction() &&
2300         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2301       const Function &Fn = MF.getFunction();
2302       DiagnosticInfoUnsupported BadLDSDecl(
2303         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2304         DS_Warning);
2305       Fn.getContext().diagnose(BadLDSDecl);
2306 
2307       // We currently don't have a way to correctly allocate LDS objects that
2308       // aren't directly associated with a kernel. We do force inlining of
2309       // functions that use local objects. However, if these dead functions are
2310       // not eliminated, we don't want a compile time error. Just emit a warning
2311       // and a trap, since there should be no callable path here.
2312       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2313       B.buildUndef(DstReg);
2314       MI.eraseFromParent();
2315       return true;
2316     }
2317 
2318     // TODO: We could emit code to handle the initialization somewhere.
2319     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2320       const SITargetLowering *TLI = ST.getTargetLowering();
2321       if (!TLI->shouldUseLDSConstAddress(GV)) {
2322         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2324       }
2325 
2326       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2327         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
        // zero-sized type in other languages, to declare dynamic shared
        // memory whose size is not known at compile time. It is allocated by
        // the runtime and placed directly after the statically allocated
        // objects. All such declarations share the same offset.
2333         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2334           // Adjust alignment for that dynamic shared memory array.
2335           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2336           LLT S32 = LLT::scalar(32);
2337           auto Sz =
2338               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2339           B.buildIntToPtr(DstReg, Sz);
2340           MI.eraseFromParent();
2341           return true;
2342         }
2343       }
2344 
2345       B.buildConstant(
2346           DstReg,
2347           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2348       MI.eraseFromParent();
2349       return true;
2350     }
2351 
2352     const Function &Fn = MF.getFunction();
2353     DiagnosticInfoUnsupported BadInit(
2354       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2355     Fn.getContext().diagnose(BadInit);
2356     return true;
2357   }
2358 
2359   const SITargetLowering *TLI = ST.getTargetLowering();
2360 
2361   if (TLI->shouldEmitFixup(GV)) {
2362     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2363     MI.eraseFromParent();
2364     return true;
2365   }
2366 
2367   if (TLI->shouldEmitPCReloc(GV)) {
2368     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2369     MI.eraseFromParent();
2370     return true;
2371   }
2372 
2373   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2375 
2376   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2377       MachinePointerInfo::getGOT(MF),
2378       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2379           MachineMemOperand::MOInvariant,
2380       8 /*Size*/, Align(8));
2381 
2382   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2383 
2384   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2386     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2387     B.buildExtract(DstReg, Load, 0);
2388   } else
2389     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2390 
2391   MI.eraseFromParent();
2392   return true;
2393 }
2394 
2395 static LLT widenToNextPowerOf2(LLT Ty) {
2396   if (Ty.isVector())
2397     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2398   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2399 }
2400 
2401 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2402                                        MachineInstr &MI) const {
2403   MachineIRBuilder &B = Helper.MIRBuilder;
2404   MachineRegisterInfo &MRI = *B.getMRI();
2405   GISelChangeObserver &Observer = Helper.Observer;
2406 
2407   Register PtrReg = MI.getOperand(1).getReg();
2408   LLT PtrTy = MRI.getType(PtrReg);
2409   unsigned AddrSpace = PtrTy.getAddressSpace();
2410 
2411   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2412     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2413     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2414     Observer.changingInstr(MI);
2415     MI.getOperand(1).setReg(Cast.getReg(0));
2416     Observer.changedInstr(MI);
2417     return true;
2418   }
2419 
2420   if (MI.getOpcode() != AMDGPU::G_LOAD)
2421     return false;
2422 
2423   Register ValReg = MI.getOperand(0).getReg();
2424   LLT ValTy = MRI.getType(ValReg);
2425 
2426   MachineMemOperand *MMO = *MI.memoperands_begin();
2427   const unsigned ValSize = ValTy.getSizeInBits();
2428   const unsigned MemSize = 8 * MMO->getSize();
2429   const Align MemAlign = MMO->getAlign();
2430   const unsigned AlignInBits = 8 * MemAlign.value();
2431 
2432   // Widen non-power-of-2 loads to the alignment if needed
2433   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2434     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
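    // For example, a sufficiently aligned 96-bit load may be widened to a
    // 128-bit load, with the extra bits discarded below via a truncate,
    // G_EXTRACT, or unmerge of the wide result.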
2435 
2436     // This was already the correct extending load result type, so just adjust
2437     // the memory type.
2438     if (WideMemSize == ValSize) {
2439       MachineFunction &MF = B.getMF();
2440 
2441       MachineMemOperand *WideMMO =
2442           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2443       Observer.changingInstr(MI);
2444       MI.setMemRefs(MF, {WideMMO});
2445       Observer.changedInstr(MI);
2446       return true;
2447     }
2448 
2449     // Don't bother handling edge case that should probably never be produced.
2450     if (ValSize > WideMemSize)
2451       return false;
2452 
2453     LLT WideTy = widenToNextPowerOf2(ValTy);
2454 
2455     Register WideLoad;
2456     if (!WideTy.isVector()) {
2457       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2458       B.buildTrunc(ValReg, WideLoad).getReg(0);
2459     } else {
2460       // Extract the subvector.
2461 
2462       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2464         // (e.g. <3 x s32> -> <4 x s32>)
2465         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2466         B.buildExtract(ValReg, WideLoad, 0);
2467       } else {
2468         // For cases where the widened type isn't a nice register value, unmerge
2469         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2470         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2471         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2472         B.setInsertPt(B.getMBB(), MI.getIterator());
2473         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2474       }
2475     }
2476 
2477     MI.eraseFromParent();
2478     return true;
2479   }
2480 
2481   return false;
2482 }
2483 
2484 bool AMDGPULegalizerInfo::legalizeFMad(
2485   MachineInstr &MI, MachineRegisterInfo &MRI,
2486   MachineIRBuilder &B) const {
2487   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2488   assert(Ty.isScalar());
2489 
2490   MachineFunction &MF = B.getMF();
2491   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2492 
2493   // TODO: Always legal with future ftz flag.
2494   // FIXME: Do we need just output?
2495   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2496     return true;
2497   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2498     return true;
2499 
2500   MachineIRBuilder HelperBuilder(MI);
2501   GISelObserverWrapper DummyObserver;
2502   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2503   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2507   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2508   Register DstReg = MI.getOperand(0).getReg();
2509   Register PtrReg = MI.getOperand(1).getReg();
2510   Register CmpVal = MI.getOperand(2).getReg();
2511   Register NewVal = MI.getOperand(3).getReg();
2512 
2513   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2514          "this should not have been custom lowered");
2515 
2516   LLT ValTy = MRI.getType(CmpVal);
2517   LLT VecTy = LLT::vector(2, ValTy);
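  // Pack the new value and the compare value into a single vector operand for
  // the target pseudo: the new value goes in element 0, the compare value in
  // element 1.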
2518 
2519   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2520 
2521   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2522     .addDef(DstReg)
2523     .addUse(PtrReg)
2524     .addUse(PackedVal)
2525     .setMemRefs(MI.memoperands());
2526 
2527   MI.eraseFromParent();
2528   return true;
2529 }
2530 
2531 bool AMDGPULegalizerInfo::legalizeFlog(
2532   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2533   Register Dst = MI.getOperand(0).getReg();
2534   Register Src = MI.getOperand(1).getReg();
2535   LLT Ty = B.getMRI()->getType(Dst);
2536   unsigned Flags = MI.getFlags();
2537 
2538   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2539   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2540 
2541   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2542   MI.eraseFromParent();
2543   return true;
2544 }
2545 
2546 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2547                                        MachineIRBuilder &B) const {
2548   Register Dst = MI.getOperand(0).getReg();
2549   Register Src = MI.getOperand(1).getReg();
2550   unsigned Flags = MI.getFlags();
2551   LLT Ty = B.getMRI()->getType(Dst);
2552 
2553   auto K = B.buildFConstant(Ty, numbers::log2e);
2554   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2555   B.buildFExp2(Dst, Mul, Flags);
2556   MI.eraseFromParent();
2557   return true;
2558 }
2559 
2560 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2561                                        MachineIRBuilder &B) const {
2562   Register Dst = MI.getOperand(0).getReg();
2563   Register Src0 = MI.getOperand(1).getReg();
2564   Register Src1 = MI.getOperand(2).getReg();
2565   unsigned Flags = MI.getFlags();
2566   LLT Ty = B.getMRI()->getType(Dst);
2567   const LLT S16 = LLT::scalar(16);
2568   const LLT S32 = LLT::scalar(32);
2569 
2570   if (Ty == S32) {
2571     auto Log = B.buildFLog2(S32, Src0, Flags);
2572     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2573       .addUse(Log.getReg(0))
2574       .addUse(Src1)
2575       .setMIFlags(Flags);
2576     B.buildFExp2(Dst, Mul, Flags);
2577   } else if (Ty == S16) {
2578     // There's no f16 fmul_legacy, so we need to convert for it.
2579     auto Log = B.buildFLog2(S16, Src0, Flags);
2580     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2581     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2582     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2583       .addUse(Ext0.getReg(0))
2584       .addUse(Ext1.getReg(0))
2585       .setMIFlags(Flags);
2586 
2587     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2588   } else
2589     return false;
2590 
2591   MI.eraseFromParent();
2592   return true;
2593 }
2594 
2595 // Find a source register, ignoring any possible source modifiers.
2596 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2597   Register ModSrc = OrigSrc;
2598   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2599     ModSrc = SrcFNeg->getOperand(1).getReg();
2600     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2601       ModSrc = SrcFAbs->getOperand(1).getReg();
2602   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2603     ModSrc = SrcFAbs->getOperand(1).getReg();
2604   return ModSrc;
2605 }
2606 
2607 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2608                                          MachineRegisterInfo &MRI,
2609                                          MachineIRBuilder &B) const {
2610 
2611   const LLT S1 = LLT::scalar(1);
2612   const LLT S64 = LLT::scalar(64);
2613   Register Dst = MI.getOperand(0).getReg();
2614   Register OrigSrc = MI.getOperand(1).getReg();
2615   unsigned Flags = MI.getFlags();
2616   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2617          "this should not have been custom lowered");
2618 
2619   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2620   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2621   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2622   // V_FRACT bug is:
2623   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2624   //
2625   // Convert floor(x) to (x - fract(x))
2626 
2627   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2628     .addUse(OrigSrc)
2629     .setMIFlags(Flags);
2630 
2631   // Give source modifier matching some assistance before obscuring a foldable
2632   // pattern.
2633 
2634   // TODO: We can avoid the neg on the fract? The input sign to fract
2635   // shouldn't matter?
2636   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2637 
2638   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2639 
2640   Register Min = MRI.createGenericVirtualRegister(S64);
2641 
2642   // We don't need to concern ourselves with the snan handling difference, so
2643   // use the one which will directly select.
2644   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2645   if (MFI->getMode().IEEE)
2646     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2647   else
2648     B.buildFMinNum(Min, Fract, Const, Flags);
2649 
2650   Register CorrectedFract = Min;
2651   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2652     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2653     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2654   }
2655 
2656   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2657   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2658 
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 // Turn an illegal packed v2s16 build vector into bit operations.
2664 // TODO: This should probably be a bitcast action in LegalizerHelper.
2665 bool AMDGPULegalizerInfo::legalizeBuildVector(
2666   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2667   Register Dst = MI.getOperand(0).getReg();
2668   const LLT S32 = LLT::scalar(32);
2669   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2670 
2671   Register Src0 = MI.getOperand(1).getReg();
2672   Register Src1 = MI.getOperand(2).getReg();
2673   assert(MRI.getType(Src0) == LLT::scalar(16));
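  // Merge the two s16 sources into an s32 (src0 in the low 16 bits, src1 in
  // the high 16 bits), then bitcast the result to v2s16.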
2674 
2675   auto Merge = B.buildMerge(S32, {Src0, Src1});
2676   B.buildBitcast(Dst, Merge);
2677 
2678   MI.eraseFromParent();
2679   return true;
2680 }
2681 
2682 // Check that this is a G_XOR x, -1
2683 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2684   if (MI.getOpcode() != TargetOpcode::G_XOR)
2685     return false;
2686   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2687   return ConstVal && *ConstVal == -1;
2688 }
2689 
// Return the use branch instruction, or null if the usage is invalid.
2691 static MachineInstr *
2692 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2693                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2694   Register CondDef = MI.getOperand(0).getReg();
2695   if (!MRI.hasOneNonDBGUse(CondDef))
2696     return nullptr;
2697 
2698   MachineBasicBlock *Parent = MI.getParent();
2699   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2700 
2701   if (isNot(MRI, *UseMI)) {
2702     Register NegatedCond = UseMI->getOperand(0).getReg();
2703     if (!MRI.hasOneNonDBGUse(NegatedCond))
2704       return nullptr;
2705 
2706     // We're deleting the def of this value, so we need to remove it.
2707     UseMI->eraseFromParent();
2708 
2709     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2710     Negated = true;
2711   }
2712 
2713   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2714     return nullptr;
2715 
2716   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2717   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2718   if (Next == Parent->end()) {
2719     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2720     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2721       return nullptr;
2722     UncondBrTarget = &*NextMBB;
2723   } else {
2724     if (Next->getOpcode() != AMDGPU::G_BR)
2725       return nullptr;
2726     Br = &*Next;
2727     UncondBrTarget = Br->getOperand(0).getMBB();
2728   }
2729 
2730   return UseMI;
2731 }
2732 
2733 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2734                                          const ArgDescriptor *Arg,
2735                                          const TargetRegisterClass *ArgRC,
2736                                          LLT ArgTy) const {
2737   MCRegister SrcReg = Arg->getRegister();
2738   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2739   assert(DstReg.isVirtual() && "Virtual register expected");
2740 
2741   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2742                                              ArgTy);
2743   if (Arg->isMasked()) {
2744     // TODO: Should we try to emit this once in the entry block?
2745     const LLT S32 = LLT::scalar(32);
2746     const unsigned Mask = Arg->getMask();
2747     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
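    // This extracts the bit-field: Dst = (LiveIn >> Shift) & (Mask >> Shift).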
2748 
2749     Register AndMaskSrc = LiveIn;
2750 
2751     if (Shift != 0) {
2752       auto ShiftAmt = B.buildConstant(S32, Shift);
2753       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2754     }
2755 
2756     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2757   } else {
2758     B.buildCopy(DstReg, LiveIn);
2759   }
2760 
2761   return true;
2762 }
2763 
2764 bool AMDGPULegalizerInfo::loadInputValue(
2765     Register DstReg, MachineIRBuilder &B,
2766     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2767   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2768   const ArgDescriptor *Arg;
2769   const TargetRegisterClass *ArgRC;
2770   LLT ArgTy;
2771   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2772 
2773   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2774     return false; // TODO: Handle these
2775   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2776 }
2777 
2778 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2779     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2780     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2781   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2782     return false;
2783 
2784   MI.eraseFromParent();
2785   return true;
2786 }
2787 
2788 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2789                                        MachineRegisterInfo &MRI,
2790                                        MachineIRBuilder &B) const {
2791   Register Dst = MI.getOperand(0).getReg();
2792   LLT DstTy = MRI.getType(Dst);
2793   LLT S16 = LLT::scalar(16);
2794   LLT S32 = LLT::scalar(32);
2795   LLT S64 = LLT::scalar(64);
2796 
2797   if (DstTy == S16)
2798     return legalizeFDIV16(MI, MRI, B);
2799   if (DstTy == S32)
2800     return legalizeFDIV32(MI, MRI, B);
2801   if (DstTy == S64)
2802     return legalizeFDIV64(MI, MRI, B);
2803 
2804   return false;
2805 }
2806 
2807 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2808                                                         Register DstDivReg,
2809                                                         Register DstRemReg,
2810                                                         Register X,
2811                                                         Register Y) const {
2812   const LLT S1 = LLT::scalar(1);
2813   const LLT S32 = LLT::scalar(32);
2814 
2815   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2816   // algorithm used here.
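  //
  // Roughly: compute a fixed-point estimate z ~= 2^32 / y starting from
  // V_RCP_IFLAG_F32, refine it with one Newton-Raphson step, then take
  // q = umulh(x, z) and r = x - q * y and correct them with up to two
  // conditional adjustments.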
2817 
2818   // Initial estimate of inv(y).
2819   auto FloatY = B.buildUITOFP(S32, Y);
2820   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2821   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2822   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2823   auto Z = B.buildFPTOUI(S32, ScaledY);
2824 
2825   // One round of UNR.
2826   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2827   auto NegYZ = B.buildMul(S32, NegY, Z);
2828   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2829 
2830   // Quotient/remainder estimate.
2831   auto Q = B.buildUMulH(S32, X, Z);
2832   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2833 
2834   // First quotient/remainder refinement.
2835   auto One = B.buildConstant(S32, 1);
2836   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2837   if (DstDivReg)
2838     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2839   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2840 
2841   // Second quotient/remainder refinement.
2842   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2843   if (DstDivReg)
2844     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2845 
2846   if (DstRemReg)
2847     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2848 }
2849 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2851 //
2852 // Return lo, hi of result
2853 //
2854 // %cvt.lo = G_UITOFP Val.lo
2855 // %cvt.hi = G_UITOFP Val.hi
2856 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2857 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2858 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2859 // %mul2 = G_FMUL %mul1, 2**(-32)
2860 // %trunc = G_INTRINSIC_TRUNC %mul2
2861 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2862 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2863 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2864                                                        Register Val) {
2865   const LLT S32 = LLT::scalar(32);
2866   auto Unmerge = B.buildUnmerge(S32, Val);
2867 
2868   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2869   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2870 
2871   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2872                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2873 
2874   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2875   auto Mul1 =
2876       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2877 
2878   // 2**(-32)
2879   auto Mul2 =
2880       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2881   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2882 
2883   // -(2**32)
2884   auto Mad2 = B.buildFMAD(S32, Trunc,
2885                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2886 
2887   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2888   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2889 
2890   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2891 }
2892 
2893 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2894                                                         Register DstDivReg,
2895                                                         Register DstRemReg,
2896                                                         Register Numer,
2897                                                         Register Denom) const {
2898   const LLT S32 = LLT::scalar(32);
2899   const LLT S64 = LLT::scalar(64);
2900   const LLT S1 = LLT::scalar(1);
2901   Register RcpLo, RcpHi;
2902 
2903   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2904 
2905   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2906 
2907   auto Zero64 = B.buildConstant(S64, 0);
2908   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2909 
2910   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2911   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2912 
2913   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2914   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2915   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2916 
2917   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2918   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2919   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2920   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2921 
2922   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2923   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2924   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2925   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2926   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2927 
2928   auto Zero32 = B.buildConstant(S32, 0);
2929   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2930   auto Add2_HiC =
2931       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2932   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2933   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2934 
2935   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2936   Register NumerLo = UnmergeNumer.getReg(0);
2937   Register NumerHi = UnmergeNumer.getReg(1);
2938 
2939   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2940   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2941   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2942   Register Mul3_Lo = UnmergeMul3.getReg(0);
2943   Register Mul3_Hi = UnmergeMul3.getReg(1);
2944   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2945   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2946   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2947   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2948 
2949   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2950   Register DenomLo = UnmergeDenom.getReg(0);
2951   Register DenomHi = UnmergeDenom.getReg(1);
2952 
2953   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2954   auto C1 = B.buildSExt(S32, CmpHi);
2955 
2956   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2957   auto C2 = B.buildSExt(S32, CmpLo);
2958 
2959   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2960   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2961 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we emit 4 selects
  // after the potential endif in place of PHIs.
2965 
2966   // if C3 != 0 ...
2967   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2968   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2969   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2970   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2971 
2972   auto One64 = B.buildConstant(S64, 1);
2973   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2974 
2975   auto C4 =
2976       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2977   auto C5 =
2978       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2979   auto C6 = B.buildSelect(
2980       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2981 
2982   // if (C6 != 0)
2983   auto Add4 = B.buildAdd(S64, Add3, One64);
2984   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2985 
2986   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2987   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2988   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2989 
2990   // endif C6
2991   // endif C3
2992 
2993   if (DstDivReg) {
2994     auto Sel1 = B.buildSelect(
2995         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2996     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
2997                   Sel1, MulHi3);
2998   }
2999 
3000   if (DstRemReg) {
3001     auto Sel2 = B.buildSelect(
3002         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3003     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3004                   Sel2, Sub1);
3005   }
3006 }
3007 
3008 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3009                                                   MachineRegisterInfo &MRI,
3010                                                   MachineIRBuilder &B) const {
3011   Register DstDivReg, DstRemReg;
3012   switch (MI.getOpcode()) {
3013   default:
3014     llvm_unreachable("Unexpected opcode!");
3015   case AMDGPU::G_UDIV: {
3016     DstDivReg = MI.getOperand(0).getReg();
3017     break;
3018   }
3019   case AMDGPU::G_UREM: {
3020     DstRemReg = MI.getOperand(0).getReg();
3021     break;
3022   }
3023   case AMDGPU::G_UDIVREM: {
3024     DstDivReg = MI.getOperand(0).getReg();
3025     DstRemReg = MI.getOperand(1).getReg();
3026     break;
3027   }
3028   }
3029 
3030   const LLT S64 = LLT::scalar(64);
3031   const LLT S32 = LLT::scalar(32);
3032   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3033   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3034   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3035   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3036 
3037   if (Ty == S32)
3038     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3039   else if (Ty == S64)
3040     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3041   else
3042     return false;
3043 
3044   MI.eraseFromParent();
3045   return true;
3046 }
3047 
3048 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3049                                                 MachineRegisterInfo &MRI,
3050                                                 MachineIRBuilder &B) const {
3051   const LLT S64 = LLT::scalar(64);
3052   const LLT S32 = LLT::scalar(32);
3053 
3054   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3055   if (Ty != S32 && Ty != S64)
3056     return false;
3057 
3058   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3059   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3060   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
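
  // Expand the signed operation in terms of the unsigned expansion: take the
  // absolute value of each operand as (x + sign) ^ sign, where
  // sign = x >> (bits - 1), then fix up the sign of the quotient/remainder
  // afterwards.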
3061 
3062   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3063   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3064   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3065 
3066   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3067   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3068 
3069   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3070   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3071 
3072   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3073   switch (MI.getOpcode()) {
3074   default:
3075     llvm_unreachable("Unexpected opcode!");
3076   case AMDGPU::G_SDIV: {
3077     DstDivReg = MI.getOperand(0).getReg();
3078     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3079     break;
3080   }
3081   case AMDGPU::G_SREM: {
3082     DstRemReg = MI.getOperand(0).getReg();
3083     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3084     break;
3085   }
3086   case AMDGPU::G_SDIVREM: {
3087     DstDivReg = MI.getOperand(0).getReg();
3088     DstRemReg = MI.getOperand(1).getReg();
3089     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3090     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3091     break;
3092   }
3093   }
3094 
3095   if (Ty == S32)
3096     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3097   else
3098     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3099 
3100   if (DstDivReg) {
3101     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3102     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3103     B.buildSub(DstDivReg, SignXor, Sign);
3104   }
3105 
3106   if (DstRemReg) {
3107     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3108     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3109     B.buildSub(DstRemReg, SignXor, Sign);
3110   }
3111 
3112   MI.eraseFromParent();
3113   return true;
3114 }
3115 
3116 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3117                                                  MachineRegisterInfo &MRI,
3118                                                  MachineIRBuilder &B) const {
3119   Register Res = MI.getOperand(0).getReg();
3120   Register LHS = MI.getOperand(1).getReg();
3121   Register RHS = MI.getOperand(2).getReg();
3122   uint16_t Flags = MI.getFlags();
3123   LLT ResTy = MRI.getType(Res);
3124 
3125   const MachineFunction &MF = B.getMF();
3126   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3127                             MI.getFlag(MachineInstr::FmAfn);
3128 
3129   if (!AllowInaccurateRcp)
3130     return false;
3131 
3132   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3133     // 1 / x -> RCP(x)
3134     if (CLHS->isExactlyValue(1.0)) {
3135       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3136         .addUse(RHS)
3137         .setMIFlags(Flags);
3138 
3139       MI.eraseFromParent();
3140       return true;
3141     }
3142 
3143     // -1 / x -> RCP( FNEG(x) )
3144     if (CLHS->isExactlyValue(-1.0)) {
3145       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3146       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3147         .addUse(FNeg.getReg(0))
3148         .setMIFlags(Flags);
3149 
3150       MI.eraseFromParent();
3151       return true;
3152     }
3153   }
3154 
3155   // x / y -> x * (1.0 / y)
3156   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3157     .addUse(RHS)
3158     .setMIFlags(Flags);
3159   B.buildFMul(Res, LHS, RCP, Flags);
3160 
3161   MI.eraseFromParent();
3162   return true;
3163 }
3164 
3165 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3166                                                    MachineRegisterInfo &MRI,
3167                                                    MachineIRBuilder &B) const {
3168   Register Res = MI.getOperand(0).getReg();
3169   Register X = MI.getOperand(1).getReg();
3170   Register Y = MI.getOperand(2).getReg();
3171   uint16_t Flags = MI.getFlags();
3172   LLT ResTy = MRI.getType(Res);
3173 
3174   const MachineFunction &MF = B.getMF();
3175   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3176                             MI.getFlag(MachineInstr::FmAfn);
3177 
3178   if (!AllowInaccurateRcp)
3179     return false;
3180 
3181   auto NegY = B.buildFNeg(ResTy, Y);
3182   auto One = B.buildFConstant(ResTy, 1.0);
3183 
3184   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3185     .addUse(Y)
3186     .setMIFlags(Flags);
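
  // Refine r = rcp(y) with two Newton-Raphson iterations, r <- r + r*(1 - y*r),
  // then form q = x*r and apply one final fma-based correction,
  // q <- q + r*(x - y*q).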
3187 
3188   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3189   R = B.buildFMA(ResTy, Tmp0, R, R);
3190 
3191   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3192   R = B.buildFMA(ResTy, Tmp1, R, R);
3193 
3194   auto Ret = B.buildFMul(ResTy, X, R);
3195   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3196 
3197   B.buildFMA(Res, Tmp2, R, Ret);
3198   MI.eraseFromParent();
3199   return true;
3200 }
3201 
3202 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3203                                          MachineRegisterInfo &MRI,
3204                                          MachineIRBuilder &B) const {
3205   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3206     return true;
3207 
3208   Register Res = MI.getOperand(0).getReg();
3209   Register LHS = MI.getOperand(1).getReg();
3210   Register RHS = MI.getOperand(2).getReg();
3211 
3212   uint16_t Flags = MI.getFlags();
3213 
3214   LLT S16 = LLT::scalar(16);
3215   LLT S32 = LLT::scalar(32);
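
  // f16 division is expanded by extending both operands to f32, multiplying the
  // numerator by rcp of the denominator, truncating back to f16, and using
  // div_fixup to patch up the special cases.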
3216 
3217   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3218   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3219 
3220   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3221     .addUse(RHSExt.getReg(0))
3222     .setMIFlags(Flags);
3223 
3224   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3225   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3226 
3227   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3228     .addUse(RDst.getReg(0))
3229     .addUse(RHS)
3230     .addUse(LHS)
3231     .setMIFlags(Flags);
3232 
3233   MI.eraseFromParent();
3234   return true;
3235 }
3236 
// Enable or disable FP32 denormal mode. When 'Enable' is true, emit
// instructions that enable denormal handling; when 'Enable' is false, restore
// the function's default FP32 denormal mode.
3239 static void toggleSPDenormMode(bool Enable,
3240                                MachineIRBuilder &B,
3241                                const GCNSubtarget &ST,
3242                                AMDGPU::SIModeRegisterDefaults Mode) {
3243   // Set SP denorm mode to this value.
3244   unsigned SPDenormMode =
3245     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3246 
3247   if (ST.hasDenormModeInst()) {
3248     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3249     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3250 
3251     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3252     B.buildInstr(AMDGPU::S_DENORM_MODE)
3253       .addImm(NewDenormModeValue);
3254 
3255   } else {
3256     // Select FP32 bit field in mode register.
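    // This is hwreg(HW_REG_MODE, 4, 2): a two-bit field at offset 4, i.e. the
    // FP32 portion of the MODE register's FP_DENORM bits.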
3257     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3258                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3259                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3260 
3261     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3262       .addImm(SPDenormMode)
3263       .addImm(SPDenormModeBitField);
3264   }
3265 }
3266 
3267 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3268                                          MachineRegisterInfo &MRI,
3269                                          MachineIRBuilder &B) const {
3270   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3271     return true;
3272 
3273   Register Res = MI.getOperand(0).getReg();
3274   Register LHS = MI.getOperand(1).getReg();
3275   Register RHS = MI.getOperand(2).getReg();
3276   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3277   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3278 
3279   uint16_t Flags = MI.getFlags();
3280 
3281   LLT S32 = LLT::scalar(32);
3282   LLT S1 = LLT::scalar(1);
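
  // This follows the usual div_scale / rcp / div_fmas / div_fixup expansion:
  // scale the operands, refine the reciprocal of the scaled denominator with a
  // chain of fmas, and let div_fixup detect and patch up the special-case
  // results.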
3283 
3284   auto One = B.buildFConstant(S32, 1.0f);
3285 
3286   auto DenominatorScaled =
3287     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3288       .addUse(LHS)
3289       .addUse(RHS)
3290       .addImm(0)
3291       .setMIFlags(Flags);
3292   auto NumeratorScaled =
3293     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3294       .addUse(LHS)
3295       .addUse(RHS)
3296       .addImm(1)
3297       .setMIFlags(Flags);
3298 
3299   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3300     .addUse(DenominatorScaled.getReg(0))
3301     .setMIFlags(Flags);
3302   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3303 
3304   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3305   // aren't modeled as reading it.
3306   if (!Mode.allFP32Denormals())
3307     toggleSPDenormMode(true, B, ST, Mode);
3308 
3309   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3310   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3311   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3312   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3313   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3314   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3315 
3316   if (!Mode.allFP32Denormals())
3317     toggleSPDenormMode(false, B, ST, Mode);
3318 
3319   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3320     .addUse(Fma4.getReg(0))
3321     .addUse(Fma1.getReg(0))
3322     .addUse(Fma3.getReg(0))
3323     .addUse(NumeratorScaled.getReg(1))
3324     .setMIFlags(Flags);
3325 
3326   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3327     .addUse(Fmas.getReg(0))
3328     .addUse(RHS)
3329     .addUse(LHS)
3330     .setMIFlags(Flags);
3331 
3332   MI.eraseFromParent();
3333   return true;
3334 }
3335 
3336 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3337                                          MachineRegisterInfo &MRI,
3338                                          MachineIRBuilder &B) const {
3339   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3340     return true;
3341 
3342   Register Res = MI.getOperand(0).getReg();
3343   Register LHS = MI.getOperand(1).getReg();
3344   Register RHS = MI.getOperand(2).getReg();
3345 
3346   uint16_t Flags = MI.getFlags();
3347 
3348   LLT S64 = LLT::scalar(64);
3349   LLT S1 = LLT::scalar(1);
3350 
3351   auto One = B.buildFConstant(S64, 1.0);
3352 
3353   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3354     .addUse(LHS)
3355     .addUse(RHS)
3356     .addImm(0)
3357     .setMIFlags(Flags);
3358 
3359   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3360 
3361   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3362     .addUse(DivScale0.getReg(0))
3363     .setMIFlags(Flags);
3364 
3365   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3366   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3367   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3368 
3369   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3370     .addUse(LHS)
3371     .addUse(RHS)
3372     .addImm(1)
3373     .setMIFlags(Flags);
3374 
3375   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3376   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3377   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3378 
3379   Register Scale;
3380   if (!ST.hasUsableDivScaleConditionOutput()) {
3381     // Workaround a hardware bug on SI where the condition output from div_scale
3382     // is not usable.
3383 
3384     LLT S32 = LLT::scalar(32);
3385 
3386     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3387     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3388     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3389     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3390 
3391     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3392                               Scale1Unmerge.getReg(1));
3393     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3394                               Scale0Unmerge.getReg(1));
3395     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3396   } else {
3397     Scale = DivScale1.getReg(1);
3398   }
3399 
3400   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3401     .addUse(Fma4.getReg(0))
3402     .addUse(Fma3.getReg(0))
3403     .addUse(Mul.getReg(0))
3404     .addUse(Scale)
3405     .setMIFlags(Flags);
3406 
3407   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3408     .addUse(Fmas.getReg(0))
3409     .addUse(RHS)
3410     .addUse(LHS)
3411     .setMIFlags(Flags);
3412 
3413   MI.eraseFromParent();
3414   return true;
3415 }
3416 
3417 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3418                                                  MachineRegisterInfo &MRI,
3419                                                  MachineIRBuilder &B) const {
3420   Register Res = MI.getOperand(0).getReg();
3421   Register LHS = MI.getOperand(2).getReg();
3422   Register RHS = MI.getOperand(3).getReg();
3423   uint16_t Flags = MI.getFlags();
3424 
3425   LLT S32 = LLT::scalar(32);
3426   LLT S1 = LLT::scalar(1);
3427 
3428   auto Abs = B.buildFAbs(S32, RHS, Flags);
3429   const APFloat C0Val(1.0f);
3430 
3431   auto C0 = B.buildConstant(S32, 0x6f800000);
3432   auto C1 = B.buildConstant(S32, 0x2f800000);
3433   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3434 
3435   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3436   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3437 
3438   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3439 
3440   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3441     .addUse(Mul0.getReg(0))
3442     .setMIFlags(Flags);
3443 
3444   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3445 
3446   B.buildFMul(Res, Sel, Mul1, Flags);
3447 
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3453 // FIXME: Why do we handle this one but not other removed instructions?
3454 //
3455 // Reciprocal square root.  The clamp prevents infinite results, clamping
3456 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3457 // +-max_float.
3458 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3459                                                     MachineRegisterInfo &MRI,
3460                                                     MachineIRBuilder &B) const {
3461   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3462     return true;
3463 
3464   Register Dst = MI.getOperand(0).getReg();
3465   Register Src = MI.getOperand(2).getReg();
3466   auto Flags = MI.getFlags();
3467 
3468   LLT Ty = MRI.getType(Dst);
3469 
3470   const fltSemantics *FltSemantics;
3471   if (Ty == LLT::scalar(32))
3472     FltSemantics = &APFloat::IEEEsingle();
3473   else if (Ty == LLT::scalar(64))
3474     FltSemantics = &APFloat::IEEEdouble();
3475   else
3476     return false;
3477 
3478   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3479     .addUse(Src)
3480     .setMIFlags(Flags);
3481 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted (or not quieted) any snan; just use whichever
  // min/max variant will directly select.
3484   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3485   const bool UseIEEE = MFI->getMode().IEEE;
3486 
3487   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3488   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3489                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3490 
3491   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3492 
3493   if (UseIEEE)
3494     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3495   else
3496     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3497   MI.eraseFromParent();
3498   return true;
3499 }
3500 
3501 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3502   switch (IID) {
3503   case Intrinsic::amdgcn_ds_fadd:
3504     return AMDGPU::G_ATOMICRMW_FADD;
3505   case Intrinsic::amdgcn_ds_fmin:
3506     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3507   case Intrinsic::amdgcn_ds_fmax:
3508     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3509   default:
3510     llvm_unreachable("not a DS FP intrinsic");
3511   }
3512 }
3513 
3514 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3515                                                       MachineInstr &MI,
3516                                                       Intrinsic::ID IID) const {
3517   GISelChangeObserver &Observer = Helper.Observer;
3518   Observer.changingInstr(MI);
3519 
3520   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3521 
3522   // The remaining operands were used to set fields in the MemOperand on
3523   // construction.
3524   for (int I = 6; I > 3; --I)
3525     MI.RemoveOperand(I);
3526 
3527   MI.RemoveOperand(1); // Remove the intrinsic ID.
3528   Observer.changedInstr(MI);
3529   return true;
3530 }
3531 
3532 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3533                                             MachineRegisterInfo &MRI,
3534                                             MachineIRBuilder &B) const {
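  // The implicit kernel arguments live at a fixed offset past the explicit
  // kernel arguments, so materialize kernarg.segment.ptr plus that offset.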
3535   uint64_t Offset =
3536     ST.getTargetLowering()->getImplicitParameterOffset(
3537       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3538   LLT DstTy = MRI.getType(DstReg);
3539   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3540 
3541   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3542   if (!loadInputValue(KernargPtrReg, B,
3543                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3544     return false;
3545 
3546   // FIXME: This should be nuw
3547   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3548   return true;
3549 }
3550 
3551 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3552                                                  MachineRegisterInfo &MRI,
3553                                                  MachineIRBuilder &B) const {
3554   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3555   if (!MFI->isEntryFunction()) {
3556     return legalizePreloadedArgIntrin(MI, MRI, B,
3557                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3558   }
3559 
3560   Register DstReg = MI.getOperand(0).getReg();
3561   if (!getImplicitArgPtr(DstReg, MRI, B))
3562     return false;
3563 
3564   MI.eraseFromParent();
3565   return true;
3566 }
3567 
3568 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3569                                               MachineRegisterInfo &MRI,
3570                                               MachineIRBuilder &B,
3571                                               unsigned AddrSpace) const {
3572   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3573   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3574   Register Hi32 = Unmerge.getReg(1);
3575 
3576   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3577   MI.eraseFromParent();
3578   return true;
3579 }
3580 
3581 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3582 // offset (the offset that is included in bounds checking and swizzling, to be
3583 // split between the instruction's voffset and immoffset fields) and soffset
3584 // (the offset that is excluded from bounds checking and swizzling, to go in
3585 // the instruction's soffset field).  This function takes the first kind of
3586 // offset and figures out how to split it between voffset and immoffset.
3587 std::tuple<Register, unsigned, unsigned>
3588 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3589                                         Register OrigOffset) const {
3590   const unsigned MaxImm = 4095;
3591   Register BaseReg;
3592   unsigned TotalConstOffset;
3593   const LLT S32 = LLT::scalar(32);
3594   MachineRegisterInfo &MRI = *B.getMRI();
3595 
3596   std::tie(BaseReg, TotalConstOffset) =
3597       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3598 
3599   unsigned ImmOffset = TotalConstOffset;
3600 
3601   // If BaseReg is a pointer, convert it to int.
3602   if (MRI.getType(BaseReg).isPointer())
3603     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3604 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field so that the value that is
  // copied/added for the voffset field is a multiple of 4096, and it stands more
  // chance of being CSEd with the copy/add for another similar load/store.
  // However, do not round the voffset value down to a multiple of 4096 if the
  // rounded value would be negative, as it appears to be illegal to have a
  // negative offset in the vgpr, even if adding the immediate offset makes it
  // positive.
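  // For example, a total constant offset of 5000 is split into ImmOffset = 904
  // (5000 & 4095) and Overflow = 4096, with the 4096 added into the voffset
  // register.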
3612   unsigned Overflow = ImmOffset & ~MaxImm;
3613   ImmOffset -= Overflow;
3614   if ((int32_t)Overflow < 0) {
3615     Overflow += ImmOffset;
3616     ImmOffset = 0;
3617   }
3618 
3619   if (Overflow != 0) {
3620     if (!BaseReg) {
3621       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3622     } else {
3623       auto OverflowVal = B.buildConstant(S32, Overflow);
3624       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3625     }
3626   }
3627 
3628   if (!BaseReg)
3629     BaseReg = B.buildConstant(S32, 0).getReg(0);
3630 
3631   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3632 }
3633 
3634 /// Handle register layout difference for f16 images for some subtargets.
3635 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3636                                              MachineRegisterInfo &MRI,
3637                                              Register Reg,
3638                                              bool ImageStore) const {
3639   const LLT S16 = LLT::scalar(16);
3640   const LLT S32 = LLT::scalar(32);
3641   LLT StoreVT = MRI.getType(Reg);
3642   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3643 
3644   if (ST.hasUnpackedD16VMem()) {
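    // In the unpacked layout each 16-bit element occupies the low half of its
    // own 32-bit register, so any-extend every element and rebuild the value as
    // a vector of 32-bit elements.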
3645     auto Unmerge = B.buildUnmerge(S16, Reg);
3646 
3647     SmallVector<Register, 4> WideRegs;
3648     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3649       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3650 
3651     int NumElts = StoreVT.getNumElements();
3652 
3653     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3654   }
3655 
3656   if (ImageStore && ST.hasImageStoreD16Bug()) {
3657     if (StoreVT.getNumElements() == 2) {
3658       SmallVector<Register, 4> PackedRegs;
3659       Reg = B.buildBitcast(S32, Reg).getReg(0);
3660       PackedRegs.push_back(Reg);
3661       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3662       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3663     }
3664 
3665     if (StoreVT.getNumElements() == 3) {
3666       SmallVector<Register, 4> PackedRegs;
3667       auto Unmerge = B.buildUnmerge(S16, Reg);
3668       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3669         PackedRegs.push_back(Unmerge.getReg(I));
3670       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3671       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3672       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3673     }
3674 
3675     if (StoreVT.getNumElements() == 4) {
3676       SmallVector<Register, 4> PackedRegs;
3677       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3678       auto Unmerge = B.buildUnmerge(S32, Reg);
3679       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3680         PackedRegs.push_back(Unmerge.getReg(I));
3681       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3682       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3683     }
3684 
3685     llvm_unreachable("invalid data type");
3686   }
3687 
3688   return Reg;
3689 }
3690 
3691 Register AMDGPULegalizerInfo::fixStoreSourceType(
3692   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3693   MachineRegisterInfo *MRI = B.getMRI();
3694   LLT Ty = MRI->getType(VData);
3695 
3696   const LLT S16 = LLT::scalar(16);
3697 
  // Fixup illegal register types for i8 and i16 stores.
3699   if (Ty == LLT::scalar(8) || Ty == S16) {
3700     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3701     return AnyExt;
3702   }
3703 
3704   if (Ty.isVector()) {
3705     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3706       if (IsFormat)
3707         return handleD16VData(B, *MRI, VData);
3708     }
3709   }
3710 
3711   return VData;
3712 }
3713 
3714 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3715                                               MachineRegisterInfo &MRI,
3716                                               MachineIRBuilder &B,
3717                                               bool IsTyped,
3718                                               bool IsFormat) const {
3719   Register VData = MI.getOperand(1).getReg();
3720   LLT Ty = MRI.getType(VData);
3721   LLT EltTy = Ty.getScalarType();
3722   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3723   const LLT S32 = LLT::scalar(32);
3724 
3725   VData = fixStoreSourceType(B, VData, IsFormat);
3726   Register RSrc = MI.getOperand(2).getReg();
3727 
3728   MachineMemOperand *MMO = *MI.memoperands_begin();
3729   const int MemSize = MMO->getSize();
3730 
3731   unsigned ImmOffset;
3732   unsigned TotalOffset;
3733 
3734   // The typed intrinsics add an immediate after the registers.
3735   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3736 
3737   // The struct intrinsic variants add one additional operand over raw.
3738   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3739   Register VIndex;
3740   int OpOffset = 0;
3741   if (HasVIndex) {
3742     VIndex = MI.getOperand(3).getReg();
3743     OpOffset = 1;
3744   }
3745 
3746   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3747   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3748 
3749   unsigned Format = 0;
3750   if (IsTyped) {
3751     Format = MI.getOperand(5 + OpOffset).getImm();
3752     ++OpOffset;
3753   }
3754 
3755   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3756 
3757   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3758   if (TotalOffset != 0)
3759     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3760 
3761   unsigned Opc;
3762   if (IsTyped) {
3763     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3764                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3765   } else if (IsFormat) {
3766     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3767                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3768   } else {
3769     switch (MemSize) {
3770     case 1:
3771       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3772       break;
3773     case 2:
3774       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3775       break;
3776     default:
3777       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3778       break;
3779     }
3780   }
3781 
3782   if (!VIndex)
3783     VIndex = B.buildConstant(S32, 0).getReg(0);
3784 
3785   auto MIB = B.buildInstr(Opc)
3786     .addUse(VData)              // vdata
3787     .addUse(RSrc)               // rsrc
3788     .addUse(VIndex)             // vindex
3789     .addUse(VOffset)            // voffset
3790     .addUse(SOffset)            // soffset
3791     .addImm(ImmOffset);         // offset(imm)
3792 
3793   if (IsTyped)
3794     MIB.addImm(Format);
3795 
3796   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3797      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3798      .addMemOperand(MMO);
3799 
3800   MI.eraseFromParent();
3801   return true;
3802 }
3803 
3804 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3805                                              MachineRegisterInfo &MRI,
3806                                              MachineIRBuilder &B,
3807                                              bool IsFormat,
3808                                              bool IsTyped) const {
3809   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3810   MachineMemOperand *MMO = *MI.memoperands_begin();
3811   const int MemSize = MMO->getSize();
3812   const LLT S32 = LLT::scalar(32);
3813 
3814   Register Dst = MI.getOperand(0).getReg();
3815   Register RSrc = MI.getOperand(2).getReg();
3816 
3817   // The typed intrinsics add an immediate after the registers.
3818   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3819 
3820   // The struct intrinsic variants add one additional operand over raw.
3821   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3822   Register VIndex;
3823   int OpOffset = 0;
3824   if (HasVIndex) {
3825     VIndex = MI.getOperand(3).getReg();
3826     OpOffset = 1;
3827   }
3828 
3829   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3830   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3831 
3832   unsigned Format = 0;
3833   if (IsTyped) {
3834     Format = MI.getOperand(5 + OpOffset).getImm();
3835     ++OpOffset;
3836   }
3837 
3838   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3839   unsigned ImmOffset;
3840   unsigned TotalOffset;
3841 
3842   LLT Ty = MRI.getType(Dst);
3843   LLT EltTy = Ty.getScalarType();
3844   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3845   const bool Unpacked = ST.hasUnpackedD16VMem();
3846 
3847   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3848   if (TotalOffset != 0)
3849     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3850 
3851   unsigned Opc;
3852 
3853   if (IsTyped) {
3854     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3855                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3856   } else if (IsFormat) {
3857     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3858                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3859   } else {
3860     switch (MemSize) {
3861     case 1:
3862       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3863       break;
3864     case 2:
3865       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3866       break;
3867     default:
3868       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3869       break;
3870     }
3871   }
3872 
3873   Register LoadDstReg;
3874 
3875   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3876   LLT UnpackedTy = Ty.changeElementSize(32);
3877 
3878   if (IsExtLoad)
3879     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3880   else if (Unpacked && IsD16 && Ty.isVector())
3881     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3882   else
3883     LoadDstReg = Dst;
3884 
3885   if (!VIndex)
3886     VIndex = B.buildConstant(S32, 0).getReg(0);
3887 
3888   auto MIB = B.buildInstr(Opc)
3889     .addDef(LoadDstReg)         // vdata
3890     .addUse(RSrc)               // rsrc
3891     .addUse(VIndex)             // vindex
3892     .addUse(VOffset)            // voffset
3893     .addUse(SOffset)            // soffset
3894     .addImm(ImmOffset);         // offset(imm)
3895 
3896   if (IsTyped)
3897     MIB.addImm(Format);
3898 
3899   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3900      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3901      .addMemOperand(MMO);
3902 
3903   if (LoadDstReg != Dst) {
3904     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3905 
    // The result register was widened for the extending load; truncate back to
    // the original type.
3907     if (IsExtLoad)
3908       B.buildTrunc(Dst, LoadDstReg);
3909     else {
3910       // Repack to original 16-bit vector result
3911       // FIXME: G_TRUNC should work, but legalization currently fails
3912       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3913       SmallVector<Register, 4> Repack;
3914       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3915         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3916       B.buildMerge(Dst, Repack);
3917     }
3918   }
3919 
3920   MI.eraseFromParent();
3921   return true;
3922 }
3923 
3924 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3925                                                MachineIRBuilder &B,
3926                                                bool IsInc) const {
3927   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3928                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3929   B.buildInstr(Opc)
3930     .addDef(MI.getOperand(0).getReg())
3931     .addUse(MI.getOperand(2).getReg())
3932     .addUse(MI.getOperand(3).getReg())
3933     .cloneMemRefs(MI);
3934   MI.eraseFromParent();
3935   return true;
3936 }
3937 
3938 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3939   switch (IntrID) {
3940   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3941   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3942     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3943   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3944   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3945     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3946   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3947   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3948     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3949   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3950   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3951     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3952   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3953   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3954     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3955   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3956   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3957     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3958   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3959   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3960     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3961   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3962   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3963     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3964   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3965   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3966     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3967   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3968   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3969     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3970   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3971   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3972     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3973   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3974   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3975     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3976   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3977   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3978     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3979   case Intrinsic::amdgcn_buffer_atomic_fadd:
3980   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3981   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3982     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3983   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3984   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3985     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3986   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3987   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3988     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3989   default:
3990     llvm_unreachable("unhandled atomic opcode");
3991   }
3992 }
3993 
3994 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3995                                                MachineIRBuilder &B,
3996                                                Intrinsic::ID IID) const {
3997   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3998                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3999   const bool HasReturn = MI.getNumExplicitDefs() != 0;
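
  // Intrinsic operand order: [dst,] intrinsic id, vdata, [cmp,] rsrc, [vindex,]
  // voffset, soffset, aux. OpOffset accounts for the optional operands.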
4000 
4001   Register Dst;
4002 
4003   int OpOffset = 0;
4004   if (HasReturn) {
4005     // A few FP atomics do not support return values.
4006     Dst = MI.getOperand(0).getReg();
4007   } else {
4008     OpOffset = -1;
4009   }
4010 
4011   Register VData = MI.getOperand(2 + OpOffset).getReg();
4012   Register CmpVal;
4013 
4014   if (IsCmpSwap) {
4015     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4016     ++OpOffset;
4017   }
4018 
4019   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4020   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4021 
4022   // The struct intrinsic variants add one additional operand over raw.
4023   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4024   Register VIndex;
4025   if (HasVIndex) {
4026     VIndex = MI.getOperand(4 + OpOffset).getReg();
4027     ++OpOffset;
4028   }
4029 
4030   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4031   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4032   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4033 
4034   MachineMemOperand *MMO = *MI.memoperands_begin();
4035 
4036   unsigned ImmOffset;
4037   unsigned TotalOffset;
4038   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4039   if (TotalOffset != 0)
4040     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4041 
4042   if (!VIndex)
4043     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4044 
4045   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4046 
4047   if (HasReturn)
4048     MIB.addDef(Dst);
4049 
4050   MIB.addUse(VData); // vdata
4051 
4052   if (IsCmpSwap)
4053     MIB.addReg(CmpVal);
4054 
4055   MIB.addUse(RSrc)               // rsrc
4056      .addUse(VIndex)             // vindex
4057      .addUse(VOffset)            // voffset
4058      .addUse(SOffset)            // soffset
4059      .addImm(ImmOffset)          // offset(imm)
4060      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4061      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4062      .addMemOperand(MMO);
4063 
4064   MI.eraseFromParent();
4065   return true;
4066 }
4067 
4068 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4069 /// vector with s16 typed elements.
4070 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4071                                       SmallVectorImpl<Register> &PackedAddrs,
4072                                       unsigned ArgOffset,
4073                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4074                                       bool IsA16, bool IsG16) {
4075   const LLT S16 = LLT::scalar(16);
4076   const LLT V2S16 = LLT::vector(2, 16);
4077   auto EndIdx = Intr->VAddrEnd;
4078 
4079   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4080     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4081     if (!SrcOp.isReg())
4082       continue; // _L to _LZ may have eliminated this.
4083 
4084     Register AddrReg = SrcOp.getReg();
4085 
4086     if (I < Intr->GradientStart) {
4087       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4088       PackedAddrs.push_back(AddrReg);
4089     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4090                (I >= Intr->CoordStart && !IsA16)) {
4091       // Handle any gradient or coordinate operands that should not be packed
4092       PackedAddrs.push_back(AddrReg);
4093     } else {
4094       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4095       // derivatives dx/dh and dx/dv are packed with undef.
4096       if (((I + 1) >= EndIdx) ||
4097           ((Intr->NumGradients / 2) % 2 == 1 &&
4098            (I == static_cast<unsigned>(Intr->GradientStart +
4099                                        (Intr->NumGradients / 2) - 1) ||
4100             I == static_cast<unsigned>(Intr->GradientStart +
4101                                        Intr->NumGradients - 1))) ||
4102           // Check for _L to _LZ optimization
4103           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4104         PackedAddrs.push_back(
4105             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4106                 .getReg(0));
4107       } else {
4108         PackedAddrs.push_back(
4109             B.buildBuildVector(
4110                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4111                 .getReg(0));
4112         ++I;
4113       }
4114     }
4115   }
4116 }
4117 
4118 /// Convert from separate vaddr components to a single vector address register,
4119 /// and replace the remaining operands with $noreg.
4120 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4121                                      int DimIdx, int NumVAddrs) {
4122   const LLT S32 = LLT::scalar(32);
4123 
4124   SmallVector<Register, 8> AddrRegs;
4125   for (int I = 0; I != NumVAddrs; ++I) {
4126     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4127     if (SrcOp.isReg()) {
4128       AddrRegs.push_back(SrcOp.getReg());
4129       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4130     }
4131   }
4132 
4133   int NumAddrRegs = AddrRegs.size();
4134   if (NumAddrRegs != 1) {
4135     // Round up to 8 elements for v5-v7
4136     // FIXME: Missing intermediate sized register classes and instructions.
4137     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4138       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4139       auto Undef = B.buildUndef(S32);
4140       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4141       NumAddrRegs = RoundedNumRegs;
4142     }
4143 
4144     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4145     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4146   }
4147 
4148   for (int I = 1; I != NumVAddrs; ++I) {
4149     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4150     if (SrcOp.isReg())
4151       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4152   }
4153 }
4154 
4155 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4156 ///
4157 /// Depending on the subtarget, load/store with 16-bit element data need to be
4158 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4159 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4160 /// registers.
4161 ///
4162 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4167 /// now unnecessary arguments with $noreg.
4168 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4169     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4170     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4171 
4172   const unsigned NumDefs = MI.getNumExplicitDefs();
4173   const unsigned ArgOffset = NumDefs + 1;
4174   bool IsTFE = NumDefs == 2;
4175   // We are only processing the operands of d16 image operations on subtargets
4176   // that use the unpacked register layout, or need to repack the TFE result.
4177 
4178   // TODO: Do we need to guard against already legalized intrinsics?
4179   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4180       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4181 
4182   MachineRegisterInfo *MRI = B.getMRI();
4183   const LLT S32 = LLT::scalar(32);
4184   const LLT S16 = LLT::scalar(16);
4185   const LLT V2S16 = LLT::vector(2, 16);
4186 
4187   unsigned DMask = 0;
4188 
4189   // Check for 16 bit addresses and pack if true.
4190   LLT GradTy =
4191       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4192   LLT AddrTy =
4193       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4194   const bool IsG16 = GradTy == S16;
4195   const bool IsA16 = AddrTy == S16;
4196 
4197   int DMaskLanes = 0;
4198   if (!BaseOpcode->Atomic) {
4199     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4200     if (BaseOpcode->Gather4) {
4201       DMaskLanes = 4;
4202     } else if (DMask != 0) {
4203       DMaskLanes = countPopulation(DMask);
4204     } else if (!IsTFE && !BaseOpcode->Store) {
4205       // If dmask is 0, this is a no-op load. This can be eliminated.
4206       B.buildUndef(MI.getOperand(0));
4207       MI.eraseFromParent();
4208       return true;
4209     }
4210   }
4211 
4212   Observer.changingInstr(MI);
4213   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4214 
4215   unsigned NewOpcode = NumDefs == 0 ?
4216     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4217 
4218   // Track that we legalized this
4219   MI.setDesc(B.getTII().get(NewOpcode));
4220 
  // We still expect to get an error flag when TFE is on and dmask is 0, so
  // force dmask to be at least 1; otherwise the instruction will fail.
4223   if (IsTFE && DMask == 0) {
4224     DMask = 0x1;
4225     DMaskLanes = 1;
4226     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4227   }
4228 
4229   if (BaseOpcode->Atomic) {
4230     Register VData0 = MI.getOperand(2).getReg();
4231     LLT Ty = MRI->getType(VData0);
4232 
4233     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4234     if (Ty.isVector())
4235       return false;
4236 
4237     if (BaseOpcode->AtomicX2) {
4238       Register VData1 = MI.getOperand(3).getReg();
4239       // The two values are packed in one register.
4240       LLT PackedTy = LLT::vector(2, Ty);
4241       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4242       MI.getOperand(2).setReg(Concat.getReg(0));
4243       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4244     }
4245   }
4246 
4247   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4248 
  // Optimize _L to _LZ when the constant LOD is zero (or negative).
4250   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4251           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4252     const ConstantFP *ConstantLod;
4253 
4254     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4255                  m_GFCst(ConstantLod))) {
4256       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4257         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4258         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4259             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4260                                                       Intr->Dim);
4261 
4262         // The starting indexes should remain in the same place.
4263         --CorrectedNumVAddrs;
4264 
4265         MI.getOperand(MI.getNumExplicitDefs())
4266             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4267         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4268         Intr = NewImageDimIntr;
4269       }
4270     }
4271   }
4272 
4273   // Optimize _mip away, when 'lod' is zero
4274   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4275     int64_t ConstantLod;
4276     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4277                  m_ICst(ConstantLod))) {
4278       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
4281         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4282         --CorrectedNumVAddrs;
4283       }
4284     }
4285   }
4286 
4287   // Rewrite the addressing register layout before doing anything else.
4288   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both the gradients and the addresses must be 16 bit.
4291     return false;
4292   }
4293 
4294   if (IsA16 && !ST.hasA16()) {
4295     // A16 not supported
4296     return false;
4297   }
4298 
4299   if (IsA16 || IsG16) {
4300     if (Intr->NumVAddrs > 1) {
4301       SmallVector<Register, 4> PackedRegs;
4302 
4303       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4304                                 IsG16);
4305 
4306       // See also below in the non-a16 branch
4307       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4308 
4309       if (!UseNSA && PackedRegs.size() > 1) {
4310         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4311         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4312         PackedRegs[0] = Concat.getReg(0);
4313         PackedRegs.resize(1);
4314       }
4315 
4316       const unsigned NumPacked = PackedRegs.size();
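      // Rewrite the vaddr operands: the first NumPacked operands take the
      // packed registers, and any remaining address operands are cleared.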
4317       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4318         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4319         if (!SrcOp.isReg()) {
4320           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4321           continue;
4322         }
4323 
4324         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4325 
4326         if (I - Intr->VAddrStart < NumPacked)
4327           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4328         else
4329           SrcOp.setReg(AMDGPU::NoRegister);
4330       }
4331     }
4332   } else {
4333     // If the register allocator cannot place the address registers contiguously
4334     // without introducing moves, then using the non-sequential address encoding
4335     // is always preferable, since it saves VALU instructions and is usually a
4336     // wash in terms of code size or even better.
4337     //
4338     // However, we currently have no way of hinting to the register allocator
4339     // that MIMG addresses should be placed contiguously when it is possible to
4340     // do so, so force non-NSA for the common 2-address case as a heuristic.
4341     //
4342     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4343     // allocation when possible.
4344     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4345 
4346     if (!UseNSA && Intr->NumVAddrs > 1)
4347       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4348                                Intr->NumVAddrs);
4349   }
4350 
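  // Record the A16/G16 state in an immediate flag operand: bit 0 = A16,
  // bit 1 = G16.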
4351   int Flags = 0;
4352   if (IsA16)
4353     Flags |= 1;
4354   if (IsG16)
4355     Flags |= 2;
4356   MI.addOperand(MachineOperand::CreateImm(Flags));
4357 
4358   if (BaseOpcode->Store) { // No TFE for stores?
4359     // TODO: Handle dmask trim
4360     Register VData = MI.getOperand(1).getReg();
4361     LLT Ty = MRI->getType(VData);
4362     if (!Ty.isVector() || Ty.getElementType() != S16)
4363       return true;
4364 
4365     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4366     if (RepackedReg != VData) {
4367       MI.getOperand(1).setReg(RepackedReg);
4368     }
4369 
4370     return true;
4371   }
4372 
4373   Register DstReg = MI.getOperand(0).getReg();
4374   LLT Ty = MRI->getType(DstReg);
4375   const LLT EltTy = Ty.getScalarType();
4376   const bool IsD16 = Ty.getScalarType() == S16;
4377   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4378 
4379   // Confirm that the return type is large enough for the dmask specified
4380   if (NumElts < DMaskLanes)
4381     return false;
4382 
4383   if (NumElts > 4 || DMaskLanes > 4)
4384     return false;
4385 
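  // Compute the type that is actually loaded, based on the number of dmask
  // lanes.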
4386   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4387   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4388 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4392   LLT RoundedTy;
4393 
  // S32 vector to cover all data, plus the TFE result element.
4395   LLT TFETy;
4396 
4397   // Register type to use for each loaded component. Will be S32 or V2S16.
4398   LLT RegTy;
4399 
4400   if (IsD16 && ST.hasUnpackedD16VMem()) {
4401     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4402     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4403     RegTy = S32;
4404   } else {
4405     unsigned EltSize = EltTy.getSizeInBits();
4406     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4407     unsigned RoundedSize = 32 * RoundedElts;
4408     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4409     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4410     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4411   }
4412 
4413   // The return type does not need adjustment.
4414   // TODO: Should we change s16 case to s32 or <2 x s16>?
4415   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4416     return true;
4417 
4418   Register Dst1Reg;
4419 
4420   // Insert after the instruction.
4421   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4422 
4423   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4424   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4425   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4426   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4427 
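  // Create a new result register of the rounded/TFE load type; the original
  // result is reassembled from it below.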
4428   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4429 
4430   MI.getOperand(0).setReg(NewResultReg);
4431 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
4434   // register, with one additional dword beyond the loaded data. Rewrite the
4435   // return type to use a single register result.
4436 
4437   if (IsTFE) {
4438     Dst1Reg = MI.getOperand(1).getReg();
4439     if (MRI->getType(Dst1Reg) != S32)
4440       return false;
4441 
4442     // TODO: Make sure the TFE operand bit is set.
4443     MI.RemoveOperand(1);
4444 
4445     // Handle the easy case that requires no repack instructions.
4446     if (Ty == S32) {
4447       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4448       return true;
4449     }
4450   }
4451 
4452   // Now figure out how to copy the new result register back into the old
4453   // result.
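  // Seed the slots with the TFE destination register; for TFE loads the
  // trailing dword of the unmerge below then lands directly in Dst1Reg.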
4454   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4455 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4457 
4458   if (ResultNumRegs == 1) {
4459     assert(!IsTFE);
4460     ResultRegs[0] = NewResultReg;
4461   } else {
4462     // We have to repack into a new vector of some kind.
4463     for (int I = 0; I != NumDataRegs; ++I)
4464       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4465     B.buildUnmerge(ResultRegs, NewResultReg);
4466 
4467     // Drop the final TFE element to get the data part. The TFE result is
4468     // directly written to the right place already.
4469     if (IsTFE)
4470       ResultRegs.resize(NumDataRegs);
4471   }
4472 
4473   // For an s16 scalar result, we form an s32 result with a truncate regardless
4474   // of packed vs. unpacked.
4475   if (IsD16 && !Ty.isVector()) {
4476     B.buildTrunc(DstReg, ResultRegs[0]);
4477     return true;
4478   }
4479 
4480   // Avoid a build/concat_vector of 1 entry.
4481   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4482     B.buildBitcast(DstReg, ResultRegs[0]);
4483     return true;
4484   }
4485 
4486   assert(Ty.isVector());
4487 
4488   if (IsD16) {
4489     // For packed D16 results with TFE enabled, all the data components are
4490     // S32. Cast back to the expected type.
4491     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4494     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4495       for (Register &Reg : ResultRegs)
4496         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4497     } else if (ST.hasUnpackedD16VMem()) {
4498       for (Register &Reg : ResultRegs)
4499         Reg = B.buildTrunc(S16, Reg).getReg(0);
4500     }
4501   }
4502 
4503   auto padWithUndef = [&](LLT Ty, int NumElts) {
4504     if (NumElts == 0)
4505       return;
4506     Register Undef = B.buildUndef(Ty).getReg(0);
4507     for (int I = 0; I != NumElts; ++I)
4508       ResultRegs.push_back(Undef);
4509   };
4510 
4511   // Pad out any elements eliminated due to the dmask.
4512   LLT ResTy = MRI->getType(ResultRegs[0]);
4513   if (!ResTy.isVector()) {
4514     padWithUndef(ResTy, NumElts - ResultRegs.size());
4515     B.buildBuildVector(DstReg, ResultRegs);
4516     return true;
4517   }
4518 
4519   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
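  // Number of 32-bit registers needed to cover the original result type.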
4520   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4521 
4522   // Deal with the one annoying legal case.
4523   const LLT V3S16 = LLT::vector(3, 16);
4524   if (Ty == V3S16) {
4525     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4526     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4527     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4528     return true;
4529   }
4530 
4531   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4532   B.buildConcatVectors(DstReg, ResultRegs);
4533   return true;
4534 }
4535 
4536 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4537   LegalizerHelper &Helper, MachineInstr &MI) const {
4538   MachineIRBuilder &B = Helper.MIRBuilder;
4539   GISelChangeObserver &Observer = Helper.Observer;
4540 
4541   Register Dst = MI.getOperand(0).getReg();
4542   LLT Ty = B.getMRI()->getType(Dst);
4543   unsigned Size = Ty.getSizeInBits();
4544   MachineFunction &MF = B.getMF();
4545 
4546   Observer.changingInstr(MI);
4547 
4548   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4549     Ty = getBitcastRegisterType(Ty);
4550     Helper.bitcastDst(MI, Ty, 0);
4551     Dst = MI.getOperand(0).getReg();
4552     B.setInsertPt(B.getMBB(), MI);
4553   }
4554 
4555   // FIXME: We don't really need this intermediate instruction. The intrinsic
4556   // should be fixed to have a memory operand. Since it's readnone, we're not
4557   // allowed to add one.
4558   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4559   MI.RemoveOperand(1); // Remove intrinsic ID
4560 
4561   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4562   // TODO: Should this use datalayout alignment?
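  // Round the size up to whole bytes; the load is treated as dword aligned.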
4563   const unsigned MemSize = (Size + 7) / 8;
4564   const Align MemAlign(4);
4565   MachineMemOperand *MMO = MF.getMachineMemOperand(
4566       MachinePointerInfo(),
4567       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4568           MachineMemOperand::MOInvariant,
4569       MemSize, MemAlign);
4570   MI.addMemOperand(MF, MMO);
4571 
4572   // There are no 96-bit result scalar loads, but widening to 128-bit should
4573   // always be legal. We may need to restore this to a 96-bit result if it turns
4574   // out this needs to be converted to a vector load during RegBankSelect.
4575   if (!isPowerOf2_32(Size)) {
4576     if (Ty.isVector())
4577       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4578     else
4579       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4580   }
4581 
4582   Observer.changedInstr(MI);
4583   return true;
4584 }
4585 
4586 // TODO: Move to selection
4587 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4588                                                 MachineRegisterInfo &MRI,
4589                                                 MachineIRBuilder &B) const {
4590   if (!ST.isTrapHandlerEnabled() ||
4591       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4592     return legalizeTrapEndpgm(MI, MRI, B);
4593 
4594   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4595     switch (*HsaAbiVer) {
4596     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4597     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4598       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4599     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4600       return ST.supportsGetDoorbellID() ?
4601           legalizeTrapHsa(MI, MRI, B) :
4602           legalizeTrapHsaQueuePtr(MI, MRI, B);
4603     }
4604   }
4605 
4606   llvm_unreachable("Unknown trap handler");
4607 }
4608 
4609 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4610     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4611   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4612   MI.eraseFromParent();
4613   return true;
4614 }
4615 
4616 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4617     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4618   // Pass queue pointer to trap handler as input, and insert trap instruction
4619   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4620   Register LiveIn =
4621     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4622   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4623     return false;
4624 
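  // The trap handler expects the queue pointer in SGPR0_SGPR1.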
4625   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4626   B.buildCopy(SGPR01, LiveIn);
4627   B.buildInstr(AMDGPU::S_TRAP)
4628       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4629       .addReg(SGPR01, RegState::Implicit);
4630 
4631   MI.eraseFromParent();
4632   return true;
4633 }
4634 
4635 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4636     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4637   B.buildInstr(AMDGPU::S_TRAP)
4638       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4639   MI.eraseFromParent();
4640   return true;
4641 }
4642 
4643 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4644     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On a non-HSA path, or when the trap handler is disabled, report a warning
  // instead of emitting a trap.
4647   if (!ST.isTrapHandlerEnabled() ||
4648       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4649     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4650                                      "debugtrap handler not supported",
4651                                      MI.getDebugLoc(), DS_Warning);
4652     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4653     Ctx.diagnose(NoTrap);
4654   } else {
4655     // Insert debug-trap instruction
4656     B.buildInstr(AMDGPU::S_TRAP)
4657         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4658   }
4659 
4660   MI.eraseFromParent();
4661   return true;
4662 }
4663 
4664 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4665                                                MachineIRBuilder &B) const {
4666   MachineRegisterInfo &MRI = *B.getMRI();
4667   const LLT S16 = LLT::scalar(16);
4668   const LLT S32 = LLT::scalar(32);
4669 
4670   Register DstReg = MI.getOperand(0).getReg();
4671   Register NodePtr = MI.getOperand(2).getReg();
4672   Register RayExtent = MI.getOperand(3).getReg();
4673   Register RayOrigin = MI.getOperand(4).getReg();
4674   Register RayDir = MI.getOperand(5).getReg();
4675   Register RayInvDir = MI.getOperand(6).getReg();
4676   Register TDescr = MI.getOperand(7).getReg();
4677 
4678   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
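  // Select the NSA intersect-ray pseudo based on the node pointer width and
  // whether the ray direction components are 16 bit.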
4680   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4681                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4682                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4683                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4684 
4685   SmallVector<Register, 12> Ops;
4686   if (Is64) {
4687     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4688     Ops.push_back(Unmerge.getReg(0));
4689     Ops.push_back(Unmerge.getReg(1));
4690   } else {
4691     Ops.push_back(NodePtr);
4692   }
4693   Ops.push_back(RayExtent);
4694 
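  // Push only the first three 32-bit lanes of the source; the fourth lane is
  // unused.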
4695   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4696     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4697     Ops.push_back(Unmerge.getReg(0));
4698     Ops.push_back(Unmerge.getReg(1));
4699     Ops.push_back(Unmerge.getReg(2));
4700   };
4701 
4702   packLanes(RayOrigin);
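  // With 16-bit ray components, the six direction / inverse-direction values
  // are repacked into three 32-bit registers.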
4703   if (IsA16) {
4704     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4705     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4706     Register R1 = MRI.createGenericVirtualRegister(S32);
4707     Register R2 = MRI.createGenericVirtualRegister(S32);
4708     Register R3 = MRI.createGenericVirtualRegister(S32);
4709     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4710     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4711     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4712     Ops.push_back(R1);
4713     Ops.push_back(R2);
4714     Ops.push_back(R3);
4715   } else {
4716     packLanes(RayDir);
4717     packLanes(RayInvDir);
4718   }
4719 
4720   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4721     .addDef(DstReg)
4722     .addImm(Opcode);
4723 
4724   for (Register R : Ops) {
4725     MIB.addUse(R);
4726   }
4727 
4728   MIB.addUse(TDescr)
4729      .addImm(IsA16 ? 1 : 0)
4730      .cloneMemRefs(MI);
4731 
4732   MI.eraseFromParent();
4733   return true;
4734 }
4735 
4736 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4737                                             MachineInstr &MI) const {
4738   MachineIRBuilder &B = Helper.MIRBuilder;
4739   MachineRegisterInfo &MRI = *B.getMRI();
4740 
  // Replace the G_BRCOND use of these intrinsics with the exec-manipulating
  // branch pseudos.
4742   auto IntrID = MI.getIntrinsicID();
4743   switch (IntrID) {
4744   case Intrinsic::amdgcn_if:
4745   case Intrinsic::amdgcn_else: {
4746     MachineInstr *Br = nullptr;
4747     MachineBasicBlock *UncondBrTarget = nullptr;
4748     bool Negated = false;
4749     if (MachineInstr *BrCond =
4750             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4751       const SIRegisterInfo *TRI
4752         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4753 
4754       Register Def = MI.getOperand(1).getReg();
4755       Register Use = MI.getOperand(3).getReg();
4756 
4757       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4758 
4759       if (Negated)
4760         std::swap(CondBrTarget, UncondBrTarget);
4761 
4762       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4763       if (IntrID == Intrinsic::amdgcn_if) {
4764         B.buildInstr(AMDGPU::SI_IF)
4765           .addDef(Def)
4766           .addUse(Use)
4767           .addMBB(UncondBrTarget);
4768       } else {
4769         B.buildInstr(AMDGPU::SI_ELSE)
4770             .addDef(Def)
4771             .addUse(Use)
4772             .addMBB(UncondBrTarget);
4773       }
4774 
4775       if (Br) {
4776         Br->getOperand(0).setMBB(CondBrTarget);
4777       } else {
4778         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4779         // since we're swapping branch targets it needs to be reinserted.
4780         // FIXME: IRTranslator should probably not do this
4781         B.buildBr(*CondBrTarget);
4782       }
4783 
4784       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4785       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4786       MI.eraseFromParent();
4787       BrCond->eraseFromParent();
4788       return true;
4789     }
4790 
4791     return false;
4792   }
4793   case Intrinsic::amdgcn_loop: {
4794     MachineInstr *Br = nullptr;
4795     MachineBasicBlock *UncondBrTarget = nullptr;
4796     bool Negated = false;
4797     if (MachineInstr *BrCond =
4798             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4799       const SIRegisterInfo *TRI
4800         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4801 
4802       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4803       Register Reg = MI.getOperand(2).getReg();
4804 
4805       if (Negated)
4806         std::swap(CondBrTarget, UncondBrTarget);
4807 
4808       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4809       B.buildInstr(AMDGPU::SI_LOOP)
4810         .addUse(Reg)
4811         .addMBB(UncondBrTarget);
4812 
4813       if (Br)
4814         Br->getOperand(0).setMBB(CondBrTarget);
4815       else
4816         B.buildBr(*CondBrTarget);
4817 
4818       MI.eraseFromParent();
4819       BrCond->eraseFromParent();
4820       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4821       return true;
4822     }
4823 
4824     return false;
4825   }
4826   case Intrinsic::amdgcn_kernarg_segment_ptr:
4827     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4828       // This only makes sense to call in a kernel, so just lower to null.
4829       B.buildConstant(MI.getOperand(0).getReg(), 0);
4830       MI.eraseFromParent();
4831       return true;
4832     }
4833 
4834     return legalizePreloadedArgIntrin(
4835       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4836   case Intrinsic::amdgcn_implicitarg_ptr:
4837     return legalizeImplicitArgPtr(MI, MRI, B);
4838   case Intrinsic::amdgcn_workitem_id_x:
4839     return legalizePreloadedArgIntrin(MI, MRI, B,
4840                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4841   case Intrinsic::amdgcn_workitem_id_y:
4842     return legalizePreloadedArgIntrin(MI, MRI, B,
4843                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4844   case Intrinsic::amdgcn_workitem_id_z:
4845     return legalizePreloadedArgIntrin(MI, MRI, B,
4846                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4847   case Intrinsic::amdgcn_workgroup_id_x:
4848     return legalizePreloadedArgIntrin(MI, MRI, B,
4849                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4850   case Intrinsic::amdgcn_workgroup_id_y:
4851     return legalizePreloadedArgIntrin(MI, MRI, B,
4852                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4853   case Intrinsic::amdgcn_workgroup_id_z:
4854     return legalizePreloadedArgIntrin(MI, MRI, B,
4855                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4856   case Intrinsic::amdgcn_dispatch_ptr:
4857     return legalizePreloadedArgIntrin(MI, MRI, B,
4858                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4859   case Intrinsic::amdgcn_queue_ptr:
4860     return legalizePreloadedArgIntrin(MI, MRI, B,
4861                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4862   case Intrinsic::amdgcn_implicit_buffer_ptr:
4863     return legalizePreloadedArgIntrin(
4864       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4865   case Intrinsic::amdgcn_dispatch_id:
4866     return legalizePreloadedArgIntrin(MI, MRI, B,
4867                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4868   case Intrinsic::amdgcn_fdiv_fast:
4869     return legalizeFDIVFastIntrin(MI, MRI, B);
4870   case Intrinsic::amdgcn_is_shared:
4871     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4872   case Intrinsic::amdgcn_is_private:
4873     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4874   case Intrinsic::amdgcn_wavefrontsize: {
4875     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4876     MI.eraseFromParent();
4877     return true;
4878   }
4879   case Intrinsic::amdgcn_s_buffer_load:
4880     return legalizeSBufferLoad(Helper, MI);
4881   case Intrinsic::amdgcn_raw_buffer_store:
4882   case Intrinsic::amdgcn_struct_buffer_store:
4883     return legalizeBufferStore(MI, MRI, B, false, false);
4884   case Intrinsic::amdgcn_raw_buffer_store_format:
4885   case Intrinsic::amdgcn_struct_buffer_store_format:
4886     return legalizeBufferStore(MI, MRI, B, false, true);
4887   case Intrinsic::amdgcn_raw_tbuffer_store:
4888   case Intrinsic::amdgcn_struct_tbuffer_store:
4889     return legalizeBufferStore(MI, MRI, B, true, true);
4890   case Intrinsic::amdgcn_raw_buffer_load:
4891   case Intrinsic::amdgcn_struct_buffer_load:
4892     return legalizeBufferLoad(MI, MRI, B, false, false);
4893   case Intrinsic::amdgcn_raw_buffer_load_format:
4894   case Intrinsic::amdgcn_struct_buffer_load_format:
4895     return legalizeBufferLoad(MI, MRI, B, true, false);
4896   case Intrinsic::amdgcn_raw_tbuffer_load:
4897   case Intrinsic::amdgcn_struct_tbuffer_load:
4898     return legalizeBufferLoad(MI, MRI, B, true, true);
4899   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4900   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4901   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4902   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4903   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4904   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4905   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4906   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4907   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4908   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4909   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4910   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4911   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4912   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4913   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4914   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4915   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4916   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4917   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4918   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4919   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4920   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4921   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4922   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4923   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4924   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4925   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4926   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4927   case Intrinsic::amdgcn_buffer_atomic_fadd:
4928   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4929   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4930   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4931   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4932     return legalizeBufferAtomic(MI, B, IntrID);
4933   case Intrinsic::amdgcn_atomic_inc:
4934     return legalizeAtomicIncDec(MI, B, true);
4935   case Intrinsic::amdgcn_atomic_dec:
4936     return legalizeAtomicIncDec(MI, B, false);
4937   case Intrinsic::trap:
4938     return legalizeTrapIntrinsic(MI, MRI, B);
4939   case Intrinsic::debugtrap:
4940     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4941   case Intrinsic::amdgcn_rsq_clamp:
4942     return legalizeRsqClampIntrinsic(MI, MRI, B);
4943   case Intrinsic::amdgcn_ds_fadd:
4944   case Intrinsic::amdgcn_ds_fmin:
4945   case Intrinsic::amdgcn_ds_fmax:
4946     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4947   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4948     return legalizeBVHIntrinsic(MI, B);
4949   default: {
4950     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4951             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4952       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4953     return true;
4954   }
4955   }
4956 
4957   return true;
4958 }
4959