1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round up the number of vector elements to the next power of two.
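// e.g. <3 x s32> -> <4 x s32>, <5 x s16> -> <8 x s16>.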
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
// Round up the scalar size in bits to the next power of two.
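// e.g. s24 -> s32, s48 -> s64.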
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
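// Break a vector into pieces of at most 64 bits, e.g. <4 x s32> becomes two
// <2 x s32> pieces and <6 x s16> becomes two <3 x s16> pieces.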
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements until the total size is a multiple
// of 32 bits.
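// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits) and
// <5 x s16> (80 bits) -> <6 x s16> (96 bits).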
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
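  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>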
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
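// e.g. s32, s64, s96, v2s16, v4s16, v2s32 and v4s64 are all register types.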
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
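// Returns the maximum memory access size (in bits) for the address space,
// e.g. LDS accesses are limited to 64 bits unless 128-bit DS operations are
// available, and global loads may be up to 512 bits wide.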
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
323 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
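// For example, a <6 x s16> load is rewritten as a <3 x s32> load whose result
// is bitcast back to <6 x s16>.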
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
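/// For example, a 96-bit load aligned to 128 bits may be widened to a 128-bit
/// load when 96-bit (dwordx3) memory operations are not available.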
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
  : ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541     // Technically the saturating operations require clamp bit support, but this
542     // was introduced at the same time as 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder(
585       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
586       .customFor({S32, S64})
587       .clampScalar(0, S32, S64)
588       .widenScalarToNextPow2(0, 32)
589       .scalarize(0);
590 
591   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
592                    .legalFor({S32})
593                    .maxScalarOrElt(0, S32);
594 
595   if (ST.hasVOP3PInsts()) {
596     Mulh
597       .clampMaxNumElements(0, S8, 2)
598       .lowerFor({V2S8});
599   }
600 
601   Mulh
602     .scalarize(0)
603     .lower();
604 
605   // Report legal for any types we can handle anywhere. For the cases only legal
606   // on the SALU, RegBankSelect will be able to re-legalize.
607   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
608     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
609     .clampScalar(0, S32, S64)
610     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
611     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
612     .widenScalarToNextPow2(0)
613     .scalarize(0);
614 
615   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
616                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
617     .legalFor({{S32, S1}, {S32, S32}})
618     .minScalar(0, S32)
619     // TODO: .scalarize(0)
620     .lower();
621 
622   getActionDefinitionsBuilder(G_BITCAST)
623     // Don't worry about the size constraint.
624     .legalIf(all(isRegisterType(0), isRegisterType(1)))
625     .lower();
626 
627 
628   getActionDefinitionsBuilder(G_CONSTANT)
629     .legalFor({S1, S32, S64, S16, GlobalPtr,
630                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
631     .legalIf(isPointer(0))
632     .clampScalar(0, S32, S64)
633     .widenScalarToNextPow2(0);
634 
635   getActionDefinitionsBuilder(G_FCONSTANT)
636     .legalFor({S32, S64, S16})
637     .clampScalar(0, S16, S64);
638 
639   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
640       .legalIf(isRegisterType(0))
641       // s1 and s16 are special cases because they have legal operations on
642       // them, but don't really occupy registers in the normal way.
643       .legalFor({S1, S16})
644       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
645       .clampScalarOrElt(0, S32, MaxScalar)
646       .widenScalarToNextPow2(0, 32)
647       .clampMaxNumElements(0, S32, 16);
648 
649   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
650 
651   // If the amount is divergent, we have to do a wave reduction to get the
652   // maximum value, so this is expanded during RegBankSelect.
653   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
654     .legalFor({{PrivatePtr, S32}});
655 
656   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
657     .customIf(typeIsNot(0, PrivatePtr));
658 
659   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
660 
661   auto &FPOpActions = getActionDefinitionsBuilder(
662     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
663     .legalFor({S32, S64});
664   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
665     .customFor({S32, S64});
666   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
667     .customFor({S32, S64});
668 
669   if (ST.has16BitInsts()) {
670     if (ST.hasVOP3PInsts())
671       FPOpActions.legalFor({S16, V2S16});
672     else
673       FPOpActions.legalFor({S16});
674 
675     TrigActions.customFor({S16});
676     FDIVActions.customFor({S16});
677   }
678 
679   auto &MinNumMaxNum = getActionDefinitionsBuilder({
680       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
681 
682   if (ST.hasVOP3PInsts()) {
683     MinNumMaxNum.customFor(FPTypesPK16)
684       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
685       .clampMaxNumElements(0, S16, 2)
686       .clampScalar(0, S16, S64)
687       .scalarize(0);
688   } else if (ST.has16BitInsts()) {
689     MinNumMaxNum.customFor(FPTypes16)
690       .clampScalar(0, S16, S64)
691       .scalarize(0);
692   } else {
693     MinNumMaxNum.customFor(FPTypesBase)
694       .clampScalar(0, S32, S64)
695       .scalarize(0);
696   }
697 
698   if (ST.hasVOP3PInsts())
699     FPOpActions.clampMaxNumElements(0, S16, 2);
700 
701   FPOpActions
702     .scalarize(0)
703     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
704 
705   TrigActions
706     .scalarize(0)
707     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
708 
709   FDIVActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   getActionDefinitionsBuilder({G_FNEG, G_FABS})
714     .legalFor(FPTypesPK16)
715     .clampMaxNumElements(0, S16, 2)
716     .scalarize(0)
717     .clampScalar(0, S16, S64);
718 
719   if (ST.has16BitInsts()) {
720     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
721       .legalFor({S32, S64, S16})
722       .scalarize(0)
723       .clampScalar(0, S16, S64);
724   } else {
725     getActionDefinitionsBuilder(G_FSQRT)
726       .legalFor({S32, S64})
727       .scalarize(0)
728       .clampScalar(0, S32, S64);
729 
730     if (ST.hasFractBug()) {
731       getActionDefinitionsBuilder(G_FFLOOR)
732         .customFor({S64})
733         .legalFor({S32, S64})
734         .scalarize(0)
735         .clampScalar(0, S32, S64);
736     } else {
737       getActionDefinitionsBuilder(G_FFLOOR)
738         .legalFor({S32, S64})
739         .scalarize(0)
740         .clampScalar(0, S32, S64);
741     }
742   }
743 
744   getActionDefinitionsBuilder(G_FPTRUNC)
745     .legalFor({{S32, S64}, {S16, S32}})
746     .scalarize(0)
747     .lower();
748 
749   getActionDefinitionsBuilder(G_FPEXT)
750     .legalFor({{S64, S32}, {S32, S16}})
751     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
752     .scalarize(0);
753 
754   getActionDefinitionsBuilder(G_FSUB)
755       // Use actual fsub instruction
756       .legalFor({S32})
757       // Must use fadd + fneg
758       .lowerFor({S64, S16, V2S16})
759       .scalarize(0)
760       .clampScalar(0, S32, S64);
761 
762   // Whether this is legal depends on the floating point mode for the function.
763   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
764   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
765     FMad.customFor({S32, S16});
766   else if (ST.hasMadMacF32Insts())
767     FMad.customFor({S32});
768   else if (ST.hasMadF16())
769     FMad.customFor({S16});
770   FMad.scalarize(0)
771       .lower();
772 
773   auto &FRem = getActionDefinitionsBuilder(G_FREM);
774   if (ST.has16BitInsts()) {
775     FRem.customFor({S16, S32, S64});
776   } else {
777     FRem.minScalar(0, S32)
778         .customFor({S32, S64});
779   }
780   FRem.scalarize(0);
781 
782   // TODO: Do we need to clamp maximum bitwidth?
783   getActionDefinitionsBuilder(G_TRUNC)
784     .legalIf(isScalar(0))
785     .legalFor({{V2S16, V2S32}})
786     .clampMaxNumElements(0, S16, 2)
787     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
788     // situations (like an invalid implicit use), we don't want to infinite loop
789     // in the legalizer.
790     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
791     .alwaysLegal();
792 
793   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
794     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
795                {S32, S1}, {S64, S1}, {S16, S1}})
796     .scalarize(0)
797     .clampScalar(0, S32, S64)
798     .widenScalarToNextPow2(1, 32);
799 
800   // TODO: Split s1->s64 during regbankselect for VALU.
801   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
802     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
803     .lowerFor({{S32, S64}})
804     .lowerIf(typeIs(1, S1))
805     .customFor({{S64, S64}});
806   if (ST.has16BitInsts())
807     IToFP.legalFor({{S16, S16}});
808   IToFP.clampScalar(1, S32, S64)
809        .minScalar(0, S32)
810        .scalarize(0)
811        .widenScalarToNextPow2(1);
812 
813   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
814     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
815     .customFor({{S64, S64}})
816     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
817   if (ST.has16BitInsts())
818     FPToI.legalFor({{S16, S16}});
819   else
820     FPToI.minScalar(1, S32);
821 
822   FPToI.minScalar(0, S32)
823        .widenScalarToNextPow2(0, 32)
824        .scalarize(0)
825        .lower();
826 
827   // Lower roundeven into G_FRINT
828   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
829     .scalarize(0)
830     .lower();
831 
832   if (ST.has16BitInsts()) {
833     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
834       .legalFor({S16, S32, S64})
835       .clampScalar(0, S16, S64)
836       .scalarize(0);
837   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
838     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839       .legalFor({S32, S64})
840       .clampScalar(0, S32, S64)
841       .scalarize(0);
842   } else {
843     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844       .legalFor({S32})
845       .customFor({S64})
846       .clampScalar(0, S32, S64)
847       .scalarize(0);
848   }
849 
850   getActionDefinitionsBuilder(G_PTR_ADD)
851     .legalIf(all(isPointer(0), sameSize(0, 1)))
852     .scalarize(0)
853     .scalarSameSizeAs(1, 0);
854 
855   getActionDefinitionsBuilder(G_PTRMASK)
856     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
857     .scalarSameSizeAs(1, 0)
858     .scalarize(0);
859 
860   auto &CmpBuilder =
861     getActionDefinitionsBuilder(G_ICMP)
862     // The compare output type differs based on the register bank of the output,
863     // so make both s1 and s32 legal.
864     //
865     // Scalar compares producing output in scc will be promoted to s32, as that
866     // is the allocatable register type that will be needed for the copy from
867     // scc. This will be promoted during RegBankSelect, and we assume something
868     // before that won't try to use s32 result types.
869     //
870     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
871     // bank.
872     .legalForCartesianProduct(
873       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
874     .legalForCartesianProduct(
875       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
876   if (ST.has16BitInsts()) {
877     CmpBuilder.legalFor({{S1, S16}});
878   }
879 
880   CmpBuilder
881     .widenScalarToNextPow2(1)
882     .clampScalar(1, S32, S64)
883     .scalarize(0)
884     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
885 
886   getActionDefinitionsBuilder(G_FCMP)
887     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
888     .widenScalarToNextPow2(1)
889     .clampScalar(1, S32, S64)
890     .scalarize(0);
891 
892   // FIXME: fpow has a selection pattern that should move to custom lowering.
893   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
894   if (ST.has16BitInsts())
895     Exp2Ops.legalFor({S32, S16});
896   else
897     Exp2Ops.legalFor({S32});
898   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
899   Exp2Ops.scalarize(0);
900 
901   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
902   if (ST.has16BitInsts())
903     ExpOps.customFor({{S32}, {S16}});
904   else
905     ExpOps.customFor({S32});
906   ExpOps.clampScalar(0, MinScalarFPTy, S32)
907         .scalarize(0);
908 
909   getActionDefinitionsBuilder(G_FPOWI)
910     .clampScalar(0, MinScalarFPTy, S32)
911     .lower();
912 
913   // The 64-bit versions produce 32-bit results, but only on the SALU.
914   getActionDefinitionsBuilder(G_CTPOP)
915     .legalFor({{S32, S32}, {S32, S64}})
916     .clampScalar(0, S32, S32)
917     .clampScalar(1, S32, S64)
918     .scalarize(0)
919     .widenScalarToNextPow2(0, 32)
920     .widenScalarToNextPow2(1, 32);
921 
922   // The hardware instructions return a different result on 0 than the generic
923   // instructions expect. The hardware produces -1, but these produce the
924   // bitwidth.
925   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
926     .scalarize(0)
927     .clampScalar(0, S32, S32)
928     .clampScalar(1, S32, S64)
929     .widenScalarToNextPow2(0, 32)
930     .widenScalarToNextPow2(1, 32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .clampScalar(1, S32, S64)
938     .scalarize(0)
939     .widenScalarToNextPow2(0, 32)
940     .widenScalarToNextPow2(1, 32);
941 
942   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
943   // RegBankSelect.
944   getActionDefinitionsBuilder(G_BITREVERSE)
945     .legalFor({S32, S64})
946     .clampScalar(0, S32, S64)
947     .scalarize(0)
948     .widenScalarToNextPow2(0);
949 
950   if (ST.has16BitInsts()) {
951     getActionDefinitionsBuilder(G_BSWAP)
952       .legalFor({S16, S32, V2S16})
953       .clampMaxNumElements(0, S16, 2)
954       // FIXME: Fixing non-power-of-2 before clamp is workaround for
955       // narrowScalar limitation.
956       .widenScalarToNextPow2(0)
957       .clampScalar(0, S16, S32)
958       .scalarize(0);
959 
960     if (ST.hasVOP3PInsts()) {
961       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
962         .legalFor({S32, S16, V2S16})
963         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
964         .clampMaxNumElements(0, S16, 2)
965         .minScalar(0, S16)
966         .widenScalarToNextPow2(0)
967         .scalarize(0)
968         .lower();
969     } else {
970       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
971         .legalFor({S32, S16})
972         .widenScalarToNextPow2(0)
973         .minScalar(0, S16)
974         .scalarize(0)
975         .lower();
976     }
977   } else {
978     // TODO: Should have same legality without v_perm_b32
979     getActionDefinitionsBuilder(G_BSWAP)
980       .legalFor({S32})
981       .lowerIf(scalarNarrowerThan(0, 32))
982       // FIXME: Fixing non-power-of-2 before clamp is workaround for
983       // narrowScalar limitation.
984       .widenScalarToNextPow2(0)
985       .maxScalar(0, S32)
986       .scalarize(0)
987       .lower();
988 
989     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
990       .legalFor({S32})
991       .minScalar(0, S32)
992       .widenScalarToNextPow2(0)
993       .scalarize(0)
994       .lower();
995   }
996 
997   getActionDefinitionsBuilder(G_INTTOPTR)
998     // List the common cases
999     .legalForCartesianProduct(AddrSpaces64, {S64})
1000     .legalForCartesianProduct(AddrSpaces32, {S32})
1001     .scalarize(0)
1002     // Accept any address space as long as the size matches
1003     .legalIf(sameSize(0, 1))
1004     .widenScalarIf(smallerThan(1, 0),
1005       [](const LegalityQuery &Query) {
1006         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1007       })
1008     .narrowScalarIf(largerThan(1, 0),
1009       [](const LegalityQuery &Query) {
1010         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1011       });
1012 
1013   getActionDefinitionsBuilder(G_PTRTOINT)
1014     // List the common cases
1015     .legalForCartesianProduct(AddrSpaces64, {S64})
1016     .legalForCartesianProduct(AddrSpaces32, {S32})
1017     .scalarize(0)
1018     // Accept any address space as long as the size matches
1019     .legalIf(sameSize(0, 1))
1020     .widenScalarIf(smallerThan(0, 1),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1023       })
1024     .narrowScalarIf(
1025       largerThan(0, 1),
1026       [](const LegalityQuery &Query) {
1027         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1028       });
1029 
1030   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1031     .scalarize(0)
1032     .custom();
1033 
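  // Returns true if a load/store must be split into multiple memory
  // operations: the access exceeds the maximum size for its address space,
  // requires an unsupported number of 32-bit registers, or is insufficiently
  // aligned.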
1034   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1035                                     bool IsLoad) -> bool {
1036     const LLT DstTy = Query.Types[0];
1037 
1038     // Split vector extloads.
1039     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1040     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1041 
1042     if (MemSize < DstTy.getSizeInBits())
1043       MemSize = std::max(MemSize, AlignBits);
1044 
1045     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1046       return true;
1047 
1048     const LLT PtrTy = Query.Types[1];
1049     unsigned AS = PtrTy.getAddressSpace();
1050     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1051       return true;
1052 
1053     // Catch weird sized loads that don't evenly divide into the access sizes
1054     // TODO: May be able to widen depending on alignment etc.
1055     unsigned NumRegs = (MemSize + 31) / 32;
1056     if (NumRegs == 3) {
1057       if (!ST.hasDwordx3LoadStores())
1058         return true;
1059     } else {
1060       // If the alignment allows, these should have been widened.
1061       if (!isPowerOf2_32(NumRegs))
1062         return true;
1063     }
1064 
1065     if (AlignBits < MemSize) {
1066       const SITargetLowering *TLI = ST.getTargetLowering();
1067       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1068                                                       Align(AlignBits / 8));
1069     }
1070 
1071     return false;
1072   };
1073 
1074   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1075   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1076   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
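  // A required alignment of 0 in the explicit rules below means the access is
  // legal at any alignment.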
1077 
1078   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1079   // LDS
1080   // TODO: Unsupported flat for SI.
1081 
1082   for (unsigned Op : {G_LOAD, G_STORE}) {
1083     const bool IsStore = Op == G_STORE;
1084 
1085     auto &Actions = getActionDefinitionsBuilder(Op);
1086     // Explicitly list some common cases.
1087     // TODO: Does this help compile time at all?
1088     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1089                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1090                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1091                                       {S64, GlobalPtr, 64, GlobalAlign32},
1092                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1093                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1094                                       {S32, GlobalPtr, 8, GlobalAlign8},
1095                                       {S32, GlobalPtr, 16, GlobalAlign16},
1096 
1097                                       {S32, LocalPtr, 32, 32},
1098                                       {S64, LocalPtr, 64, 32},
1099                                       {V2S32, LocalPtr, 64, 32},
1100                                       {S32, LocalPtr, 8, 8},
1101                                       {S32, LocalPtr, 16, 16},
1102                                       {V2S16, LocalPtr, 32, 32},
1103 
1104                                       {S32, PrivatePtr, 32, 32},
1105                                       {S32, PrivatePtr, 8, 8},
1106                                       {S32, PrivatePtr, 16, 16},
1107                                       {V2S16, PrivatePtr, 32, 32},
1108 
1109                                       {S32, ConstantPtr, 32, GlobalAlign32},
1110                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1111                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1112                                       {S64, ConstantPtr, 64, GlobalAlign32},
1113                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1114     Actions.legalIf(
1115       [=](const LegalityQuery &Query) -> bool {
1116         return isLoadStoreLegal(ST, Query, Op);
1117       });
1118 
1119     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1120     // 64-bits.
1121     //
1122     // TODO: Should generalize bitcast action into coerce, which will also cover
1123     // inserting addrspacecasts.
1124     Actions.customIf(typeIs(1, Constant32Ptr));
1125 
1126     // Turn any illegal element vectors into something easier to deal
1127     // with. These will ultimately produce 32-bit scalar shifts to extract the
1128     // parts anyway.
1129     //
1130     // For odd 16-bit element vectors, prefer to split those into pieces with
1131     // 16-bit vector parts.
1132     Actions.bitcastIf(
1133       [=](const LegalityQuery &Query) -> bool {
1134         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1135                                           Query.MMODescrs[0].SizeInBits);
1136       }, bitcastToRegisterType(0));
1137 
1138     if (!IsStore) {
1139       // Widen suitably aligned loads by loading extra bytes. The standard
1140       // legalization actions can't properly express widening memory operands.
1141       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1142         return shouldWidenLoad(ST, Query, G_LOAD);
1143       });
1144     }
1145 
1146     // FIXME: load/store narrowing should be moved to lower action
1147     Actions
1148         .narrowScalarIf(
1149             [=](const LegalityQuery &Query) -> bool {
1150               return !Query.Types[0].isVector() &&
1151                      needToSplitMemOp(Query, Op == G_LOAD);
1152             },
1153             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1154               const LLT DstTy = Query.Types[0];
1155               const LLT PtrTy = Query.Types[1];
1156 
1157               const unsigned DstSize = DstTy.getSizeInBits();
1158               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1159 
1160               // Split extloads.
1161               if (DstSize > MemSize)
1162                 return std::make_pair(0, LLT::scalar(MemSize));
1163 
1164               if (!isPowerOf2_32(DstSize)) {
1165                 // We're probably decomposing an odd sized store. Try to split
1166                 // to the widest type. TODO: Account for alignment. As-is it
1167                 // should be OK, since the new parts will be further legalized.
1168                 unsigned FloorSize = PowerOf2Floor(DstSize);
1169                 return std::make_pair(0, LLT::scalar(FloorSize));
1170               }
1171 
1172               if (DstSize > 32 && (DstSize % 32 != 0)) {
1173                 // FIXME: Need a way to specify non-extload of larger size if
1174                 // suitably aligned.
1175                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1176               }
1177 
1178               unsigned MaxSize = maxSizeForAddrSpace(ST,
1179                                                      PtrTy.getAddressSpace(),
1180                                                      Op == G_LOAD);
1181               if (MemSize > MaxSize)
1182                 return std::make_pair(0, LLT::scalar(MaxSize));
1183 
1184               unsigned Align = Query.MMODescrs[0].AlignInBits;
1185               return std::make_pair(0, LLT::scalar(Align));
1186             })
1187         .fewerElementsIf(
1188             [=](const LegalityQuery &Query) -> bool {
1189               return Query.Types[0].isVector() &&
1190                      needToSplitMemOp(Query, Op == G_LOAD);
1191             },
1192             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1193               const LLT DstTy = Query.Types[0];
1194               const LLT PtrTy = Query.Types[1];
1195 
1196               LLT EltTy = DstTy.getElementType();
1197               unsigned MaxSize = maxSizeForAddrSpace(ST,
1198                                                      PtrTy.getAddressSpace(),
1199                                                      Op == G_LOAD);
1200 
1201               // FIXME: Handle widened to power of 2 results better. This ends
1202               // up scalarizing.
1203               // FIXME: 3 element stores scalarized on SI
1204 
1205               // Split if it's too large for the address space.
1206               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1207                 unsigned NumElts = DstTy.getNumElements();
1208                 unsigned EltSize = EltTy.getSizeInBits();
1209 
1210                 if (MaxSize % EltSize == 0) {
1211                   return std::make_pair(
1212                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1213                 }
1214 
1215                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1216 
1217                 // FIXME: Refine when odd breakdowns handled
1218                 // The scalars will need to be re-legalized.
1219                 if (NumPieces == 1 || NumPieces >= NumElts ||
1220                     NumElts % NumPieces != 0)
1221                   return std::make_pair(0, EltTy);
1222 
1223                 return std::make_pair(0,
1224                                       LLT::vector(NumElts / NumPieces, EltTy));
1225               }
1226 
1227               // FIXME: We could probably handle weird extending loads better.
1228               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1229               if (DstTy.getSizeInBits() > MemSize)
1230                 return std::make_pair(0, EltTy);
1231 
1232               unsigned EltSize = EltTy.getSizeInBits();
1233               unsigned DstSize = DstTy.getSizeInBits();
1234               if (!isPowerOf2_32(DstSize)) {
1235                 // We're probably decomposing an odd sized store. Try to split
1236                 // to the widest type. TODO: Account for alignment. As-is it
1237                 // should be OK, since the new parts will be further legalized.
1238                 unsigned FloorSize = PowerOf2Floor(DstSize);
1239                 return std::make_pair(
1240                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1241               }
1242 
1243               // Need to split because of alignment.
1244               unsigned Align = Query.MMODescrs[0].AlignInBits;
1245               if (EltSize > Align &&
1246                   (EltSize / Align < DstTy.getNumElements())) {
1247                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1248               }
1249 
1250               // May need relegalization for the scalars.
1251               return std::make_pair(0, EltTy);
1252             })
1253     .lowerIfMemSizeNotPow2()
1254     .minScalar(0, S32);
1255 
1256     if (IsStore)
1257       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1258 
1259     Actions
1260         .widenScalarToNextPow2(0)
1261         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1262         .lower();
1263   }
1264 
1265   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1266                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1267                                                   {S32, GlobalPtr, 16, 2 * 8},
1268                                                   {S32, LocalPtr, 8, 8},
1269                                                   {S32, LocalPtr, 16, 16},
1270                                                   {S32, PrivatePtr, 8, 8},
1271                                                   {S32, PrivatePtr, 16, 16},
1272                                                   {S32, ConstantPtr, 8, 8},
1273                                                   {S32, ConstantPtr, 16, 2 * 8}});
1274   if (ST.hasFlatAddressSpace()) {
1275     ExtLoads.legalForTypesWithMemDesc(
1276         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1277   }
1278 
1279   ExtLoads.clampScalar(0, S32, S32)
1280           .widenScalarToNextPow2(0)
1281           .unsupportedIfMemSizeNotPow2()
1282           .lower();
1283 
1284   auto &Atomics = getActionDefinitionsBuilder(
1285     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1286      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1287      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1288      G_ATOMICRMW_UMIN})
1289     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1290                {S64, GlobalPtr}, {S64, LocalPtr},
1291                {S32, RegionPtr}, {S64, RegionPtr}});
1292   if (ST.hasFlatAddressSpace()) {
1293     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1294   }
1295 
1296   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1297   if (ST.hasLDSFPAtomics()) {
1298     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1299     if (ST.hasGFX90AInsts())
1300       Atomic.legalFor({{S64, LocalPtr}});
1301   }
1302   if (ST.hasAtomicFaddInsts())
1303     Atomic.legalFor({{S32, GlobalPtr}});
1304 
1305   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1306   // demarshalling
1307   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1308     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1309                 {S32, FlatPtr}, {S64, FlatPtr}})
1310     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1311                {S32, RegionPtr}, {S64, RegionPtr}});
1312   // TODO: Pointer types, any 32-bit or 64-bit vector
1313 
1314   // Condition should be s32 for scalar, s1 for vector.
1315   getActionDefinitionsBuilder(G_SELECT)
1316     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1317           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1318           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1319     .clampScalar(0, S16, S64)
1320     .scalarize(1)
1321     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1322     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1323     .clampMaxNumElements(0, S32, 2)
1324     .clampMaxNumElements(0, LocalPtr, 2)
1325     .clampMaxNumElements(0, PrivatePtr, 2)
1326     .scalarize(0)
1327     .widenScalarToNextPow2(0)
1328     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1329 
1330   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1331   // be more flexible with the shift amount type.
1332   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1333     .legalFor({{S32, S32}, {S64, S32}});
1334   if (ST.has16BitInsts()) {
1335     if (ST.hasVOP3PInsts()) {
1336       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1337             .clampMaxNumElements(0, S16, 2);
1338     } else
1339       Shifts.legalFor({{S16, S16}});
1340 
1341     // TODO: Support 16-bit shift amounts for all types
1342     Shifts.widenScalarIf(
1343       [=](const LegalityQuery &Query) {
1344         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1345         // 32-bit amount.
1346         const LLT ValTy = Query.Types[0];
1347         const LLT AmountTy = Query.Types[1];
1348         return ValTy.getSizeInBits() <= 16 &&
1349                AmountTy.getSizeInBits() < 16;
1350       }, changeTo(1, S16));
1351     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1352     Shifts.clampScalar(1, S32, S32);
1353     Shifts.clampScalar(0, S16, S64);
1354     Shifts.widenScalarToNextPow2(0, 16);
1355 
1356     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1357       .minScalar(0, S16)
1358       .scalarize(0)
1359       .lower();
1360   } else {
1361     // Make sure we legalize the shift amount type first, as the general
1362     // expansion for the shifted type will produce much worse code if it hasn't
1363     // been truncated already.
1364     Shifts.clampScalar(1, S32, S32);
1365     Shifts.clampScalar(0, S32, S64);
1366     Shifts.widenScalarToNextPow2(0, 32);
1367 
1368     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1369       .minScalar(0, S32)
1370       .scalarize(0)
1371       .lower();
1372   }
1373   Shifts.scalarize(0);
1374 
1375   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1376     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1377     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1378     unsigned IdxTypeIdx = 2;
1379 
1380     getActionDefinitionsBuilder(Op)
1381       .customIf([=](const LegalityQuery &Query) {
1382           const LLT EltTy = Query.Types[EltTypeIdx];
1383           const LLT VecTy = Query.Types[VecTypeIdx];
1384           const LLT IdxTy = Query.Types[IdxTypeIdx];
1385           const unsigned EltSize = EltTy.getSizeInBits();
1386           return (EltSize == 32 || EltSize == 64) &&
1387                   VecTy.getSizeInBits() % 32 == 0 &&
1388                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1389                   IdxTy.getSizeInBits() == 32;
1390         })
1391       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1392                  bitcastToVectorElement32(VecTypeIdx))
1393       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1394       .bitcastIf(
1395         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1396         [=](const LegalityQuery &Query) {
1397           // For > 64-bit element types, try to turn this into a 64-bit
1398           // element vector since we may be able to do better indexing
1399           // if this is scalar. If not, fall back to 32.
1400           const LLT EltTy = Query.Types[EltTypeIdx];
1401           const LLT VecTy = Query.Types[VecTypeIdx];
1402           const unsigned DstEltSize = EltTy.getSizeInBits();
1403           const unsigned VecSize = VecTy.getSizeInBits();
1404 
1405           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1406           return std::make_pair(
1407             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1408         })
1409       .clampScalar(EltTypeIdx, S32, S64)
1410       .clampScalar(VecTypeIdx, S32, S64)
1411       .clampScalar(IdxTypeIdx, S32, S32)
1412       .clampMaxNumElements(VecTypeIdx, S32, 32)
1413       // TODO: Clamp elements for 64-bit vectors?
1414       // It should only be necessary with variable indexes.
1415       // As a last resort, lower to the stack
1416       .lower();
1417   }
1418 
1419   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1420     .unsupportedIf([=](const LegalityQuery &Query) {
1421         const LLT &EltTy = Query.Types[1].getElementType();
1422         return Query.Types[0] != EltTy;
1423       });
1424 
1425   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1426     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1427     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1428 
1429     // FIXME: Doesn't handle extract of illegal sizes.
1430     getActionDefinitionsBuilder(Op)
1431       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1432       // FIXME: Multiples of 16 should not be legal.
1433       .legalIf([=](const LegalityQuery &Query) {
1434           const LLT BigTy = Query.Types[BigTyIdx];
1435           const LLT LitTy = Query.Types[LitTyIdx];
1436           return (BigTy.getSizeInBits() % 32 == 0) &&
1437                  (LitTy.getSizeInBits() % 16 == 0);
1438         })
1439       .widenScalarIf(
1440         [=](const LegalityQuery &Query) {
1441           const LLT BigTy = Query.Types[BigTyIdx];
1442           return (BigTy.getScalarSizeInBits() < 16);
1443         },
1444         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1445       .widenScalarIf(
1446         [=](const LegalityQuery &Query) {
1447           const LLT LitTy = Query.Types[LitTyIdx];
1448           return (LitTy.getScalarSizeInBits() < 16);
1449         },
1450         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1451       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1452       .widenScalarToNextPow2(BigTyIdx, 32);
1453 
1454   }
1455 
1456   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1457     .legalForCartesianProduct(AllS32Vectors, {S32})
1458     .legalForCartesianProduct(AllS64Vectors, {S64})
1459     .clampNumElements(0, V16S32, V32S32)
1460     .clampNumElements(0, V2S64, V16S64)
1461     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1462 
1463   if (ST.hasScalarPackInsts()) {
1464     BuildVector
1465       // FIXME: Should probably widen s1 vectors straight to s32
1466       .minScalarOrElt(0, S16)
1467       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1468       .minScalar(1, S32);
1469 
1470     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1471       .legalFor({V2S16, S32})
1472       .lower();
1473     BuildVector.minScalarOrElt(0, S32);
1474   } else {
1475     BuildVector.customFor({V2S16, S16});
1476     BuildVector.minScalarOrElt(0, S32);
1477 
1478     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1479       .customFor({V2S16, S32})
1480       .lower();
1481   }
1482 
1483   BuildVector.legalIf(isRegisterType(0));
1484 
1485   // FIXME: Clamp maximum size
1486   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1487     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1488     .clampMaxNumElements(0, S32, 32)
1489     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1490     .clampMaxNumElements(0, S16, 64);
1491 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1494   if (ST.hasVOP3PInsts()) {
1495     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1496       .customFor({V2S16, V2S16})
1497       .lower();
1498   } else
1499     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1500 
1501   // Merge/Unmerge
1502   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1503     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1504     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1505 
1506     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1507       const LLT Ty = Query.Types[TypeIdx];
1508       if (Ty.isVector()) {
1509         const LLT &EltTy = Ty.getElementType();
1510         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1511           return true;
1512         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1513           return true;
1514       }
1515       return false;
1516     };
1517 
1518     auto &Builder = getActionDefinitionsBuilder(Op)
1519       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1520       .lowerFor({{S16, V2S16}})
1521       .lowerIf([=](const LegalityQuery &Query) {
1522           const LLT BigTy = Query.Types[BigTyIdx];
1523           return BigTy.getSizeInBits() == 32;
1524         })
1525       // Try to widen to s16 first for small types.
1526       // TODO: Only do this on targets with legal s16 shifts
1527       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1528       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1529       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1530       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1531                            elementTypeIs(1, S16)),
1532                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1536       .clampScalar(LitTyIdx, S32, S512)
1537       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1538       // Break up vectors with weird elements into scalars
1539       .fewerElementsIf(
1540         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1541         scalarize(0))
1542       .fewerElementsIf(
1543         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1544         scalarize(1))
1545       .clampScalar(BigTyIdx, S32, MaxScalar);
1546 
1547     if (Op == G_MERGE_VALUES) {
1548       Builder.widenScalarIf(
1549         // TODO: Use 16-bit shifts if legal for 8-bit values?
1550         [=](const LegalityQuery &Query) {
1551           const LLT Ty = Query.Types[LitTyIdx];
1552           return Ty.getSizeInBits() < 32;
1553         },
1554         changeTo(LitTyIdx, S32));
1555     }
1556 
1557     Builder.widenScalarIf(
1558       [=](const LegalityQuery &Query) {
1559         const LLT Ty = Query.Types[BigTyIdx];
1560         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1561           Ty.getSizeInBits() % 16 != 0;
1562       },
1563       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1566         const LLT &Ty = Query.Types[BigTyIdx];
1567         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1568         if (NewSizeInBits >= 256) {
1569           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1570           if (RoundedTo < NewSizeInBits)
1571             NewSizeInBits = RoundedTo;
1572         }
1573         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1574       })
1575       // Any vectors left are the wrong size. Scalarize them.
1576       .scalarize(0)
1577       .scalarize(1);
1578   }
1579 
1580   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1581   // RegBankSelect.
1582   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1583     .legalFor({{S32}, {S64}});
1584 
1585   if (ST.hasVOP3PInsts()) {
1586     SextInReg.lowerFor({{V2S16}})
1587       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1588       // get more vector shift opportunities, since we'll get those when
1589       // expanded.
1590       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1591   } else if (ST.has16BitInsts()) {
1592     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1593   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1596     SextInReg.lowerFor({{S32}, {S64}});
1597   }
1598 
1599   SextInReg
1600     .scalarize(0)
1601     .clampScalar(0, S32, S64)
1602     .lower();
1603 
  // TODO: Only try to form v2s16 with legal packed instructions.
1605   getActionDefinitionsBuilder(G_FSHR)
1606     .legalFor({{S32, S32}})
1607     .lowerFor({{V2S16, V2S16}})
1608     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1609     .scalarize(0)
1610     .lower();
1611 
1612   if (ST.hasVOP3PInsts()) {
1613     getActionDefinitionsBuilder(G_FSHL)
1614       .lowerFor({{V2S16, V2S16}})
1615       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1616       .scalarize(0)
1617       .lower();
1618   } else {
1619     getActionDefinitionsBuilder(G_FSHL)
1620       .scalarize(0)
1621       .lower();
1622   }
1623 
1624   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1625     .legalFor({S64});
1626 
1627   getActionDefinitionsBuilder(G_FENCE)
1628     .alwaysLegal();
1629 
1630   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1631       .scalarize(0)
1632       .minScalar(0, S32)
1633       .lower();
1634 
1635   getActionDefinitionsBuilder({
1636       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1637       G_FCOPYSIGN,
1638 
1639       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1640       G_ATOMICRMW_NAND,
1641       G_ATOMICRMW_FSUB,
1642       G_READ_REGISTER,
1643       G_WRITE_REGISTER,
1644 
1645       G_SADDO, G_SSUBO,
1646 
      // TODO: Implement
1648       G_FMINIMUM, G_FMAXIMUM}).lower();
1649 
1650   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1651         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1652         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1653     .unsupported();
1654 
1655   computeTables();
1656   verify(*ST.getInstrInfo());
1657 }
1658 
1659 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1660                                          MachineInstr &MI) const {
1661   MachineIRBuilder &B = Helper.MIRBuilder;
1662   MachineRegisterInfo &MRI = *B.getMRI();
1663 
1664   switch (MI.getOpcode()) {
1665   case TargetOpcode::G_ADDRSPACE_CAST:
1666     return legalizeAddrSpaceCast(MI, MRI, B);
1667   case TargetOpcode::G_FRINT:
1668     return legalizeFrint(MI, MRI, B);
1669   case TargetOpcode::G_FCEIL:
1670     return legalizeFceil(MI, MRI, B);
1671   case TargetOpcode::G_FREM:
1672     return legalizeFrem(MI, MRI, B);
1673   case TargetOpcode::G_INTRINSIC_TRUNC:
1674     return legalizeIntrinsicTrunc(MI, MRI, B);
1675   case TargetOpcode::G_SITOFP:
1676     return legalizeITOFP(MI, MRI, B, true);
1677   case TargetOpcode::G_UITOFP:
1678     return legalizeITOFP(MI, MRI, B, false);
1679   case TargetOpcode::G_FPTOSI:
1680     return legalizeFPTOI(MI, MRI, B, true);
1681   case TargetOpcode::G_FPTOUI:
1682     return legalizeFPTOI(MI, MRI, B, false);
1683   case TargetOpcode::G_FMINNUM:
1684   case TargetOpcode::G_FMAXNUM:
1685   case TargetOpcode::G_FMINNUM_IEEE:
1686   case TargetOpcode::G_FMAXNUM_IEEE:
1687     return legalizeMinNumMaxNum(Helper, MI);
1688   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1689     return legalizeExtractVectorElt(MI, MRI, B);
1690   case TargetOpcode::G_INSERT_VECTOR_ELT:
1691     return legalizeInsertVectorElt(MI, MRI, B);
1692   case TargetOpcode::G_SHUFFLE_VECTOR:
1693     return legalizeShuffleVector(MI, MRI, B);
1694   case TargetOpcode::G_FSIN:
1695   case TargetOpcode::G_FCOS:
1696     return legalizeSinCos(MI, MRI, B);
1697   case TargetOpcode::G_GLOBAL_VALUE:
1698     return legalizeGlobalValue(MI, MRI, B);
1699   case TargetOpcode::G_LOAD:
1700     return legalizeLoad(Helper, MI);
1701   case TargetOpcode::G_FMAD:
1702     return legalizeFMad(MI, MRI, B);
1703   case TargetOpcode::G_FDIV:
1704     return legalizeFDIV(MI, MRI, B);
1705   case TargetOpcode::G_UDIV:
1706   case TargetOpcode::G_UREM:
1707   case TargetOpcode::G_UDIVREM:
1708     return legalizeUnsignedDIV_REM(MI, MRI, B);
1709   case TargetOpcode::G_SDIV:
1710   case TargetOpcode::G_SREM:
1711   case TargetOpcode::G_SDIVREM:
1712     return legalizeSignedDIV_REM(MI, MRI, B);
1713   case TargetOpcode::G_ATOMIC_CMPXCHG:
1714     return legalizeAtomicCmpXChg(MI, MRI, B);
1715   case TargetOpcode::G_FLOG:
1716     return legalizeFlog(MI, B, numbers::ln2f);
1717   case TargetOpcode::G_FLOG10:
1718     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1719   case TargetOpcode::G_FEXP:
1720     return legalizeFExp(MI, B);
1721   case TargetOpcode::G_FPOW:
1722     return legalizeFPow(MI, B);
1723   case TargetOpcode::G_FFLOOR:
1724     return legalizeFFloor(MI, MRI, B);
1725   case TargetOpcode::G_BUILD_VECTOR:
1726     return legalizeBuildVector(MI, MRI, B);
1727   default:
1728     return false;
1729   }
1730 
1731   llvm_unreachable("expected switch to return");
1732 }
1733 
1734 Register AMDGPULegalizerInfo::getSegmentAperture(
1735   unsigned AS,
1736   MachineRegisterInfo &MRI,
1737   MachineIRBuilder &B) const {
1738   MachineFunction &MF = B.getMF();
1739   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1740   const LLT S32 = LLT::scalar(32);
1741 
1742   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1743 
1744   if (ST.hasApertureRegs()) {
1745     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1746     // getreg.
1747     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1749         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1750     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1751         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1752         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1753     unsigned Encoding =
1754         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1755         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1756         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1757 
1758     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1759 
1760     B.buildInstr(AMDGPU::S_GETREG_B32)
1761       .addDef(GetReg)
1762       .addImm(Encoding);
1763     MRI.setType(GetReg, S32);
1764 
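    // The aperture base is returned in the low bits of the hwreg field, so
    // shift it left by the field width (WidthM1 + 1) to form the high 32 bits
    // of the 64-bit segment aperture address.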
1765     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1766     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1767   }
1768 
1769   Register QueuePtr = MRI.createGenericVirtualRegister(
1770     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1771 
1772   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1773     return Register();
1774 
1775   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1776   // private_segment_aperture_base_hi.
1777   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1778 
1779   // TODO: can we be smarter about machine pointer info?
1780   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1781   MachineMemOperand *MMO = MF.getMachineMemOperand(
1782       PtrInfo,
1783       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1784           MachineMemOperand::MOInvariant,
1785       4, commonAlignment(Align(64), StructOffset));
1786 
1787   Register LoadAddr;
1788 
1789   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1790   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1791 }
1792 
1793 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1794   MachineInstr &MI, MachineRegisterInfo &MRI,
1795   MachineIRBuilder &B) const {
1796   MachineFunction &MF = B.getMF();
1797 
1798   const LLT S32 = LLT::scalar(32);
1799   Register Dst = MI.getOperand(0).getReg();
1800   Register Src = MI.getOperand(1).getReg();
1801 
1802   LLT DstTy = MRI.getType(Dst);
1803   LLT SrcTy = MRI.getType(Src);
1804   unsigned DestAS = DstTy.getAddressSpace();
1805   unsigned SrcAS = SrcTy.getAddressSpace();
1806 
1807   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1808   // vector element.
1809   assert(!DstTy.isVector());
1810 
1811   const AMDGPUTargetMachine &TM
1812     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1813 
1814   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1815     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1816     return true;
1817   }
1818 
1819   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1820     // Truncate.
1821     B.buildExtract(Dst, Src, 0);
1822     MI.eraseFromParent();
1823     return true;
1824   }
1825 
1826   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1827     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1828     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1829 
    // FIXME: This is a bit ugly due to creating a merge of two pointers into a
    // pointer in another address space. Merge operands are required to be the
    // same type, but creating an extra ptrtoint would be kind of pointless.
1833     auto HighAddr = B.buildConstant(
1834       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1835     B.buildMerge(Dst, {Src, HighAddr});
1836     MI.eraseFromParent();
1837     return true;
1838   }
1839 
1840   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1841     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1842            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1843     unsigned NullVal = TM.getNullPointerValue(DestAS);
1844 
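    // Casting flat to local/private: the segment pointer is the low 32 bits of
    // the flat pointer, except that a flat null must map to the segment's own
    // null value.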
1845     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1846     auto FlatNull = B.buildConstant(SrcTy, 0);
1847 
1848     // Extract low 32-bits of the pointer.
1849     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1850 
1851     auto CmpRes =
1852         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1853     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1854 
1855     MI.eraseFromParent();
1856     return true;
1857   }
1858 
1859   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1860     return false;
1861 
1862   if (!ST.hasFlatAddressSpace())
1863     return false;
1864 
1865   auto SegmentNull =
1866       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1867   auto FlatNull =
1868       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1869 
1870   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1871   if (!ApertureReg.isValid())
1872     return false;
1873 
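  // Casting local/private to flat: build the 64-bit flat pointer from the
  // 32-bit segment offset (low half) and the aperture base (high half), with
  // the segment null value mapping to the flat null value.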
1874   auto CmpRes =
1875       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1876 
1877   // Coerce the type of the low half of the result so we can use merge_values.
1878   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1879 
1880   // TODO: Should we allow mismatched types but matching sizes in merges to
1881   // avoid the ptrtoint?
1882   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1883   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1884 
1885   MI.eraseFromParent();
1886   return true;
1887 }
1888 
1889 bool AMDGPULegalizerInfo::legalizeFrint(
1890   MachineInstr &MI, MachineRegisterInfo &MRI,
1891   MachineIRBuilder &B) const {
1892   Register Src = MI.getOperand(1).getReg();
1893   LLT Ty = MRI.getType(Src);
1894   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1895 
1896   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1897   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1898 
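  // The classic round-to-integer trick: adding and then subtracting
  // copysign(2^52, x) rounds x to an integer in the current rounding mode.
  // Values with magnitude of at least 2^52 are already integral, so the final
  // select passes them through unchanged.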
1899   auto C1 = B.buildFConstant(Ty, C1Val);
1900   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1901 
1902   // TODO: Should this propagate fast-math-flags?
1903   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1904   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1905 
1906   auto C2 = B.buildFConstant(Ty, C2Val);
1907   auto Fabs = B.buildFAbs(Ty, Src);
1908 
1909   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1910   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1911   MI.eraseFromParent();
1912   return true;
1913 }
1914 
1915 bool AMDGPULegalizerInfo::legalizeFceil(
1916   MachineInstr &MI, MachineRegisterInfo &MRI,
1917   MachineIRBuilder &B) const {
1918 
1919   const LLT S1 = LLT::scalar(1);
1920   const LLT S64 = LLT::scalar(64);
1921 
1922   Register Src = MI.getOperand(1).getReg();
1923   assert(MRI.getType(Src) == S64);
1924 
1925   // result = trunc(src)
1926   // if (src > 0.0 && src != result)
1927   //   result += 1.0
1928 
1929   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1930 
1931   const auto Zero = B.buildFConstant(S64, 0.0);
1932   const auto One = B.buildFConstant(S64, 1.0);
1933   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1934   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1935   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1936   auto Add = B.buildSelect(S64, And, One, Zero);
1937 
1938   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1941 }
1942 
1943 bool AMDGPULegalizerInfo::legalizeFrem(
1944   MachineInstr &MI, MachineRegisterInfo &MRI,
1945   MachineIRBuilder &B) const {
1946     Register DstReg = MI.getOperand(0).getReg();
1947     Register Src0Reg = MI.getOperand(1).getReg();
1948     Register Src1Reg = MI.getOperand(2).getReg();
1949     auto Flags = MI.getFlags();
1950     LLT Ty = MRI.getType(DstReg);
1951 
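    // frem(x, y) = x - y * trunc(x / y), with the multiply and subtract fused
    // as fma(-trunc(x / y), y, x).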
1952     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1953     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1954     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1955     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1956     MI.eraseFromParent();
1957     return true;
1958 }
1959 
1960 static MachineInstrBuilder extractF64Exponent(Register Hi,
1961                                               MachineIRBuilder &B) {
1962   const unsigned FractBits = 52;
1963   const unsigned ExpBits = 11;
1964   LLT S32 = LLT::scalar(32);
1965 
1966   auto Const0 = B.buildConstant(S32, FractBits - 32);
1967   auto Const1 = B.buildConstant(S32, ExpBits);
1968 
1969   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1970     .addUse(Hi)
1971     .addUse(Const0.getReg(0))
1972     .addUse(Const1.getReg(0));
1973 
1974   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980   const LLT S1 = LLT::scalar(1);
1981   const LLT S32 = LLT::scalar(32);
1982   const LLT S64 = LLT::scalar(64);
1983 
1984   Register Src = MI.getOperand(1).getReg();
1985   assert(MRI.getType(Src) == S64);
1986 
1987   // TODO: Should this use extract since the low half is unused?
1988   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1989   Register Hi = Unmerge.getReg(1);
1990 
1991   // Extract the upper half, since this is where we will find the sign and
1992   // exponent.
1993   auto Exp = extractF64Exponent(Hi, B);
1994 
1995   const unsigned FractBits = 52;
1996 
1997   // Extract the sign bit.
1998   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1999   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2000 
2001   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2002 
2003   const auto Zero32 = B.buildConstant(S32, 0);
2004 
2005   // Extend back to 64-bits.
2006   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2007 
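  // Shifting the fraction mask right by the unbiased exponent leaves set bits
  // only in the fractional positions; clearing those bits in the source
  // truncates the value toward zero.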
2008   auto Shr = B.buildAShr(S64, FractMask, Exp);
2009   auto Not = B.buildNot(S64, Shr);
2010   auto Tmp0 = B.buildAnd(S64, Src, Not);
2011   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2012 
2013   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2014   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2015 
2016   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2017   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2018   MI.eraseFromParent();
2019   return true;
2020 }
2021 
2022 bool AMDGPULegalizerInfo::legalizeITOFP(
2023   MachineInstr &MI, MachineRegisterInfo &MRI,
2024   MachineIRBuilder &B, bool Signed) const {
2025 
2026   Register Dst = MI.getOperand(0).getReg();
2027   Register Src = MI.getOperand(1).getReg();
2028 
2029   const LLT S64 = LLT::scalar(64);
2030   const LLT S32 = LLT::scalar(32);
2031 
2032   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2033 
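  // Convert the two 32-bit halves separately and recombine:
  //   result = ldexp((double)hi, 32) + (double)(uint32_t)lo
  // where only the high half uses the signed conversion.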
2034   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2035 
2036   auto CvtHi = Signed ?
2037     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2038     B.buildUITOFP(S64, Unmerge.getReg(1));
2039 
2040   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2041 
2042   auto ThirtyTwo = B.buildConstant(S32, 32);
2043   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2044     .addUse(CvtHi.getReg(0))
2045     .addUse(ThirtyTwo.getReg(0));
2046 
2047   // TODO: Should this propagate fast-math-flags?
2048   B.buildFAdd(Dst, LdExp, CvtLo);
2049   MI.eraseFromParent();
2050   return true;
2051 }
2052 
2053 // TODO: Copied from DAG implementation. Verify logic and document how this
2054 // actually works.
2055 bool AMDGPULegalizerInfo::legalizeFPTOI(
2056   MachineInstr &MI, MachineRegisterInfo &MRI,
2057   MachineIRBuilder &B, bool Signed) const {
2058 
2059   Register Dst = MI.getOperand(0).getReg();
2060   Register Src = MI.getOperand(1).getReg();
2061 
2062   const LLT S64 = LLT::scalar(64);
2063   const LLT S32 = LLT::scalar(32);
2064 
2065   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2066 
2067   unsigned Flags = MI.getFlags();
2068 
2069   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2070   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2071   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
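  // K0 = 2^-32 and K1 = -2^32: split trunc(x) into 32-bit halves as
  //   hi = floor(trunc(x) * 2^-32), lo = fma(hi, -2^32, trunc(x)).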
2072 
2073   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2074   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2075   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2076 
2077   auto Hi = Signed ?
2078     B.buildFPTOSI(S32, FloorMul) :
2079     B.buildFPTOUI(S32, FloorMul);
2080   auto Lo = B.buildFPTOUI(S32, Fma);
2081 
2082   B.buildMerge(Dst, { Lo, Hi });
2083   MI.eraseFromParent();
2084 
2085   return true;
2086 }
2087 
2088 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2089                                                MachineInstr &MI) const {
2090   MachineFunction &MF = Helper.MIRBuilder.getMF();
2091   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2092 
2093   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2094                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2095 
2096   // With ieee_mode disabled, the instructions have the correct behavior
2097   // already for G_FMINNUM/G_FMAXNUM
2098   if (!MFI->getMode().IEEE)
2099     return !IsIEEEOp;
2100 
2101   if (IsIEEEOp)
2102     return true;
2103 
2104   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2105 }
2106 
2107 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2108   MachineInstr &MI, MachineRegisterInfo &MRI,
2109   MachineIRBuilder &B) const {
2110   // TODO: Should move some of this into LegalizerHelper.
2111 
2112   // TODO: Promote dynamic indexing of s16 to s32
2113 
2114   // FIXME: Artifact combiner probably should have replaced the truncated
2115   // constant before this, so we shouldn't need
2116   // getConstantVRegValWithLookThrough.
2117   Optional<ValueAndVReg> MaybeIdxVal =
2118       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2119   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2120     return true;
2121   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2122 
2123   Register Dst = MI.getOperand(0).getReg();
2124   Register Vec = MI.getOperand(1).getReg();
2125 
2126   LLT VecTy = MRI.getType(Vec);
2127   LLT EltTy = VecTy.getElementType();
2128   assert(EltTy == MRI.getType(Dst));
2129 
2130   if (IdxVal < VecTy.getNumElements())
2131     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2132   else
2133     B.buildUndef(Dst);
2134 
2135   MI.eraseFromParent();
2136   return true;
2137 }
2138 
2139 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2140   MachineInstr &MI, MachineRegisterInfo &MRI,
2141   MachineIRBuilder &B) const {
2142   // TODO: Should move some of this into LegalizerHelper.
2143 
2144   // TODO: Promote dynamic indexing of s16 to s32
2145 
2146   // FIXME: Artifact combiner probably should have replaced the truncated
2147   // constant before this, so we shouldn't need
2148   // getConstantVRegValWithLookThrough.
2149   Optional<ValueAndVReg> MaybeIdxVal =
2150       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2151   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2152     return true;
2153 
2154   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2155   Register Dst = MI.getOperand(0).getReg();
2156   Register Vec = MI.getOperand(1).getReg();
2157   Register Ins = MI.getOperand(2).getReg();
2158 
2159   LLT VecTy = MRI.getType(Vec);
2160   LLT EltTy = VecTy.getElementType();
2161   assert(EltTy == MRI.getType(Ins));
2162 
2163   if (IdxVal < VecTy.getNumElements())
2164     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2165   else
2166     B.buildUndef(Dst);
2167 
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2173   MachineInstr &MI, MachineRegisterInfo &MRI,
2174   MachineIRBuilder &B) const {
2175   const LLT V2S16 = LLT::vector(2, 16);
2176 
2177   Register Dst = MI.getOperand(0).getReg();
2178   Register Src0 = MI.getOperand(1).getReg();
2179   LLT DstTy = MRI.getType(Dst);
2180   LLT SrcTy = MRI.getType(Src0);
2181 
2182   if (SrcTy == V2S16 && DstTy == V2S16 &&
2183       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2184     return true;
2185 
2186   MachineIRBuilder HelperBuilder(MI);
2187   GISelObserverWrapper DummyObserver;
2188   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2189   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2190 }
2191 
2192 bool AMDGPULegalizerInfo::legalizeSinCos(
2193   MachineInstr &MI, MachineRegisterInfo &MRI,
2194   MachineIRBuilder &B) const {
2195 
2196   Register DstReg = MI.getOperand(0).getReg();
2197   Register SrcReg = MI.getOperand(1).getReg();
2198   LLT Ty = MRI.getType(DstReg);
2199   unsigned Flags = MI.getFlags();
2200 
2201   Register TrigVal;
2202   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2203   if (ST.hasTrigReducedRange()) {
2204     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2205     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2206       .addUse(MulVal.getReg(0))
2207       .setMIFlags(Flags).getReg(0);
2208   } else
2209     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2210 
2211   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2212     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2213   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2214     .addUse(TrigVal)
2215     .setMIFlags(Flags);
2216   MI.eraseFromParent();
2217   return true;
2218 }
2219 
2220 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2221                                                   MachineIRBuilder &B,
2222                                                   const GlobalValue *GV,
2223                                                   int64_t Offset,
2224                                                   unsigned GAFlags) const {
2225   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2226   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2227   // to the following code sequence:
2228   //
2229   // For constant address space:
2230   //   s_getpc_b64 s[0:1]
2231   //   s_add_u32 s0, s0, $symbol
2232   //   s_addc_u32 s1, s1, 0
2233   //
2234   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2235   //   a fixup or relocation is emitted to replace $symbol with a literal
2236   //   constant, which is a pc-relative offset from the encoding of the $symbol
2237   //   operand to the global variable.
2238   //
2239   // For global address space:
2240   //   s_getpc_b64 s[0:1]
2241   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2242   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2243   //
2244   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2245   //   fixups or relocations are emitted to replace $symbol@*@lo and
2246   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2247   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2248   //   operand to the global variable.
2249   //
2250   // What we want here is an offset from the value returned by s_getpc
2251   // (which is the address of the s_add_u32 instruction) to the global
2252   // variable, but since the encoding of $symbol starts 4 bytes after the start
2253   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2254   // small. This requires us to add 4 to the global variable offset in order to
2255   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2256   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2257   // instruction.
2258 
2259   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2260 
2261   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2262     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2263 
2264   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2265     .addDef(PCReg);
2266 
2267   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2268   if (GAFlags == SIInstrInfo::MO_NONE)
2269     MIB.addImm(0);
2270   else
2271     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2272 
2273   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2274 
2275   if (PtrTy.getSizeInBits() == 32)
2276     B.buildExtract(DstReg, PCReg, 0);
2277   return true;
2278  }
2279 
2280 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2281   MachineInstr &MI, MachineRegisterInfo &MRI,
2282   MachineIRBuilder &B) const {
2283   Register DstReg = MI.getOperand(0).getReg();
2284   LLT Ty = MRI.getType(DstReg);
2285   unsigned AS = Ty.getAddressSpace();
2286 
2287   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2288   MachineFunction &MF = B.getMF();
2289   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2290 
2291   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2292     if (!MFI->isModuleEntryFunction() &&
2293         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2294       const Function &Fn = MF.getFunction();
2295       DiagnosticInfoUnsupported BadLDSDecl(
2296         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2297         DS_Warning);
2298       Fn.getContext().diagnose(BadLDSDecl);
2299 
2300       // We currently don't have a way to correctly allocate LDS objects that
2301       // aren't directly associated with a kernel. We do force inlining of
2302       // functions that use local objects. However, if these dead functions are
2303       // not eliminated, we don't want a compile time error. Just emit a warning
2304       // and a trap, since there should be no callable path here.
2305       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2306       B.buildUndef(DstReg);
2307       MI.eraseFromParent();
2308       return true;
2309     }
2310 
2311     // TODO: We could emit code to handle the initialization somewhere.
2312     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2313       const SITargetLowering *TLI = ST.getTargetLowering();
2314       if (!TLI->shouldUseLDSConstAddress(GV)) {
2315         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2316         return true; // Leave in place;
2317       }
2318 
2319       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2320         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
        // zero-sized type in other languages, to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated LDS, so
        // all such declarations share the same offset.
2326         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2327           // Adjust alignment for that dynamic shared memory array.
2328           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2329           LLT S32 = LLT::scalar(32);
2330           auto Sz =
2331               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2332           B.buildIntToPtr(DstReg, Sz);
2333           MI.eraseFromParent();
2334           return true;
2335         }
2336       }
2337 
2338       B.buildConstant(
2339           DstReg,
2340           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2341       MI.eraseFromParent();
2342       return true;
2343     }
2344 
2345     const Function &Fn = MF.getFunction();
2346     DiagnosticInfoUnsupported BadInit(
2347       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2348     Fn.getContext().diagnose(BadInit);
2349     return true;
2350   }
2351 
2352   const SITargetLowering *TLI = ST.getTargetLowering();
2353 
2354   if (TLI->shouldEmitFixup(GV)) {
2355     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2356     MI.eraseFromParent();
2357     return true;
2358   }
2359 
2360   if (TLI->shouldEmitPCReloc(GV)) {
2361     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2362     MI.eraseFromParent();
2363     return true;
2364   }
2365 
2366   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2367   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2368 
2369   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2370       MachinePointerInfo::getGOT(MF),
2371       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2372           MachineMemOperand::MOInvariant,
2373       8 /*Size*/, Align(8));
2374 
2375   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2376 
2377   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2379     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2380     B.buildExtract(DstReg, Load, 0);
2381   } else
2382     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2383 
2384   MI.eraseFromParent();
2385   return true;
2386 }
2387 
2388 static LLT widenToNextPowerOf2(LLT Ty) {
2389   if (Ty.isVector())
2390     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2391   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2392 }
2393 
2394 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2395                                        MachineInstr &MI) const {
2396   MachineIRBuilder &B = Helper.MIRBuilder;
2397   MachineRegisterInfo &MRI = *B.getMRI();
2398   GISelChangeObserver &Observer = Helper.Observer;
2399 
2400   Register PtrReg = MI.getOperand(1).getReg();
2401   LLT PtrTy = MRI.getType(PtrReg);
2402   unsigned AddrSpace = PtrTy.getAddressSpace();
2403 
2404   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2405     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2406     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2407     Observer.changingInstr(MI);
2408     MI.getOperand(1).setReg(Cast.getReg(0));
2409     Observer.changedInstr(MI);
2410     return true;
2411   }
2412 
2413   Register ValReg = MI.getOperand(0).getReg();
2414   LLT ValTy = MRI.getType(ValReg);
2415 
2416   MachineMemOperand *MMO = *MI.memoperands_begin();
2417   const unsigned ValSize = ValTy.getSizeInBits();
2418   const unsigned MemSize = 8 * MMO->getSize();
2419   const Align MemAlign = MMO->getAlign();
2420   const unsigned AlignInBits = 8 * MemAlign.value();
2421 
2422   // Widen non-power-of-2 loads to the alignment if needed
2423   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2424     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2425 
2426     // This was already the correct extending load result type, so just adjust
2427     // the memory type.
2428     if (WideMemSize == ValSize) {
2429       MachineFunction &MF = B.getMF();
2430 
2431       MachineMemOperand *WideMMO =
2432           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2433       Observer.changingInstr(MI);
2434       MI.setMemRefs(MF, {WideMMO});
2435       Observer.changedInstr(MI);
2436       return true;
2437     }
2438 
2439     // Don't bother handling edge case that should probably never be produced.
2440     if (ValSize > WideMemSize)
2441       return false;
2442 
2443     LLT WideTy = widenToNextPowerOf2(ValTy);
2444 
2445     Register WideLoad;
2446     if (!WideTy.isVector()) {
2447       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2448       B.buildTrunc(ValReg, WideLoad).getReg(0);
2449     } else {
2450       // Extract the subvector.
2451 
2452       if (isRegisterType(ValTy)) {
2453         // If this a case where G_EXTRACT is legal, use it.
2454         // (e.g. <3 x s32> -> <4 x s32>)
2455         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2456         B.buildExtract(ValReg, WideLoad, 0);
2457       } else {
2458         // For cases where the widened type isn't a nice register value, unmerge
2459         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2460         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2461         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2462         B.setInsertPt(B.getMBB(), MI.getIterator());
2463         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2464       }
2465     }
2466 
2467     MI.eraseFromParent();
2468     return true;
2469   }
2470 
2471   return false;
2472 }
2473 
2474 bool AMDGPULegalizerInfo::legalizeFMad(
2475   MachineInstr &MI, MachineRegisterInfo &MRI,
2476   MachineIRBuilder &B) const {
2477   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2478   assert(Ty.isScalar());
2479 
2480   MachineFunction &MF = B.getMF();
2481   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2482 
2483   // TODO: Always legal with future ftz flag.
2484   // FIXME: Do we need just output?
2485   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2486     return true;
2487   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2488     return true;
2489 
2490   MachineIRBuilder HelperBuilder(MI);
2491   GISelObserverWrapper DummyObserver;
2492   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2493   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2494 }
2495 
2496 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2497   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2498   Register DstReg = MI.getOperand(0).getReg();
2499   Register PtrReg = MI.getOperand(1).getReg();
2500   Register CmpVal = MI.getOperand(2).getReg();
2501   Register NewVal = MI.getOperand(3).getReg();
2502 
2503   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2504          "this should not have been custom lowered");
2505 
2506   LLT ValTy = MRI.getType(CmpVal);
2507   LLT VecTy = LLT::vector(2, ValTy);
2508 
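  // Pack the new value and the compare value into a single vector operand, as
  // G_AMDGPU_ATOMIC_CMPXCHG takes them combined.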
2509   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2510 
2511   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2512     .addDef(DstReg)
2513     .addUse(PtrReg)
2514     .addUse(PackedVal)
2515     .setMemRefs(MI.memoperands());
2516 
2517   MI.eraseFromParent();
2518   return true;
2519 }
2520 
2521 bool AMDGPULegalizerInfo::legalizeFlog(
2522   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2523   Register Dst = MI.getOperand(0).getReg();
2524   Register Src = MI.getOperand(1).getReg();
2525   LLT Ty = B.getMRI()->getType(Dst);
2526   unsigned Flags = MI.getFlags();
2527 
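  // log_b(x) = log2(x) * log_b(2); callers pass log_b(2) as Log2BaseInverted
  // (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).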
2528   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2529   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2530 
2531   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2532   MI.eraseFromParent();
2533   return true;
2534 }
2535 
2536 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2537                                        MachineIRBuilder &B) const {
2538   Register Dst = MI.getOperand(0).getReg();
2539   Register Src = MI.getOperand(1).getReg();
2540   unsigned Flags = MI.getFlags();
2541   LLT Ty = B.getMRI()->getType(Dst);
2542 
2543   auto K = B.buildFConstant(Ty, numbers::log2e);
2544   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2545   B.buildFExp2(Dst, Mul, Flags);
2546   MI.eraseFromParent();
2547   return true;
2548 }
2549 
2550 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2551                                        MachineIRBuilder &B) const {
2552   Register Dst = MI.getOperand(0).getReg();
2553   Register Src0 = MI.getOperand(1).getReg();
2554   Register Src1 = MI.getOperand(2).getReg();
2555   unsigned Flags = MI.getFlags();
2556   LLT Ty = B.getMRI()->getType(Dst);
2557   const LLT S16 = LLT::scalar(16);
2558   const LLT S32 = LLT::scalar(32);
2559 
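  // Expand pow(x, y) as exp2(y * log2(x)); the product is formed with
  // amdgcn_fmul_legacy.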
2560   if (Ty == S32) {
2561     auto Log = B.buildFLog2(S32, Src0, Flags);
2562     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2563       .addUse(Log.getReg(0))
2564       .addUse(Src1)
2565       .setMIFlags(Flags);
2566     B.buildFExp2(Dst, Mul, Flags);
2567   } else if (Ty == S16) {
2568     // There's no f16 fmul_legacy, so we need to convert for it.
2569     auto Log = B.buildFLog2(S16, Src0, Flags);
2570     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2571     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2572     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2573       .addUse(Ext0.getReg(0))
2574       .addUse(Ext1.getReg(0))
2575       .setMIFlags(Flags);
2576 
2577     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2578   } else
2579     return false;
2580 
2581   MI.eraseFromParent();
2582   return true;
2583 }
2584 
2585 // Find a source register, ignoring any possible source modifiers.
2586 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2587   Register ModSrc = OrigSrc;
2588   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2589     ModSrc = SrcFNeg->getOperand(1).getReg();
2590     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2591       ModSrc = SrcFAbs->getOperand(1).getReg();
2592   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2593     ModSrc = SrcFAbs->getOperand(1).getReg();
2594   return ModSrc;
2595 }
2596 
2597 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2598                                          MachineRegisterInfo &MRI,
2599                                          MachineIRBuilder &B) const {
2600 
2601   const LLT S1 = LLT::scalar(1);
2602   const LLT S64 = LLT::scalar(64);
2603   Register Dst = MI.getOperand(0).getReg();
2604   Register OrigSrc = MI.getOperand(1).getReg();
2605   unsigned Flags = MI.getFlags();
2606   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2607          "this should not have been custom lowered");
2608 
2609   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2610   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2611   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2612   // V_FRACT bug is:
2613   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2614   //
2615   // Convert floor(x) to (x - fract(x))
2616 
2617   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2618     .addUse(OrigSrc)
2619     .setMIFlags(Flags);
2620 
2621   // Give source modifier matching some assistance before obscuring a foldable
2622   // pattern.
2623 
2624   // TODO: We can avoid the neg on the fract? The input sign to fract
2625   // shouldn't matter?
2626   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2627 
2628   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
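  // 0x3fefffffffffffff is the largest double less than 1.0, matching the
  // clamp value in the workaround formula above.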
2629 
2630   Register Min = MRI.createGenericVirtualRegister(S64);
2631 
2632   // We don't need to concern ourselves with the snan handling difference, so
2633   // use the one which will directly select.
2634   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2635   if (MFI->getMode().IEEE)
2636     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2637   else
2638     B.buildFMinNum(Min, Fract, Const, Flags);
2639 
2640   Register CorrectedFract = Min;
2641   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2642     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2643     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2644   }
2645 
2646   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2647   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2648 
2649   MI.eraseFromParent();
2650   return true;
2651 }
2652 
2653 // Turn an illegal packed v2s16 build vector into bit operations.
2654 // TODO: This should probably be a bitcast action in LegalizerHelper.
2655 bool AMDGPULegalizerInfo::legalizeBuildVector(
2656   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2657   Register Dst = MI.getOperand(0).getReg();
2658   const LLT S32 = LLT::scalar(32);
2659   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2660 
2661   Register Src0 = MI.getOperand(1).getReg();
2662   Register Src1 = MI.getOperand(2).getReg();
2663   assert(MRI.getType(Src0) == LLT::scalar(16));
2664 
2665   auto Merge = B.buildMerge(S32, {Src0, Src1});
2666   B.buildBitcast(Dst, Merge);
2667 
2668   MI.eraseFromParent();
2669   return true;
2670 }
2671 
2672 // Check that this is a G_XOR x, -1
2673 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2674   if (MI.getOpcode() != TargetOpcode::G_XOR)
2675     return false;
2676   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2677   return ConstVal && *ConstVal == -1;
2678 }
2679 
2680 // Return the use branch instruction, otherwise null if the usage is invalid.
2681 static MachineInstr *
2682 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2683                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2684   Register CondDef = MI.getOperand(0).getReg();
2685   if (!MRI.hasOneNonDBGUse(CondDef))
2686     return nullptr;
2687 
2688   MachineBasicBlock *Parent = MI.getParent();
2689   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2690 
2691   if (isNot(MRI, *UseMI)) {
2692     Register NegatedCond = UseMI->getOperand(0).getReg();
2693     if (!MRI.hasOneNonDBGUse(NegatedCond))
2694       return nullptr;
2695 
2696     // We're deleting the def of this value, so we need to remove it.
2697     UseMI->eraseFromParent();
2698 
2699     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2700     Negated = true;
2701   }
2702 
2703   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2704     return nullptr;
2705 
2706   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2707   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2708   if (Next == Parent->end()) {
2709     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2710     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2711       return nullptr;
2712     UncondBrTarget = &*NextMBB;
2713   } else {
2714     if (Next->getOpcode() != AMDGPU::G_BR)
2715       return nullptr;
2716     Br = &*Next;
2717     UncondBrTarget = Br->getOperand(0).getMBB();
2718   }
2719 
2720   return UseMI;
2721 }
2722 
2723 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2724                                          const ArgDescriptor *Arg,
2725                                          const TargetRegisterClass *ArgRC,
2726                                          LLT ArgTy) const {
2727   MCRegister SrcReg = Arg->getRegister();
2728   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2729   assert(DstReg.isVirtual() && "Virtual register expected");
2730 
2731   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2732                                              ArgTy);
2733   if (Arg->isMasked()) {
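    // A masked argument occupies a bit-field of a shared input register (for
    // example the packed work-item ID inputs), so extract it with a shift and
    // mask.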
2734     // TODO: Should we try to emit this once in the entry block?
2735     const LLT S32 = LLT::scalar(32);
2736     const unsigned Mask = Arg->getMask();
2737     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2738 
2739     Register AndMaskSrc = LiveIn;
2740 
2741     if (Shift != 0) {
2742       auto ShiftAmt = B.buildConstant(S32, Shift);
2743       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2744     }
2745 
2746     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2747   } else {
2748     B.buildCopy(DstReg, LiveIn);
2749   }
2750 
2751   return true;
2752 }
2753 
2754 bool AMDGPULegalizerInfo::loadInputValue(
2755     Register DstReg, MachineIRBuilder &B,
2756     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2757   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2758   const ArgDescriptor *Arg;
2759   const TargetRegisterClass *ArgRC;
2760   LLT ArgTy;
2761   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2762 
2763   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2764     return false; // TODO: Handle these
2765   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2766 }
2767 
2768 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2769     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2770     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2771   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2772     return false;
2773 
2774   MI.eraseFromParent();
2775   return true;
2776 }
2777 
2778 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2779                                        MachineRegisterInfo &MRI,
2780                                        MachineIRBuilder &B) const {
2781   Register Dst = MI.getOperand(0).getReg();
2782   LLT DstTy = MRI.getType(Dst);
2783   LLT S16 = LLT::scalar(16);
2784   LLT S32 = LLT::scalar(32);
2785   LLT S64 = LLT::scalar(64);
2786 
2787   if (DstTy == S16)
2788     return legalizeFDIV16(MI, MRI, B);
2789   if (DstTy == S32)
2790     return legalizeFDIV32(MI, MRI, B);
2791   if (DstTy == S64)
2792     return legalizeFDIV64(MI, MRI, B);
2793 
2794   return false;
2795 }
2796 
2797 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2798                                                         Register DstDivReg,
2799                                                         Register DstRemReg,
2800                                                         Register X,
2801                                                         Register Y) const {
2802   const LLT S1 = LLT::scalar(1);
2803   const LLT S32 = LLT::scalar(32);
2804 
2805   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2806   // algorithm used here.
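  // In short: form Z ~= 2^32 / Y from the hardware float reciprocal scaled to
  // the 2^32 range, refine Z with one Newton-Raphson step in integer
  // arithmetic, take Q = umulh(X, Z) and R = X - Q * Y, then correct Q and R
  // with at most two conditional adjustments.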
2807 
2808   // Initial estimate of inv(y).
2809   auto FloatY = B.buildUITOFP(S32, Y);
2810   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2811   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2812   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2813   auto Z = B.buildFPTOUI(S32, ScaledY);
2814 
2815   // One round of UNR.
2816   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2817   auto NegYZ = B.buildMul(S32, NegY, Z);
2818   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2819 
2820   // Quotient/remainder estimate.
2821   auto Q = B.buildUMulH(S32, X, Z);
2822   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2823 
2824   // First quotient/remainder refinement.
2825   auto One = B.buildConstant(S32, 1);
2826   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2827   if (DstDivReg)
2828     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2829   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2830 
2831   // Second quotient/remainder refinement.
2832   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2833   if (DstDivReg)
2834     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2835 
2836   if (DstRemReg)
2837     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2838 }
2839 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2841 //
2842 // Return lo, hi of result
2843 //
2844 // %cvt.lo = G_UITOFP Val.lo
2845 // %cvt.hi = G_UITOFP Val.hi
2846 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2847 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2848 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2849 // %mul2 = G_FMUL %mul1, 2**(-32)
2850 // %trunc = G_INTRINSIC_TRUNC %mul2
2851 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2852 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
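// The f32 bit patterns used below: 0x4f800000 is 2**32, 0x5f7ffffc is just
// below 2**64, 0x2f800000 is 2**(-32), and 0xcf800000 is -(2**32).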
2853 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2854                                                        Register Val) {
2855   const LLT S32 = LLT::scalar(32);
2856   auto Unmerge = B.buildUnmerge(S32, Val);
2857 
2858   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2859   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2860 
2861   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2862                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2863 
2864   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2865   auto Mul1 =
2866       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2867 
2868   // 2**(-32)
2869   auto Mul2 =
2870       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2871   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2872 
2873   // -(2**32)
2874   auto Mad2 = B.buildFMAD(S32, Trunc,
2875                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2876 
2877   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2878   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2879 
2880   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2881 }
2882 
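// 64-bit variant of the expansion above (a descriptive sketch, not a spec):
// start from the ~2**64 / Denom estimate produced by emitReciprocalU64, refine
// it with two rounds of Newton-Raphson style updates carried out in 32-bit
// pieces, form the quotient estimate with a 64-bit umulh of the numerator, and
// finally apply up to two conditional correction steps.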
2883 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2884                                                         Register DstDivReg,
2885                                                         Register DstRemReg,
2886                                                         Register Numer,
2887                                                         Register Denom) const {
2888   const LLT S32 = LLT::scalar(32);
2889   const LLT S64 = LLT::scalar(64);
2890   const LLT S1 = LLT::scalar(1);
2891   Register RcpLo, RcpHi;
2892 
2893   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2894 
2895   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2896 
2897   auto Zero64 = B.buildConstant(S64, 0);
2898   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2899 
2900   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2901   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2902 
2903   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2904   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2905   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2906 
2907   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2908   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2909   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2910   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2911 
2912   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2913   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2914   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2915   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2916   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2917 
2918   auto Zero32 = B.buildConstant(S32, 0);
2919   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2920   auto Add2_HiC =
2921       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2922   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2923   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2924 
2925   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2926   Register NumerLo = UnmergeNumer.getReg(0);
2927   Register NumerHi = UnmergeNumer.getReg(1);
2928 
2929   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2930   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2931   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2932   Register Mul3_Lo = UnmergeMul3.getReg(0);
2933   Register Mul3_Hi = UnmergeMul3.getReg(1);
2934   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2935   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2936   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2937   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2938 
2939   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2940   Register DenomLo = UnmergeDenom.getReg(0);
2941   Register DenomHi = UnmergeDenom.getReg(1);
2942 
2943   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2944   auto C1 = B.buildSExt(S32, CmpHi);
2945 
2946   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2947   auto C2 = B.buildSExt(S32, CmpLo);
2948 
2949   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2950   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2951 
2952   // TODO: Here and below portions of the code can be enclosed into if/endif.
2953   // Currently the control flow is unconditional and we have 4 selects after
2954   // the potential endif to substitute PHIs.
2955 
2956   // if C3 != 0 ...
2957   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2958   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2959   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2960   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2961 
2962   auto One64 = B.buildConstant(S64, 1);
2963   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2964 
2965   auto C4 =
2966       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2967   auto C5 =
2968       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2969   auto C6 = B.buildSelect(
2970       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2971 
2972   // if (C6 != 0)
2973   auto Add4 = B.buildAdd(S64, Add3, One64);
2974   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2975 
2976   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2977   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2978   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2979 
2980   // endif C6
2981   // endif C3
2982 
2983   if (DstDivReg) {
2984     auto Sel1 = B.buildSelect(
2985         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2986     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
2987                   Sel1, MulHi3);
2988   }
2989 
2990   if (DstRemReg) {
2991     auto Sel2 = B.buildSelect(
2992         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2993     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
2994                   Sel2, Sub1);
2995   }
2996 }
2997 
2998 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
2999                                                   MachineRegisterInfo &MRI,
3000                                                   MachineIRBuilder &B) const {
3001   Register DstDivReg, DstRemReg;
3002   switch (MI.getOpcode()) {
3003   default:
3004     llvm_unreachable("Unexpected opcode!");
3005   case AMDGPU::G_UDIV: {
3006     DstDivReg = MI.getOperand(0).getReg();
3007     break;
3008   }
3009   case AMDGPU::G_UREM: {
3010     DstRemReg = MI.getOperand(0).getReg();
3011     break;
3012   }
3013   case AMDGPU::G_UDIVREM: {
3014     DstDivReg = MI.getOperand(0).getReg();
3015     DstRemReg = MI.getOperand(1).getReg();
3016     break;
3017   }
3018   }
3019 
3020   const LLT S64 = LLT::scalar(64);
3021   const LLT S32 = LLT::scalar(32);
3022   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3023   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3024   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3025   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3026 
3027   if (Ty == S32)
3028     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3029   else if (Ty == S64)
3030     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3031   else
3032     return false;
3033 
3034   MI.eraseFromParent();
3035   return true;
3036 }
3037 
3038 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3039                                                 MachineRegisterInfo &MRI,
3040                                                 MachineIRBuilder &B) const {
3041   const LLT S64 = LLT::scalar(64);
3042   const LLT S32 = LLT::scalar(32);
3043 
3044   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3045   if (Ty != S32 && Ty != S64)
3046     return false;
3047 
3048   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3049   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3050   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3051 
3052   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3053   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3054   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3055 
3056   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3057   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3058 
3059   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3060   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
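  // With S = X >> (bits - 1) being 0 or -1, (X + S) ^ S == |X|, so LHS and
  // RHS now hold the absolute values of the original operands.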
3061 
3062   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3063   switch (MI.getOpcode()) {
3064   default:
3065     llvm_unreachable("Unexpected opcode!");
3066   case AMDGPU::G_SDIV: {
3067     DstDivReg = MI.getOperand(0).getReg();
3068     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3069     break;
3070   }
3071   case AMDGPU::G_SREM: {
3072     DstRemReg = MI.getOperand(0).getReg();
3073     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3074     break;
3075   }
3076   case AMDGPU::G_SDIVREM: {
3077     DstDivReg = MI.getOperand(0).getReg();
3078     DstRemReg = MI.getOperand(1).getReg();
3079     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3080     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3081     break;
3082   }
3083   }
3084 
3085   if (Ty == S32)
3086     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3087   else
3088     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3089 
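  // Restore the signs: the quotient is negative iff the operand signs differ,
  // so with S = LHSign ^ RHSign (0 or -1), (q ^ S) - S conditionally negates
  // it; the remainder instead takes the sign of the dividend (LHSign).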
3090   if (DstDivReg) {
3091     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3092     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3093     B.buildSub(DstDivReg, SignXor, Sign);
3094   }
3095 
3096   if (DstRemReg) {
3097     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3098     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3099     B.buildSub(DstRemReg, SignXor, Sign);
3100   }
3101 
3102   MI.eraseFromParent();
3103   return true;
3104 }
3105 
3106 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3107                                                  MachineRegisterInfo &MRI,
3108                                                  MachineIRBuilder &B) const {
3109   Register Res = MI.getOperand(0).getReg();
3110   Register LHS = MI.getOperand(1).getReg();
3111   Register RHS = MI.getOperand(2).getReg();
3112   uint16_t Flags = MI.getFlags();
3113   LLT ResTy = MRI.getType(Res);
3114 
3115   const MachineFunction &MF = B.getMF();
3116   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3117                             MI.getFlag(MachineInstr::FmAfn);
3118 
3119   if (!AllowInaccurateRcp)
3120     return false;
3121 
3122   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3123     // 1 / x -> RCP(x)
3124     if (CLHS->isExactlyValue(1.0)) {
3125       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3126         .addUse(RHS)
3127         .setMIFlags(Flags);
3128 
3129       MI.eraseFromParent();
3130       return true;
3131     }
3132 
3133     // -1 / x -> RCP( FNEG(x) )
3134     if (CLHS->isExactlyValue(-1.0)) {
3135       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3136       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3137         .addUse(FNeg.getReg(0))
3138         .setMIFlags(Flags);
3139 
3140       MI.eraseFromParent();
3141       return true;
3142     }
3143   }
3144 
3145   // x / y -> x * (1.0 / y)
3146   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3147     .addUse(RHS)
3148     .setMIFlags(Flags);
3149   B.buildFMul(Res, LHS, RCP, Flags);
3150 
3151   MI.eraseFromParent();
3152   return true;
3153 }
3154 
3155 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3156                                                    MachineRegisterInfo &MRI,
3157                                                    MachineIRBuilder &B) const {
3158   Register Res = MI.getOperand(0).getReg();
3159   Register X = MI.getOperand(1).getReg();
3160   Register Y = MI.getOperand(2).getReg();
3161   uint16_t Flags = MI.getFlags();
3162   LLT ResTy = MRI.getType(Res);
3163 
3164   const MachineFunction &MF = B.getMF();
3165   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3166                             MI.getFlag(MachineInstr::FmAfn);
3167 
3168   if (!AllowInaccurateRcp)
3169     return false;
3170 
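  // Refine the hardware 1/y estimate with two Newton-Raphson iterations, each
  // of the form r' = r + r * (1 - y * r) expressed as two FMAs, then polish
  // x * r with one final residual FMA step.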
3171   auto NegY = B.buildFNeg(ResTy, Y);
3172   auto One = B.buildFConstant(ResTy, 1.0);
3173 
3174   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3175     .addUse(Y)
3176     .setMIFlags(Flags);
3177 
3178   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3179   R = B.buildFMA(ResTy, Tmp0, R, R);
3180 
3181   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3182   R = B.buildFMA(ResTy, Tmp1, R, R);
3183 
3184   auto Ret = B.buildFMul(ResTy, X, R);
3185   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3186 
3187   B.buildFMA(Res, Tmp2, R, Ret);
3188   MI.eraseFromParent();
3189   return true;
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3193                                          MachineRegisterInfo &MRI,
3194                                          MachineIRBuilder &B) const {
3195   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3196     return true;
3197 
3198   Register Res = MI.getOperand(0).getReg();
3199   Register LHS = MI.getOperand(1).getReg();
3200   Register RHS = MI.getOperand(2).getReg();
3201 
3202   uint16_t Flags = MI.getFlags();
3203 
3204   LLT S16 = LLT::scalar(16);
3205   LLT S32 = LLT::scalar(32);
3206 
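  // Expand via f32: extend both operands, multiply LHS by the f32 rcp of RHS,
  // truncate back to f16, and let amdgcn.div.fixup handle the remaining
  // special cases.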
3207   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3208   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3209 
3210   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3211     .addUse(RHSExt.getReg(0))
3212     .setMIFlags(Flags);
3213 
3214   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3215   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3216 
3217   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3218     .addUse(RDst.getReg(0))
3219     .addUse(RHS)
3220     .addUse(LHS)
3221     .setMIFlags(Flags);
3222 
3223   MI.eraseFromParent();
3224   return true;
3225 }
3226 
3227 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3228 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3229 static void toggleSPDenormMode(bool Enable,
3230                                MachineIRBuilder &B,
3231                                const GCNSubtarget &ST,
3232                                AMDGPU::SIModeRegisterDefaults Mode) {
3233   // Set SP denorm mode to this value.
3234   unsigned SPDenormMode =
3235     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3236 
3237   if (ST.hasDenormModeInst()) {
3238     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3239     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3240 
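    // The S_DENORM_MODE immediate packs the FP32 control in bits 1:0 and the
    // FP64/FP16 control in bits 3:2.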
3241     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3242     B.buildInstr(AMDGPU::S_DENORM_MODE)
3243       .addImm(NewDenormModeValue);
3244 
3245   } else {
3246     // Select FP32 bit field in mode register.
3247     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3248                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3249                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
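    // i.e. hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 (WIDTH_M1 == 1),
    // selecting the FP32 denorm bits of the MODE register.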
3250 
3251     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3252       .addImm(SPDenormMode)
3253       .addImm(SPDenormModeBitField);
3254   }
3255 }
3256 
3257 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3258                                          MachineRegisterInfo &MRI,
3259                                          MachineIRBuilder &B) const {
3260   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3261     return true;
3262 
3263   Register Res = MI.getOperand(0).getReg();
3264   Register LHS = MI.getOperand(1).getReg();
3265   Register RHS = MI.getOperand(2).getReg();
3266   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3267   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3268 
3269   uint16_t Flags = MI.getFlags();
3270 
3271   LLT S32 = LLT::scalar(32);
3272   LLT S1 = LLT::scalar(1);
3273 
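  // Expansion built on div.scale / div.fmas / div.fixup: scale the operands,
  // refine rcp(denominator) with an FMA chain, then undo the scaling and fix
  // up the special cases. The FMA refinement relies on FP32 denormal handling,
  // hence the temporary denorm-mode switch when denormals are off by default.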
3274   auto One = B.buildFConstant(S32, 1.0f);
3275 
3276   auto DenominatorScaled =
3277     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3278       .addUse(LHS)
3279       .addUse(RHS)
3280       .addImm(0)
3281       .setMIFlags(Flags);
3282   auto NumeratorScaled =
3283     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3284       .addUse(LHS)
3285       .addUse(RHS)
3286       .addImm(1)
3287       .setMIFlags(Flags);
3288 
3289   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3290     .addUse(DenominatorScaled.getReg(0))
3291     .setMIFlags(Flags);
3292   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3293 
3294   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3295   // aren't modeled as reading it.
3296   if (!Mode.allFP32Denormals())
3297     toggleSPDenormMode(true, B, ST, Mode);
3298 
3299   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3300   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3301   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3302   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3303   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3304   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3305 
3306   if (!Mode.allFP32Denormals())
3307     toggleSPDenormMode(false, B, ST, Mode);
3308 
3309   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3310     .addUse(Fma4.getReg(0))
3311     .addUse(Fma1.getReg(0))
3312     .addUse(Fma3.getReg(0))
3313     .addUse(NumeratorScaled.getReg(1))
3314     .setMIFlags(Flags);
3315 
3316   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3317     .addUse(Fmas.getReg(0))
3318     .addUse(RHS)
3319     .addUse(LHS)
3320     .setMIFlags(Flags);
3321 
3322   MI.eraseFromParent();
3323   return true;
3324 }
3325 
3326 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3327                                          MachineRegisterInfo &MRI,
3328                                          MachineIRBuilder &B) const {
3329   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3330     return true;
3331 
3332   Register Res = MI.getOperand(0).getReg();
3333   Register LHS = MI.getOperand(1).getReg();
3334   Register RHS = MI.getOperand(2).getReg();
3335 
3336   uint16_t Flags = MI.getFlags();
3337 
3338   LLT S64 = LLT::scalar(64);
3339   LLT S1 = LLT::scalar(1);
3340 
3341   auto One = B.buildFConstant(S64, 1.0);
3342 
3343   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3344     .addUse(LHS)
3345     .addUse(RHS)
3346     .addImm(0)
3347     .setMIFlags(Flags);
3348 
3349   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3350 
3351   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3352     .addUse(DivScale0.getReg(0))
3353     .setMIFlags(Flags);
3354 
3355   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3356   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3357   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3358 
3359   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3360     .addUse(LHS)
3361     .addUse(RHS)
3362     .addImm(1)
3363     .setMIFlags(Flags);
3364 
3365   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3366   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3367   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3368 
3369   Register Scale;
3370   if (!ST.hasUsableDivScaleConditionOutput()) {
3371     // Work around a hardware bug on SI where the condition output from div_scale
3372     // is not usable.
3373 
3374     LLT S32 = LLT::scalar(32);
3375 
3376     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3377     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3378     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3379     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3380 
3381     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3382                               Scale1Unmerge.getReg(1));
3383     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3384                               Scale0Unmerge.getReg(1));
3385     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3386   } else {
3387     Scale = DivScale1.getReg(1);
3388   }
3389 
3390   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3391     .addUse(Fma4.getReg(0))
3392     .addUse(Fma3.getReg(0))
3393     .addUse(Mul.getReg(0))
3394     .addUse(Scale)
3395     .setMIFlags(Flags);
3396 
3397   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3398     .addUse(Fmas.getReg(0))
3399     .addUse(RHS)
3400     .addUse(LHS)
3401     .setMIFlags(Flags);
3402 
3403   MI.eraseFromParent();
3404   return true;
3405 }
3406 
3407 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3408                                                  MachineRegisterInfo &MRI,
3409                                                  MachineIRBuilder &B) const {
3410   Register Res = MI.getOperand(0).getReg();
3411   Register LHS = MI.getOperand(2).getReg();
3412   Register RHS = MI.getOperand(3).getReg();
3413   uint16_t Flags = MI.getFlags();
3414 
3415   LLT S32 = LLT::scalar(32);
3416   LLT S1 = LLT::scalar(1);
3417 
3418   auto Abs = B.buildFAbs(S32, RHS, Flags);
3419   const APFloat C0Val(1.0f);
3420 
3421   auto C0 = B.buildConstant(S32, 0x6f800000);
3422   auto C1 = B.buildConstant(S32, 0x2f800000);
3423   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
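  // 0x6f800000 is 2.0^96 and 0x2f800000 is 2.0^-32: if |RHS| is very large,
  // pre-scale it by 2^-32 before taking the reciprocal and re-apply the same
  // scale to the final product, keeping the intermediate rcp result out of
  // the denormal range.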
3424 
3425   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3426   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3427 
3428   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3429 
3430   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3431     .addUse(Mul0.getReg(0))
3432     .setMIFlags(Flags);
3433 
3434   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3435 
3436   B.buildFMul(Res, Sel, Mul1, Flags);
3437 
3438   MI.eraseFromParent();
3439   return true;
3440 }
3441 
3442 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3443 // FIXME: Why do we handle this one but not other removed instructions?
3444 //
3445 // Reciprocal square root.  The clamp prevents infinite results, clamping
3446 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3447 // +-max_float.
3448 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3449                                                     MachineRegisterInfo &MRI,
3450                                                     MachineIRBuilder &B) const {
3451   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3452     return true;
3453 
3454   Register Dst = MI.getOperand(0).getReg();
3455   Register Src = MI.getOperand(2).getReg();
3456   auto Flags = MI.getFlags();
3457 
3458   LLT Ty = MRI.getType(Dst);
3459 
3460   const fltSemantics *FltSemantics;
3461   if (Ty == LLT::scalar(32))
3462     FltSemantics = &APFloat::IEEEsingle();
3463   else if (Ty == LLT::scalar(64))
3464     FltSemantics = &APFloat::IEEEdouble();
3465   else
3466     return false;
3467 
3468   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3469     .addUse(Src)
3470     .setMIFlags(Flags);
3471 
3472   // We don't need to concern ourselves with the snan handling difference, since
3473   // the rsq is quieted (or not) regardless, so use the form that selects directly.
3474   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3475   const bool UseIEEE = MFI->getMode().IEEE;
3476 
3477   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3478   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3479                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3480 
3481   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3482 
3483   if (UseIEEE)
3484     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3485   else
3486     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3487   MI.eraseFromParent();
3488   return true;
3489 }
3490 
3491 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3492   switch (IID) {
3493   case Intrinsic::amdgcn_ds_fadd:
3494     return AMDGPU::G_ATOMICRMW_FADD;
3495   case Intrinsic::amdgcn_ds_fmin:
3496     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3497   case Intrinsic::amdgcn_ds_fmax:
3498     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3499   default:
3500     llvm_unreachable("not a DS FP intrinsic");
3501   }
3502 }
3503 
3504 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3505                                                       MachineInstr &MI,
3506                                                       Intrinsic::ID IID) const {
3507   GISelChangeObserver &Observer = Helper.Observer;
3508   Observer.changingInstr(MI);
3509 
3510   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3511 
3512   // The remaining operands were used to set fields in the MemOperand on
3513   // construction.
3514   for (int I = 6; I > 3; --I)
3515     MI.RemoveOperand(I);
3516 
3517   MI.RemoveOperand(1); // Remove the intrinsic ID.
3518   Observer.changedInstr(MI);
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3523                                             MachineRegisterInfo &MRI,
3524                                             MachineIRBuilder &B) const {
3525   uint64_t Offset =
3526     ST.getTargetLowering()->getImplicitParameterOffset(
3527       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3528   LLT DstTy = MRI.getType(DstReg);
3529   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3530 
3531   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3532   if (!loadInputValue(KernargPtrReg, B,
3533                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3534     return false;
3535 
3536   // FIXME: This should be nuw
3537   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3538   return true;
3539 }
3540 
3541 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3542                                                  MachineRegisterInfo &MRI,
3543                                                  MachineIRBuilder &B) const {
3544   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3545   if (!MFI->isEntryFunction()) {
3546     return legalizePreloadedArgIntrin(MI, MRI, B,
3547                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3548   }
3549 
3550   Register DstReg = MI.getOperand(0).getReg();
3551   if (!getImplicitArgPtr(DstReg, MRI, B))
3552     return false;
3553 
3554   MI.eraseFromParent();
3555   return true;
3556 }
3557 
3558 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3559                                               MachineRegisterInfo &MRI,
3560                                               MachineIRBuilder &B,
3561                                               unsigned AddrSpace) const {
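  // A flat pointer is in the queried segment iff the high 32 bits of its
  // address match that segment's aperture base.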
3562   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3563   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3564   Register Hi32 = Unmerge.getReg(1);
3565 
3566   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3567   MI.eraseFromParent();
3568   return true;
3569 }
3570 
3571 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3572 // offset (the offset that is included in bounds checking and swizzling, to be
3573 // split between the instruction's voffset and immoffset fields) and soffset
3574 // (the offset that is excluded from bounds checking and swizzling, to go in
3575 // the instruction's soffset field).  This function takes the first kind of
3576 // offset and figures out how to split it between voffset and immoffset.
3577 std::tuple<Register, unsigned, unsigned>
3578 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3579                                         Register OrigOffset) const {
3580   const unsigned MaxImm = 4095;
3581   Register BaseReg;
3582   unsigned TotalConstOffset;
3583   const LLT S32 = LLT::scalar(32);
3584   MachineRegisterInfo &MRI = *B.getMRI();
3585 
3586   std::tie(BaseReg, TotalConstOffset) =
3587       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3588 
3589   unsigned ImmOffset = TotalConstOffset;
3590 
3591   // If BaseReg is a pointer, convert it to int.
3592   if (MRI.getType(BaseReg).isPointer())
3593     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3594 
3595   // If the immediate value is too big for the immoffset field, keep only its
3596   // low 12 bits (the value modulo 4096) there, so that the part copied/added
3597   // into the voffset register is a multiple of 4096 and stands more chance
3598   // of being CSEd with the copy/add for another similar load/store.
3599   // However, do not do that rounding down to a multiple of 4096 if that is a
3600   // negative number, as it appears to be illegal to have a negative offset
3601   // in the vgpr, even if adding the immediate offset makes it positive.
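  // For example, a constant offset of 8200 is split into 8192 (added to the
  // voffset register below) and an ImmOffset of 8.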
3602   unsigned Overflow = ImmOffset & ~MaxImm;
3603   ImmOffset -= Overflow;
3604   if ((int32_t)Overflow < 0) {
3605     Overflow += ImmOffset;
3606     ImmOffset = 0;
3607   }
3608 
3609   if (Overflow != 0) {
3610     if (!BaseReg) {
3611       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3612     } else {
3613       auto OverflowVal = B.buildConstant(S32, Overflow);
3614       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3615     }
3616   }
3617 
3618   if (!BaseReg)
3619     BaseReg = B.buildConstant(S32, 0).getReg(0);
3620 
3621   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3622 }
3623 
3624 /// Handle register layout difference for f16 images for some subtargets.
3625 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3626                                              MachineRegisterInfo &MRI,
3627                                              Register Reg,
3628                                              bool ImageStore) const {
3629   const LLT S16 = LLT::scalar(16);
3630   const LLT S32 = LLT::scalar(32);
3631   LLT StoreVT = MRI.getType(Reg);
3632   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3633 
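  // With unpacked d16, each 16-bit element lives in the low half of its own
  // 32-bit register, so widen every element to s32 individually.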
3634   if (ST.hasUnpackedD16VMem()) {
3635     auto Unmerge = B.buildUnmerge(S16, Reg);
3636 
3637     SmallVector<Register, 4> WideRegs;
3638     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3639       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3640 
3641     int NumElts = StoreVT.getNumElements();
3642 
3643     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3644   }
3645 
3646   if (ImageStore && ST.hasImageStoreD16Bug()) {
3647     if (StoreVT.getNumElements() == 2) {
3648       SmallVector<Register, 4> PackedRegs;
3649       Reg = B.buildBitcast(S32, Reg).getReg(0);
3650       PackedRegs.push_back(Reg);
3651       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3652       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3653     }
3654 
3655     if (StoreVT.getNumElements() == 3) {
3656       SmallVector<Register, 4> PackedRegs;
3657       auto Unmerge = B.buildUnmerge(S16, Reg);
3658       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3659         PackedRegs.push_back(Unmerge.getReg(I));
3660       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3661       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3662       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3663     }
3664 
3665     if (StoreVT.getNumElements() == 4) {
3666       SmallVector<Register, 4> PackedRegs;
3667       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3668       auto Unmerge = B.buildUnmerge(S32, Reg);
3669       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3670         PackedRegs.push_back(Unmerge.getReg(I));
3671       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3672       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3673     }
3674 
3675     llvm_unreachable("invalid data type");
3676   }
3677 
3678   return Reg;
3679 }
3680 
3681 Register AMDGPULegalizerInfo::fixStoreSourceType(
3682   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3683   MachineRegisterInfo *MRI = B.getMRI();
3684   LLT Ty = MRI->getType(VData);
3685 
3686   const LLT S16 = LLT::scalar(16);
3687 
3688   // Fixup illegal register types for i8 stores.
3689   if (Ty == LLT::scalar(8) || Ty == S16) {
3690     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3691     return AnyExt;
3692   }
3693 
3694   if (Ty.isVector()) {
3695     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3696       if (IsFormat)
3697         return handleD16VData(B, *MRI, VData);
3698     }
3699   }
3700 
3701   return VData;
3702 }
3703 
3704 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3705                                               MachineRegisterInfo &MRI,
3706                                               MachineIRBuilder &B,
3707                                               bool IsTyped,
3708                                               bool IsFormat) const {
3709   Register VData = MI.getOperand(1).getReg();
3710   LLT Ty = MRI.getType(VData);
3711   LLT EltTy = Ty.getScalarType();
3712   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3713   const LLT S32 = LLT::scalar(32);
3714 
3715   VData = fixStoreSourceType(B, VData, IsFormat);
3716   Register RSrc = MI.getOperand(2).getReg();
3717 
3718   MachineMemOperand *MMO = *MI.memoperands_begin();
3719   const int MemSize = MMO->getSize();
3720 
3721   unsigned ImmOffset;
3722   unsigned TotalOffset;
3723 
3724   // The typed intrinsics add an immediate after the registers.
3725   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3726 
3727   // The struct intrinsic variants add one additional operand over raw.
3728   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3729   Register VIndex;
3730   int OpOffset = 0;
3731   if (HasVIndex) {
3732     VIndex = MI.getOperand(3).getReg();
3733     OpOffset = 1;
3734   }
3735 
3736   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3737   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3738 
3739   unsigned Format = 0;
3740   if (IsTyped) {
3741     Format = MI.getOperand(5 + OpOffset).getImm();
3742     ++OpOffset;
3743   }
3744 
3745   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3746 
3747   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3748   if (TotalOffset != 0)
3749     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3750 
3751   unsigned Opc;
3752   if (IsTyped) {
3753     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3754                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3755   } else if (IsFormat) {
3756     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3757                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3758   } else {
3759     switch (MemSize) {
3760     case 1:
3761       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3762       break;
3763     case 2:
3764       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3765       break;
3766     default:
3767       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3768       break;
3769     }
3770   }
3771 
3772   if (!VIndex)
3773     VIndex = B.buildConstant(S32, 0).getReg(0);
3774 
3775   auto MIB = B.buildInstr(Opc)
3776     .addUse(VData)              // vdata
3777     .addUse(RSrc)               // rsrc
3778     .addUse(VIndex)             // vindex
3779     .addUse(VOffset)            // voffset
3780     .addUse(SOffset)            // soffset
3781     .addImm(ImmOffset);         // offset(imm)
3782 
3783   if (IsTyped)
3784     MIB.addImm(Format);
3785 
3786   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3787      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3788      .addMemOperand(MMO);
3789 
3790   MI.eraseFromParent();
3791   return true;
3792 }
3793 
3794 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3795                                              MachineRegisterInfo &MRI,
3796                                              MachineIRBuilder &B,
3797                                              bool IsFormat,
3798                                              bool IsTyped) const {
3799   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3800   MachineMemOperand *MMO = *MI.memoperands_begin();
3801   const int MemSize = MMO->getSize();
3802   const LLT S32 = LLT::scalar(32);
3803 
3804   Register Dst = MI.getOperand(0).getReg();
3805   Register RSrc = MI.getOperand(2).getReg();
3806 
3807   // The typed intrinsics add an immediate after the registers.
3808   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3809 
3810   // The struct intrinsic variants add one additional operand over raw.
3811   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3812   Register VIndex;
3813   int OpOffset = 0;
3814   if (HasVIndex) {
3815     VIndex = MI.getOperand(3).getReg();
3816     OpOffset = 1;
3817   }
3818 
3819   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3820   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3821 
3822   unsigned Format = 0;
3823   if (IsTyped) {
3824     Format = MI.getOperand(5 + OpOffset).getImm();
3825     ++OpOffset;
3826   }
3827 
3828   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3829   unsigned ImmOffset;
3830   unsigned TotalOffset;
3831 
3832   LLT Ty = MRI.getType(Dst);
3833   LLT EltTy = Ty.getScalarType();
3834   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3835   const bool Unpacked = ST.hasUnpackedD16VMem();
3836 
3837   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3838   if (TotalOffset != 0)
3839     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3840 
3841   unsigned Opc;
3842 
3843   if (IsTyped) {
3844     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3845                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3846   } else if (IsFormat) {
3847     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3848                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3849   } else {
3850     switch (MemSize) {
3851     case 1:
3852       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3853       break;
3854     case 2:
3855       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3856       break;
3857     default:
3858       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3859       break;
3860     }
3861   }
3862 
3863   Register LoadDstReg;
3864 
3865   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3866   LLT UnpackedTy = Ty.changeElementSize(32);
3867 
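  // Sub-dword scalar results are loaded into a wider s32 register and
  // truncated afterwards; unpacked d16 vector results are loaded as s32
  // elements and repacked to the 16-bit result type below.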
3868   if (IsExtLoad)
3869     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3870   else if (Unpacked && IsD16 && Ty.isVector())
3871     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3872   else
3873     LoadDstReg = Dst;
3874 
3875   if (!VIndex)
3876     VIndex = B.buildConstant(S32, 0).getReg(0);
3877 
3878   auto MIB = B.buildInstr(Opc)
3879     .addDef(LoadDstReg)         // vdata
3880     .addUse(RSrc)               // rsrc
3881     .addUse(VIndex)             // vindex
3882     .addUse(VOffset)            // voffset
3883     .addUse(SOffset)            // soffset
3884     .addImm(ImmOffset);         // offset(imm)
3885 
3886   if (IsTyped)
3887     MIB.addImm(Format);
3888 
3889   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3890      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3891      .addMemOperand(MMO);
3892 
3893   if (LoadDstReg != Dst) {
3894     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3895 
3896     // The result was widened for an extending load; truncate it back down.
3897     if (IsExtLoad)
3898       B.buildTrunc(Dst, LoadDstReg);
3899     else {
3900       // Repack to original 16-bit vector result
3901       // FIXME: G_TRUNC should work, but legalization currently fails
3902       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3903       SmallVector<Register, 4> Repack;
3904       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3905         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3906       B.buildMerge(Dst, Repack);
3907     }
3908   }
3909 
3910   MI.eraseFromParent();
3911   return true;
3912 }
3913 
3914 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3915                                                MachineIRBuilder &B,
3916                                                bool IsInc) const {
3917   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3918                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3919   B.buildInstr(Opc)
3920     .addDef(MI.getOperand(0).getReg())
3921     .addUse(MI.getOperand(2).getReg())
3922     .addUse(MI.getOperand(3).getReg())
3923     .cloneMemRefs(MI);
3924   MI.eraseFromParent();
3925   return true;
3926 }
3927 
3928 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3929   switch (IntrID) {
3930   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3931   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3932     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3933   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3934   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3935     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3936   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3937   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3938     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3939   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3940   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3941     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3942   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3943   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3944     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3945   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3946   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3947     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3948   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3949   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3950     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3951   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3952   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3953     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3954   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3955   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3956     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3957   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3958   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3959     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3960   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3961   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3962     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3963   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3964   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3965     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3966   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3967   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3968     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3969   case Intrinsic::amdgcn_buffer_atomic_fadd:
3970   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3971   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3972     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3973   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3974   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3975     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3976   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3977   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3978     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3979   default:
3980     llvm_unreachable("unhandled atomic opcode");
3981   }
3982 }
3983 
3984 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3985                                                MachineIRBuilder &B,
3986                                                Intrinsic::ID IID) const {
3987   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3988                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3989   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3990 
3991   Register Dst;
3992 
3993   int OpOffset = 0;
3994   if (HasReturn) {
3995     // A few FP atomics do not support return values.
3996     Dst = MI.getOperand(0).getReg();
3997   } else {
3998     OpOffset = -1;
3999   }
4000 
4001   Register VData = MI.getOperand(2 + OpOffset).getReg();
4002   Register CmpVal;
4003 
4004   if (IsCmpSwap) {
4005     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4006     ++OpOffset;
4007   }
4008 
4009   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4010   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4011 
4012   // The struct intrinsic variants add one additional operand over raw.
4013   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4014   Register VIndex;
4015   if (HasVIndex) {
4016     VIndex = MI.getOperand(4 + OpOffset).getReg();
4017     ++OpOffset;
4018   }
4019 
4020   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4021   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4022   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4023 
4024   MachineMemOperand *MMO = *MI.memoperands_begin();
4025 
4026   unsigned ImmOffset;
4027   unsigned TotalOffset;
4028   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4029   if (TotalOffset != 0)
4030     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4031 
4032   if (!VIndex)
4033     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4034 
4035   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4036 
4037   if (HasReturn)
4038     MIB.addDef(Dst);
4039 
4040   MIB.addUse(VData); // vdata
4041 
4042   if (IsCmpSwap)
4043     MIB.addReg(CmpVal);
4044 
4045   MIB.addUse(RSrc)               // rsrc
4046      .addUse(VIndex)             // vindex
4047      .addUse(VOffset)            // voffset
4048      .addUse(SOffset)            // soffset
4049      .addImm(ImmOffset)          // offset(imm)
4050      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4051      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4052      .addMemOperand(MMO);
4053 
4054   MI.eraseFromParent();
4055   return true;
4056 }
4057 
4058 /// Turn the set of s16 typed address operands of \p MI into dword sized
4059 /// vectors with s16 typed elements, appending the results to \p PackedAddrs.
4060 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4061                                       SmallVectorImpl<Register> &PackedAddrs,
4062                                       unsigned ArgOffset,
4063                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4064                                       bool IsA16, bool IsG16) {
4065   const LLT S16 = LLT::scalar(16);
4066   const LLT V2S16 = LLT::vector(2, 16);
4067   auto EndIdx = Intr->VAddrEnd;
4068 
4069   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4070     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4071     if (!SrcOp.isReg())
4072       continue; // _L to _LZ may have eliminated this.
4073 
4074     Register AddrReg = SrcOp.getReg();
4075 
4076     if (I < Intr->GradientStart) {
4077       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4078       PackedAddrs.push_back(AddrReg);
4079     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4080                (I >= Intr->CoordStart && !IsA16)) {
4081       // Handle any gradient or coordinate operands that should not be packed
4082       PackedAddrs.push_back(AddrReg);
4083     } else {
4084       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4085       // derivatives dx/dh and dx/dv are packed with undef.
4086       if (((I + 1) >= EndIdx) ||
4087           ((Intr->NumGradients / 2) % 2 == 1 &&
4088            (I == static_cast<unsigned>(Intr->GradientStart +
4089                                        (Intr->NumGradients / 2) - 1) ||
4090             I == static_cast<unsigned>(Intr->GradientStart +
4091                                        Intr->NumGradients - 1))) ||
4092           // Check for _L to _LZ optimization
4093           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4094         PackedAddrs.push_back(
4095             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4096                 .getReg(0));
4097       } else {
4098         PackedAddrs.push_back(
4099             B.buildBuildVector(
4100                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4101                 .getReg(0));
4102         ++I;
4103       }
4104     }
4105   }
4106 }
4107 
4108 /// Convert from separate vaddr components to a single vector address register,
4109 /// and replace the remaining operands with $noreg.
4110 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4111                                      int DimIdx, int NumVAddrs) {
4112   const LLT S32 = LLT::scalar(32);
4113 
4114   SmallVector<Register, 8> AddrRegs;
4115   for (int I = 0; I != NumVAddrs; ++I) {
4116     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4117     if (SrcOp.isReg()) {
4118       AddrRegs.push_back(SrcOp.getReg());
4119       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4120     }
4121   }
4122 
4123   int NumAddrRegs = AddrRegs.size();
4124   if (NumAddrRegs != 1) {
4125     // Round up to 8 elements for v5-v7
4126     // FIXME: Missing intermediate sized register classes and instructions.
4127     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4128       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4129       auto Undef = B.buildUndef(S32);
4130       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4131       NumAddrRegs = RoundedNumRegs;
4132     }
4133 
4134     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4135     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4136   }
4137 
4138   for (int I = 1; I != NumVAddrs; ++I) {
4139     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4140     if (SrcOp.isReg())
4141       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4142   }
4143 }
4144 
4145 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4146 ///
4147 /// Depending on the subtarget, load/store with 16-bit element data need to be
4148 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4149 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4150 /// registers.
4151 ///
4152 /// We don't want to directly select image instructions just yet, but also want
4153 /// to expose all register repacking to the legalizer/combiners. We also don't
4154 /// want a selected instruction entering RegBankSelect. In order to avoid
4155 /// defining a multitude of intermediate image instructions, directly hack on
4156 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4157 /// now unnecessary arguments with $noreg.
4158 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4159     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4160     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4161 
4162   const unsigned NumDefs = MI.getNumExplicitDefs();
4163   const unsigned ArgOffset = NumDefs + 1;
4164   bool IsTFE = NumDefs == 2;
4165   // We are only processing the operands of d16 image operations on subtargets
4166   // that use the unpacked register layout, or need to repack the TFE result.
4167 
4168   // TODO: Do we need to guard against already legalized intrinsics?
4169   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4170       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4171 
4172   MachineRegisterInfo *MRI = B.getMRI();
4173   const LLT S32 = LLT::scalar(32);
4174   const LLT S16 = LLT::scalar(16);
4175   const LLT V2S16 = LLT::vector(2, 16);
4176 
4177   unsigned DMask = 0;
4178 
4179   // Check for 16 bit addresses and pack if true.
4180   LLT GradTy =
4181       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4182   LLT AddrTy =
4183       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4184   const bool IsG16 = GradTy == S16;
4185   const bool IsA16 = AddrTy == S16;
4186 
4187   int DMaskLanes = 0;
4188   if (!BaseOpcode->Atomic) {
4189     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4190     if (BaseOpcode->Gather4) {
4191       DMaskLanes = 4;
4192     } else if (DMask != 0) {
4193       DMaskLanes = countPopulation(DMask);
4194     } else if (!IsTFE && !BaseOpcode->Store) {
4195       // If dmask is 0, this is a no-op load. This can be eliminated.
4196       B.buildUndef(MI.getOperand(0));
4197       MI.eraseFromParent();
4198       return true;
4199     }
4200   }
4201 
4202   Observer.changingInstr(MI);
4203   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4204 
4205   unsigned NewOpcode = NumDefs == 0 ?
4206     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4207 
4208   // Track that we legalized this
4209   MI.setDesc(B.getTII().get(NewOpcode));
4210 
4211   // TFE is enabled but the dmask is 0, so we still expect an error flag.
4212   // Force the dmask to be at least 1; otherwise the instruction will fail.
4213   if (IsTFE && DMask == 0) {
4214     DMask = 0x1;
4215     DMaskLanes = 1;
4216     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4217   }
4218 
4219   if (BaseOpcode->Atomic) {
4220     Register VData0 = MI.getOperand(2).getReg();
4221     LLT Ty = MRI->getType(VData0);
4222 
4223     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4224     if (Ty.isVector())
4225       return false;
4226 
4227     if (BaseOpcode->AtomicX2) {
4228       Register VData1 = MI.getOperand(3).getReg();
4229       // The two values are packed in one register.
4230       LLT PackedTy = LLT::vector(2, Ty);
4231       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4232       MI.getOperand(2).setReg(Concat.getReg(0));
4233       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4234     }
4235   }
4236 
4237   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4238 
4239   // Optimize _L to _LZ when _L is zero
4240   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4241           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4242     const ConstantFP *ConstantLod;
4243 
4244     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4245                  m_GFCst(ConstantLod))) {
4246       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4247         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4248         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4249             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4250                                                       Intr->Dim);
4251 
4252         // The starting indexes should remain in the same place.
4253         --CorrectedNumVAddrs;
4254 
4255         MI.getOperand(MI.getNumExplicitDefs())
4256             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4257         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4258         Intr = NewImageDimIntr;
4259       }
4260     }
4261   }
4262 
4263   // Optimize _mip away, when 'lod' is zero
4264   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4265     int64_t ConstantLod;
4266     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4267                  m_ICst(ConstantLod))) {
4268       if (ConstantLod == 0) {
4269         // TODO: Change intrinsic opcode and remove operand instead of replacing
4270         // it with 0, as the _L to _LZ handling is done above.
4271         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4272         --CorrectedNumVAddrs;
4273       }
4274     }
4275   }
4276 
4277   // Rewrite the addressing register layout before doing anything else.
4278   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4279     // 16 bit gradients are supported, but are tied to the A16 control
4280     // so both gradients and addresses must be 16 bit
4281     return false;
4282   }
4283 
4284   if (IsA16 && !ST.hasA16()) {
4285     // A16 not supported
4286     return false;
4287   }
4288 
4289   if (IsA16 || IsG16) {
4290     if (Intr->NumVAddrs > 1) {
4291       SmallVector<Register, 4> PackedRegs;
4292 
4293       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4294                                 IsG16);
4295 
4296       // See also the corresponding check below in the non-A16 branch.
4297       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4298 
4299       if (!UseNSA && PackedRegs.size() > 1) {
4300         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4301         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4302         PackedRegs[0] = Concat.getReg(0);
4303         PackedRegs.resize(1);
4304       }
4305 
4306       const unsigned NumPacked = PackedRegs.size();
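           // Rewrite the vaddr operands in place: the first NumPacked operands
           // take the packed registers, and any remaining register operands are
           // cleared to NoRegister so they are effectively dropped.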
4307       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4308         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4309         if (!SrcOp.isReg()) {
4310           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4311           continue;
4312         }
4313 
4314         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4315 
4316         if (I - Intr->VAddrStart < NumPacked)
4317           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4318         else
4319           SrcOp.setReg(AMDGPU::NoRegister);
4320       }
4321     }
4322   } else {
4323     // If the register allocator cannot place the address registers contiguously
4324     // without introducing moves, then using the non-sequential address encoding
4325     // is always preferable, since it saves VALU instructions and is usually a
4326     // wash in terms of code size or even better.
4327     //
4328     // However, we currently have no way of hinting to the register allocator
4329     // that MIMG addresses should be placed contiguously when it is possible to
4330     // do so, so force non-NSA for the common 2-address case as a heuristic.
4331     //
4332     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4333     // allocation when possible.
4334     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4335 
4336     if (!UseNSA && Intr->NumVAddrs > 1)
4337       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4338                                Intr->NumVAddrs);
4339   }
4340 
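       // Append an immediate flag operand recording the 16-bit address modes:
       // bit 0 is set for 16-bit addresses (A16), bit 1 for 16-bit gradients (G16).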
4341   int Flags = 0;
4342   if (IsA16)
4343     Flags |= 1;
4344   if (IsG16)
4345     Flags |= 2;
4346   MI.addOperand(MachineOperand::CreateImm(Flags));
4347 
4348   if (BaseOpcode->Store) { // No TFE for stores?
4349     // TODO: Handle dmask trim
4350     Register VData = MI.getOperand(1).getReg();
4351     LLT Ty = MRI->getType(VData);
4352     if (!Ty.isVector() || Ty.getElementType() != S16)
4353       return true;
4354 
4355     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4356     if (RepackedReg != VData) {
4357       MI.getOperand(1).setReg(RepackedReg);
4358     }
4359 
4360     return true;
4361   }
4362 
4363   Register DstReg = MI.getOperand(0).getReg();
4364   LLT Ty = MRI->getType(DstReg);
4365   const LLT EltTy = Ty.getScalarType();
4366   const bool IsD16 = Ty.getScalarType() == S16;
4367   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4368 
4369   // Confirm that the return type is large enough for the dmask specified
4370   if (NumElts < DMaskLanes)
4371     return false;
4372 
4373   if (NumElts > 4 || DMaskLanes > 4)
4374     return false;
4375 
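       // A dmask of 0 is treated as requesting a single data component.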
4376   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4377   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4378 
4379   // The raw, dword-aligned data component of the load. The only legal cases
4380   // where this matters should be when using the packed D16 format, for
4381   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4382   LLT RoundedTy;
4383 
4384   // S32 vector to cover all data, plus the TFE result element.
4385   LLT TFETy;
4386 
4387   // Register type to use for each loaded component. Will be S32 or V2S16.
4388   LLT RegTy;
4389 
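       // With unpacked D16, each 16-bit component occupies a full dword in the
       // result, so 32-bit element types are used for that case.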
4390   if (IsD16 && ST.hasUnpackedD16VMem()) {
4391     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4392     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4393     RegTy = S32;
4394   } else {
4395     unsigned EltSize = EltTy.getSizeInBits();
4396     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4397     unsigned RoundedSize = 32 * RoundedElts;
4398     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4399     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4400     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4401   }
4402 
4403   // The return type does not need adjustment.
4404   // TODO: Should we change s16 case to s32 or <2 x s16>?
4405   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4406     return true;
4407 
4408   Register Dst1Reg;
4409 
4410   // Insert after the instruction.
4411   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4412 
4413   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4414   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4415   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4416   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4417 
4418   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4419 
4420   MI.getOperand(0).setReg(NewResultReg);
4421 
4422   // In the IR, TFE is supposed to be used with a two-element struct return
4423   // type. The instruction really returns these two values in one contiguous
4424   // register, with one additional dword beyond the loaded data. Rewrite the
4425   // return type to use a single register result.
4426 
4427   if (IsTFE) {
4428     Dst1Reg = MI.getOperand(1).getReg();
4429     if (MRI->getType(Dst1Reg) != S32)
4430       return false;
4431 
4432     // TODO: Make sure the TFE operand bit is set.
4433     MI.RemoveOperand(1);
4434 
4435     // Handle the easy case that requires no repack instructions.
4436     if (Ty == S32) {
4437       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4438       return true;
4439     }
4440   }
4441 
4442   // Now figure out how to copy the new result register back into the old
4443   // result.
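       // Seed every slot with Dst1Reg so that, in the TFE case, the unmerge below
       // writes the status dword directly into its final destination.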
4444   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4445 
4446   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4447 
4448   if (ResultNumRegs == 1) {
4449     assert(!IsTFE);
4450     ResultRegs[0] = NewResultReg;
4451   } else {
4452     // We have to repack into a new vector of some kind.
4453     for (int I = 0; I != NumDataRegs; ++I)
4454       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4455     B.buildUnmerge(ResultRegs, NewResultReg);
4456 
4457     // Drop the final TFE element to get the data part. The TFE result is
4458     // directly written to the right place already.
4459     if (IsTFE)
4460       ResultRegs.resize(NumDataRegs);
4461   }
4462 
4463   // For an s16 scalar result, we form an s32 result with a truncate regardless
4464   // of packed vs. unpacked.
4465   if (IsD16 && !Ty.isVector()) {
4466     B.buildTrunc(DstReg, ResultRegs[0]);
4467     return true;
4468   }
4469 
4470   // Avoid a build/concat_vector of 1 entry.
4471   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4472     B.buildBitcast(DstReg, ResultRegs[0]);
4473     return true;
4474   }
4475 
4476   assert(Ty.isVector());
4477 
4478   if (IsD16) {
4479     // For packed D16 results with TFE enabled, all the data components are
4480     // S32. Cast back to the expected type.
4481     //
4482     // TODO: We don't really need to load s32 elements. We would only need one
4483     // cast for the TFE result if a multiple of v2s16 were used.
4484     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4485       for (Register &Reg : ResultRegs)
4486         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4487     } else if (ST.hasUnpackedD16VMem()) {
4488       for (Register &Reg : ResultRegs)
4489         Reg = B.buildTrunc(S16, Reg).getReg(0);
4490     }
4491   }
4492 
4493   auto padWithUndef = [&](LLT Ty, int NumElts) {
4494     if (NumElts == 0)
4495       return;
4496     Register Undef = B.buildUndef(Ty).getReg(0);
4497     for (int I = 0; I != NumElts; ++I)
4498       ResultRegs.push_back(Undef);
4499   };
4500 
4501   // Pad out any elements eliminated due to the dmask.
4502   LLT ResTy = MRI->getType(ResultRegs[0]);
4503   if (!ResTy.isVector()) {
4504     padWithUndef(ResTy, NumElts - ResultRegs.size());
4505     B.buildBuildVector(DstReg, ResultRegs);
4506     return true;
4507   }
4508 
4509   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4510   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4511 
4512   // Deal with the one annoying legal case.
4513   const LLT V3S16 = LLT::vector(3, 16);
4514   if (Ty == V3S16) {
4515     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4516     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4517     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4518     return true;
4519   }
4520 
4521   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4522   B.buildConcatVectors(DstReg, ResultRegs);
4523   return true;
4524 }
4525 
4526 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4527   LegalizerHelper &Helper, MachineInstr &MI) const {
4528   MachineIRBuilder &B = Helper.MIRBuilder;
4529   GISelChangeObserver &Observer = Helper.Observer;
4530 
4531   Register Dst = MI.getOperand(0).getReg();
4532   LLT Ty = B.getMRI()->getType(Dst);
4533   unsigned Size = Ty.getSizeInBits();
4534   MachineFunction &MF = B.getMF();
4535 
4536   Observer.changingInstr(MI);
4537 
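       // Some result types are lowered by first bitcasting the destination to a
       // more convenient register type; the load is then legalized on that type.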
4538   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4539     Ty = getBitcastRegisterType(Ty);
4540     Helper.bitcastDst(MI, Ty, 0);
4541     Dst = MI.getOperand(0).getReg();
4542     B.setInsertPt(B.getMBB(), MI);
4543   }
4544 
4545   // FIXME: We don't really need this intermediate instruction. The intrinsic
4546   // should be fixed to have a memory operand. Since it's readnone, we're not
4547   // allowed to add one.
4548   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4549   MI.RemoveOperand(1); // Remove intrinsic ID
4550 
4551   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4552   // TODO: Should this use datalayout alignment?
4553   const unsigned MemSize = (Size + 7) / 8;
4554   const Align MemAlign(4);
4555   MachineMemOperand *MMO = MF.getMachineMemOperand(
4556       MachinePointerInfo(),
4557       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4558           MachineMemOperand::MOInvariant,
4559       MemSize, MemAlign);
4560   MI.addMemOperand(MF, MMO);
4561 
4562   // There are no 96-bit result scalar loads, but widening to 128-bit should
4563   // always be legal. We may need to restore this to a 96-bit result if it turns
4564   // out this needs to be converted to a vector load during RegBankSelect.
4565   if (!isPowerOf2_32(Size)) {
4566     if (Ty.isVector())
4567       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4568     else
4569       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4570   }
4571 
4572   Observer.changedInstr(MI);
4573   return true;
4574 }
4575 
4576 // TODO: Move to selection
4577 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4578                                                 MachineRegisterInfo &MRI,
4579                                                 MachineIRBuilder &B) const {
4580   if (!ST.isTrapHandlerEnabled() ||
4581       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4582     return legalizeTrapEndpgm(MI, MRI, B);
4583 
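       // For HSA ABI V2/V3 the trap handler always needs the queue pointer; for
       // V4 it is only needed if the subtarget cannot provide the doorbell ID.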
4584   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4585     switch (*HsaAbiVer) {
4586     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4587     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4588       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4589     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4590       return ST.supportsGetDoorbellID() ?
4591           legalizeTrapHsa(MI, MRI, B) :
4592           legalizeTrapHsaQueuePtr(MI, MRI, B);
4593     }
4594   }
4595 
4596   llvm_unreachable("Unknown trap handler");
4597 }
4598 
4599 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4600     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4601   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4602   MI.eraseFromParent();
4603   return true;
4604 }
4605 
4606 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4607     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4608   // Pass the queue pointer to the trap handler as input, and insert the trap instruction.
4609   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4610   Register LiveIn =
4611     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4612   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4613     return false;
4614 
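       // The trap handler ABI expects the queue pointer in SGPR0_SGPR1.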
4615   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4616   B.buildCopy(SGPR01, LiveIn);
4617   B.buildInstr(AMDGPU::S_TRAP)
4618       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4619       .addReg(SGPR01, RegState::Implicit);
4620 
4621   MI.eraseFromParent();
4622   return true;
4623 }
4624 
4625 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4626     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4627   B.buildInstr(AMDGPU::S_TRAP)
4628       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4629   MI.eraseFromParent();
4630   return true;
4631 }
4632 
4633 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4634     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4635   // If this is a non-HSA path or the trap handler is disabled, report a
4636   // warning accordingly.
4637   if (!ST.isTrapHandlerEnabled() ||
4638       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4639     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4640                                      "debugtrap handler not supported",
4641                                      MI.getDebugLoc(), DS_Warning);
4642     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4643     Ctx.diagnose(NoTrap);
4644   } else {
4645     // Insert the debug-trap instruction.
4646     B.buildInstr(AMDGPU::S_TRAP)
4647         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4648   }
4649 
4650   MI.eraseFromParent();
4651   return true;
4652 }
4653 
4654 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4655                                                MachineIRBuilder &B) const {
4656   MachineRegisterInfo &MRI = *B.getMRI();
4657   const LLT S16 = LLT::scalar(16);
4658   const LLT S32 = LLT::scalar(32);
4659 
4660   Register DstReg = MI.getOperand(0).getReg();
4661   Register NodePtr = MI.getOperand(2).getReg();
4662   Register RayExtent = MI.getOperand(3).getReg();
4663   Register RayOrigin = MI.getOperand(4).getReg();
4664   Register RayDir = MI.getOperand(5).getReg();
4665   Register RayInvDir = MI.getOperand(6).getReg();
4666   Register TDescr = MI.getOperand(7).getReg();
4667 
4668   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4669   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4670   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4671                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4672                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4673                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4674 
4675   SmallVector<Register, 12> Ops;
4676   if (Is64) {
4677     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4678     Ops.push_back(Unmerge.getReg(0));
4679     Ops.push_back(Unmerge.getReg(1));
4680   } else {
4681     Ops.push_back(NodePtr);
4682   }
4683   Ops.push_back(RayExtent);
4684 
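       // The ray origin, direction, and inverse direction operands are 4-element
       // vectors, but only the first three lanes of each are passed to the
       // intersect instruction.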
4685   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4686     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4687     Ops.push_back(Unmerge.getReg(0));
4688     Ops.push_back(Unmerge.getReg(1));
4689     Ops.push_back(Unmerge.getReg(2));
4690   };
4691 
4692   packLanes(RayOrigin);
4693   if (IsA16) {
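         // Pack the six 16-bit direction/inverse-direction lanes into three
         // dwords: (dir.x, dir.y), (dir.z, inv_dir.x) and (inv_dir.y, inv_dir.z).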
4694     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4695     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4696     Register R1 = MRI.createGenericVirtualRegister(S32);
4697     Register R2 = MRI.createGenericVirtualRegister(S32);
4698     Register R3 = MRI.createGenericVirtualRegister(S32);
4699     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4700     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4701     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4702     Ops.push_back(R1);
4703     Ops.push_back(R2);
4704     Ops.push_back(R3);
4705   } else {
4706     packLanes(RayDir);
4707     packLanes(RayInvDir);
4708   }
4709 
4710   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4711     .addDef(DstReg)
4712     .addImm(Opcode);
4713 
4714   for (Register R : Ops) {
4715     MIB.addUse(R);
4716   }
4717 
4718   MIB.addUse(TDescr)
4719      .addImm(IsA16 ? 1 : 0)
4720      .cloneMemRefs(MI);
4721 
4722   MI.eraseFromParent();
4723   return true;
4724 }
4725 
4726 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4727                                             MachineInstr &MI) const {
4728   MachineIRBuilder &B = Helper.MIRBuilder;
4729   MachineRegisterInfo &MRI = *B.getMRI();
4730 
4731   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4732   auto IntrID = MI.getIntrinsicID();
4733   switch (IntrID) {
4734   case Intrinsic::amdgcn_if:
4735   case Intrinsic::amdgcn_else: {
4736     MachineInstr *Br = nullptr;
4737     MachineBasicBlock *UncondBrTarget = nullptr;
4738     bool Negated = false;
4739     if (MachineInstr *BrCond =
4740             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4741       const SIRegisterInfo *TRI
4742         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4743 
4744       Register Def = MI.getOperand(1).getReg();
4745       Register Use = MI.getOperand(3).getReg();
4746 
4747       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4748 
4749       if (Negated)
4750         std::swap(CondBrTarget, UncondBrTarget);
4751 
4752       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4753       if (IntrID == Intrinsic::amdgcn_if) {
4754         B.buildInstr(AMDGPU::SI_IF)
4755           .addDef(Def)
4756           .addUse(Use)
4757           .addMBB(UncondBrTarget);
4758       } else {
4759         B.buildInstr(AMDGPU::SI_ELSE)
4760             .addDef(Def)
4761             .addUse(Use)
4762             .addMBB(UncondBrTarget);
4763       }
4764 
4765       if (Br) {
4766         Br->getOperand(0).setMBB(CondBrTarget);
4767       } else {
4768         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4769         // since we're swapping branch targets it needs to be reinserted.
4770         // FIXME: IRTranslator should probably not do this
4771         B.buildBr(*CondBrTarget);
4772       }
4773 
4774       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4775       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4776       MI.eraseFromParent();
4777       BrCond->eraseFromParent();
4778       return true;
4779     }
4780 
4781     return false;
4782   }
4783   case Intrinsic::amdgcn_loop: {
4784     MachineInstr *Br = nullptr;
4785     MachineBasicBlock *UncondBrTarget = nullptr;
4786     bool Negated = false;
4787     if (MachineInstr *BrCond =
4788             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4789       const SIRegisterInfo *TRI
4790         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4791 
4792       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4793       Register Reg = MI.getOperand(2).getReg();
4794 
4795       if (Negated)
4796         std::swap(CondBrTarget, UncondBrTarget);
4797 
4798       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4799       B.buildInstr(AMDGPU::SI_LOOP)
4800         .addUse(Reg)
4801         .addMBB(UncondBrTarget);
4802 
4803       if (Br)
4804         Br->getOperand(0).setMBB(CondBrTarget);
4805       else
4806         B.buildBr(*CondBrTarget);
4807 
4808       MI.eraseFromParent();
4809       BrCond->eraseFromParent();
4810       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4811       return true;
4812     }
4813 
4814     return false;
4815   }
4816   case Intrinsic::amdgcn_kernarg_segment_ptr:
4817     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4818       // This only makes sense to call in a kernel, so just lower to null.
4819       B.buildConstant(MI.getOperand(0).getReg(), 0);
4820       MI.eraseFromParent();
4821       return true;
4822     }
4823 
4824     return legalizePreloadedArgIntrin(
4825       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4826   case Intrinsic::amdgcn_implicitarg_ptr:
4827     return legalizeImplicitArgPtr(MI, MRI, B);
4828   case Intrinsic::amdgcn_workitem_id_x:
4829     return legalizePreloadedArgIntrin(MI, MRI, B,
4830                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4831   case Intrinsic::amdgcn_workitem_id_y:
4832     return legalizePreloadedArgIntrin(MI, MRI, B,
4833                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4834   case Intrinsic::amdgcn_workitem_id_z:
4835     return legalizePreloadedArgIntrin(MI, MRI, B,
4836                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4837   case Intrinsic::amdgcn_workgroup_id_x:
4838     return legalizePreloadedArgIntrin(MI, MRI, B,
4839                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4840   case Intrinsic::amdgcn_workgroup_id_y:
4841     return legalizePreloadedArgIntrin(MI, MRI, B,
4842                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4843   case Intrinsic::amdgcn_workgroup_id_z:
4844     return legalizePreloadedArgIntrin(MI, MRI, B,
4845                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4846   case Intrinsic::amdgcn_dispatch_ptr:
4847     return legalizePreloadedArgIntrin(MI, MRI, B,
4848                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4849   case Intrinsic::amdgcn_queue_ptr:
4850     return legalizePreloadedArgIntrin(MI, MRI, B,
4851                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4852   case Intrinsic::amdgcn_implicit_buffer_ptr:
4853     return legalizePreloadedArgIntrin(
4854       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4855   case Intrinsic::amdgcn_dispatch_id:
4856     return legalizePreloadedArgIntrin(MI, MRI, B,
4857                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4858   case Intrinsic::amdgcn_fdiv_fast:
4859     return legalizeFDIVFastIntrin(MI, MRI, B);
4860   case Intrinsic::amdgcn_is_shared:
4861     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4862   case Intrinsic::amdgcn_is_private:
4863     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4864   case Intrinsic::amdgcn_wavefrontsize: {
4865     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4866     MI.eraseFromParent();
4867     return true;
4868   }
4869   case Intrinsic::amdgcn_s_buffer_load:
4870     return legalizeSBufferLoad(Helper, MI);
4871   case Intrinsic::amdgcn_raw_buffer_store:
4872   case Intrinsic::amdgcn_struct_buffer_store:
4873     return legalizeBufferStore(MI, MRI, B, false, false);
4874   case Intrinsic::amdgcn_raw_buffer_store_format:
4875   case Intrinsic::amdgcn_struct_buffer_store_format:
4876     return legalizeBufferStore(MI, MRI, B, false, true);
4877   case Intrinsic::amdgcn_raw_tbuffer_store:
4878   case Intrinsic::amdgcn_struct_tbuffer_store:
4879     return legalizeBufferStore(MI, MRI, B, true, true);
4880   case Intrinsic::amdgcn_raw_buffer_load:
4881   case Intrinsic::amdgcn_struct_buffer_load:
4882     return legalizeBufferLoad(MI, MRI, B, false, false);
4883   case Intrinsic::amdgcn_raw_buffer_load_format:
4884   case Intrinsic::amdgcn_struct_buffer_load_format:
4885     return legalizeBufferLoad(MI, MRI, B, true, false);
4886   case Intrinsic::amdgcn_raw_tbuffer_load:
4887   case Intrinsic::amdgcn_struct_tbuffer_load:
4888     return legalizeBufferLoad(MI, MRI, B, true, true);
4889   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4890   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4891   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4892   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4893   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4894   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4895   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4896   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4897   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4898   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4899   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4900   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4901   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4902   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4903   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4904   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4905   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4906   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4907   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4908   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4909   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4910   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4911   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4912   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4913   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4914   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4915   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4916   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4917   case Intrinsic::amdgcn_buffer_atomic_fadd:
4918   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4919   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4920   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4921   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4922     return legalizeBufferAtomic(MI, B, IntrID);
4923   case Intrinsic::amdgcn_atomic_inc:
4924     return legalizeAtomicIncDec(MI, B, true);
4925   case Intrinsic::amdgcn_atomic_dec:
4926     return legalizeAtomicIncDec(MI, B, false);
4927   case Intrinsic::trap:
4928     return legalizeTrapIntrinsic(MI, MRI, B);
4929   case Intrinsic::debugtrap:
4930     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4931   case Intrinsic::amdgcn_rsq_clamp:
4932     return legalizeRsqClampIntrinsic(MI, MRI, B);
4933   case Intrinsic::amdgcn_ds_fadd:
4934   case Intrinsic::amdgcn_ds_fmin:
4935   case Intrinsic::amdgcn_ds_fmax:
4936     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4937   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4938     return legalizeBVHIntrinsic(MI, B);
4939   default: {
4940     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4941             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4942       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4943     return true;
4944   }
4945   }
4946 
4947   return true;
4948 }
4949