1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
54 
// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
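// Return true if the total size of the type at TypeIdx is a multiple of 32
// bits.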
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
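// Match vectors of 16-bit elements wider than <2 x s16>.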
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
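// Mutation that pads the vector at TypeIdx with one extra element, e.g.
// <3 x s16> -> <4 x s16>.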
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
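// Break the vector at TypeIdx into roughly 64-bit pieces by reducing the
// element count.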
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
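// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits), and <5 x s16>
// (80 bits) becomes <6 x s16> (96 bits).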
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
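// Compute the type a value should be bitcast to so that it maps onto whole
// registers: a single scalar up to 32 bits, otherwise 32-bit elements.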
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
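  // Use as many 32-bit registers as fit, e.g. <8 x s8> -> <2 x s32>.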
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
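// Element types that map cleanly onto registers: 16-bit (packed in pairs) or
// any multiple of 32 bits.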
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
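// Vector types that can live directly in registers; 16-bit elements must come
// in even counts so they pack into 32-bit registers.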
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
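// Return true if the vector's element type is legal as a scalar on its own
// (s16 or at least 32 bits), so scalarizing it produces legal scalars.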
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 // If we have a truncating store or an extending load with a data size larger
226 // than 32-bits, we need to reduce to a 32-bit type.
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // Do not handle extending vector loads.
280   if (Ty.isVector() && MemSize != RegSize)
281     return false;
282 
283   // TODO: We should be able to widen loads if the alignment is high enough, but
284   // we also need to modify the memory access size.
285 #if 0
286   // Accept widening loads based on alignment.
287   if (IsLoad && MemSize < Size)
288     MemSize = std::max(MemSize, Align);
289 #endif
290 
291   // Only 1-byte and 2-byte to 32-bit extloads are valid.
292   if (MemSize != RegSize && RegSize != 32)
293     return false;
294 
295   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296     return false;
297 
298   switch (MemSize) {
299   case 8:
300   case 16:
301   case 32:
302   case 64:
303   case 128:
304     break;
305   case 96:
306     if (!ST.hasDwordx3LoadStores())
307       return false;
308     break;
309   case 256:
310   case 512:
311     // These may contextually need to be broken down.
312     break;
313   default:
314     return false;
315   }
316 
317   assert(RegSize >= MemSize);
318 
319   if (AlignBits < MemSize) {
320     const SITargetLowering *TLI = ST.getTargetLowering();
321     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322                                                  Align(AlignBits / 8)))
323       return false;
324   }
325 
326   return true;
327 }
328 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334   if (EnableNewLegality)
335     return false;
336 
337   const unsigned Size = Ty.getSizeInBits();
338   if (Size <= 64)
339     return false;
340   if (!Ty.isVector())
341     return true;
342 
343   LLT EltTy = Ty.getElementType();
344   if (EltTy.isPointer())
345     return true;
346 
347   unsigned EltSize = EltTy.getSizeInBits();
348   return EltSize != 32 && EltSize != 64;
349 }
350 
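// A load/store is legal as-is when the result is a register type, the size is
// supported for the address space, and it isn't hit by the bitcast workaround.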
351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352   const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360                                        const unsigned MemSizeInBits) {
361   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375                             unsigned AlignInBits, unsigned AddrSpace,
376                             unsigned Opcode) {
377   // We don't want to widen cases that are naturally legal.
378   if (isPowerOf2_32(SizeInBits))
379     return false;
380 
381   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382   // end up widening these for a scalar load during RegBankSelect, since there
383   // aren't 96-bit scalar loads.
384   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385     return false;
386 
387   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388     return false;
389 
390   // A load is known dereferenceable up to the alignment, so it's legal to widen
391   // to it.
392   //
393   // TODO: Could check dereferenceable for less aligned cases.
394   unsigned RoundedSize = NextPowerOf2(SizeInBits);
395   if (AlignInBits < RoundedSize)
396     return false;
397 
398   // Do not widen if it would introduce a slow unaligned load.
399   const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402              RoundedSize, AddrSpace, Align(AlignInBits / 8),
403              MachineMemOperand::MOLoad, &Fast) &&
404          Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408                             unsigned Opcode) {
409   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410     return false;
411 
412   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413                          Query.MMODescrs[0].AlignInBits,
414                          Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
  : ST(ST_) {
420   using namespace TargetOpcode;
421 
422   auto GetAddrSpacePtr = [&TM](unsigned AS) {
423     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424   };
425 
426   const LLT S1 = LLT::scalar(1);
427   const LLT S8 = LLT::scalar(8);
428   const LLT S16 = LLT::scalar(16);
429   const LLT S32 = LLT::scalar(32);
430   const LLT S64 = LLT::scalar(64);
431   const LLT S128 = LLT::scalar(128);
432   const LLT S256 = LLT::scalar(256);
433   const LLT S512 = LLT::scalar(512);
434   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436   const LLT V2S8 = LLT::vector(2, 8);
437   const LLT V2S16 = LLT::vector(2, 16);
438   const LLT V4S16 = LLT::vector(4, 16);
439 
440   const LLT V2S32 = LLT::vector(2, 32);
441   const LLT V3S32 = LLT::vector(3, 32);
442   const LLT V4S32 = LLT::vector(4, 32);
443   const LLT V5S32 = LLT::vector(5, 32);
444   const LLT V6S32 = LLT::vector(6, 32);
445   const LLT V7S32 = LLT::vector(7, 32);
446   const LLT V8S32 = LLT::vector(8, 32);
447   const LLT V9S32 = LLT::vector(9, 32);
448   const LLT V10S32 = LLT::vector(10, 32);
449   const LLT V11S32 = LLT::vector(11, 32);
450   const LLT V12S32 = LLT::vector(12, 32);
451   const LLT V13S32 = LLT::vector(13, 32);
452   const LLT V14S32 = LLT::vector(14, 32);
453   const LLT V15S32 = LLT::vector(15, 32);
454   const LLT V16S32 = LLT::vector(16, 32);
455   const LLT V32S32 = LLT::vector(32, 32);
456 
457   const LLT V2S64 = LLT::vector(2, 64);
458   const LLT V3S64 = LLT::vector(3, 64);
459   const LLT V4S64 = LLT::vector(4, 64);
460   const LLT V5S64 = LLT::vector(5, 64);
461   const LLT V6S64 = LLT::vector(6, 64);
462   const LLT V7S64 = LLT::vector(7, 64);
463   const LLT V8S64 = LLT::vector(8, 64);
464   const LLT V16S64 = LLT::vector(16, 64);
465 
466   std::initializer_list<LLT> AllS32Vectors =
467     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469   std::initializer_list<LLT> AllS64Vectors =
470     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
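  // Code (block) addresses use the flat pointer width.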
480   const LLT CodePtr = FlatPtr;
481 
482   const std::initializer_list<LLT> AddrSpaces64 = {
483     GlobalPtr, ConstantPtr, FlatPtr
484   };
485 
486   const std::initializer_list<LLT> AddrSpaces32 = {
487     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488   };
489 
490   const std::initializer_list<LLT> FPTypesBase = {
491     S32, S64
492   };
493 
494   const std::initializer_list<LLT> FPTypes16 = {
495     S32, S64, S16
496   };
497 
498   const std::initializer_list<LLT> FPTypesPK16 = {
499     S32, S64, S16, V2S16
500   };
501 
502   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504   setAction({G_BRCOND, S1}, Legal); // VCC branches
505   setAction({G_BRCOND, S32}, Legal); // SCC branches
506 
507   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
508   // elements for v3s16
509   getActionDefinitionsBuilder(G_PHI)
510     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
511     .legalFor(AllS32Vectors)
512     .legalFor(AllS64Vectors)
513     .legalFor(AddrSpaces64)
514     .legalFor(AddrSpaces32)
515     .legalIf(isPointer(0))
516     .clampScalar(0, S16, S256)
517     .widenScalarToNextPow2(0, 32)
518     .clampMaxNumElements(0, S32, 16)
519     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
520     .scalarize(0);
521 
522   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
523     // Full set of gfx9 features.
524     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
525       .legalFor({S32, S16, V2S16})
526       .clampScalar(0, S16, S32)
527       .clampMaxNumElements(0, S16, 2)
528       .scalarize(0)
529       .widenScalarToNextPow2(0, 32);
530 
531     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
532       .legalFor({S32, S16, V2S16}) // Clamp modifier
533       .minScalarOrElt(0, S16)
534       .clampMaxNumElements(0, S16, 2)
535       .scalarize(0)
536       .widenScalarToNextPow2(0, 32)
537       .lower();
538   } else if (ST.has16BitInsts()) {
539     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
540       .legalFor({S32, S16})
541       .clampScalar(0, S16, S32)
542       .scalarize(0)
543       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
544 
545     // Technically the saturating operations require clamp bit support, but this
546     // was introduced at the same time as 16-bit operations.
547     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
548       .legalFor({S32, S16}) // Clamp modifier
549       .minScalar(0, S16)
550       .scalarize(0)
551       .widenScalarToNextPow2(0, 16)
552       .lower();
553 
554     // We're just lowering this, but it helps get a better result to try to
555     // coerce to the desired type first.
556     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
557       .minScalar(0, S16)
558       .scalarize(0)
559       .lower();
560   } else {
561     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
562       .legalFor({S32})
563       .clampScalar(0, S32, S32)
564       .scalarize(0);
565 
566     if (ST.hasIntClamp()) {
567       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
568         .legalFor({S32}) // Clamp modifier.
569         .scalarize(0)
570         .minScalarOrElt(0, S32)
571         .lower();
572     } else {
573       // Clamp bit support was added in VI, along with 16-bit operations.
574       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
575         .minScalar(0, S32)
576         .scalarize(0)
577         .lower();
578     }
579 
580     // FIXME: DAG expansion gets better results. The widening uses the smaller
581     // range values and goes for the min/max lowering directly.
582     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
583       .minScalar(0, S32)
584       .scalarize(0)
585       .lower();
586   }
587 
588   getActionDefinitionsBuilder(
589       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
590       .customFor({S32, S64})
591       .clampScalar(0, S32, S64)
592       .widenScalarToNextPow2(0, 32)
593       .scalarize(0);
594 
595   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
596                    .legalFor({S32})
597                    .maxScalarOrElt(0, S32);
598 
599   if (ST.hasVOP3PInsts()) {
600     Mulh
601       .clampMaxNumElements(0, S8, 2)
602       .lowerFor({V2S8});
603   }
604 
605   Mulh
606     .scalarize(0)
607     .lower();
608 
609   // Report legal for any types we can handle anywhere. For the cases only legal
610   // on the SALU, RegBankSelect will be able to re-legalize.
611   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
612     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
613     .clampScalar(0, S32, S64)
614     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
615     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
616     .widenScalarToNextPow2(0)
617     .scalarize(0);
618 
619   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
620                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
621     .legalFor({{S32, S1}, {S32, S32}})
622     .minScalar(0, S32)
623     // TODO: .scalarize(0)
624     .lower();
625 
626   getActionDefinitionsBuilder(G_BITCAST)
627     // Don't worry about the size constraint.
628     .legalIf(all(isRegisterType(0), isRegisterType(1)))
629     .lower();
630 
631 
632   getActionDefinitionsBuilder(G_CONSTANT)
633     .legalFor({S1, S32, S64, S16, GlobalPtr,
634                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
635     .legalIf(isPointer(0))
636     .clampScalar(0, S32, S64)
637     .widenScalarToNextPow2(0);
638 
639   getActionDefinitionsBuilder(G_FCONSTANT)
640     .legalFor({S32, S64, S16})
641     .clampScalar(0, S16, S64);
642 
643   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
644       .legalIf(isRegisterType(0))
645       // s1 and s16 are special cases because they have legal operations on
646       // them, but don't really occupy registers in the normal way.
647       .legalFor({S1, S16})
648       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
649       .clampScalarOrElt(0, S32, MaxScalar)
650       .widenScalarToNextPow2(0, 32)
651       .clampMaxNumElements(0, S32, 16);
652 
653   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
654 
655   // If the amount is divergent, we have to do a wave reduction to get the
656   // maximum value, so this is expanded during RegBankSelect.
657   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
658     .legalFor({{PrivatePtr, S32}});
659 
660   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
661     .customIf(typeIsNot(0, PrivatePtr));
662 
663   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
664 
665   auto &FPOpActions = getActionDefinitionsBuilder(
666     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
667     .legalFor({S32, S64});
668   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
669     .customFor({S32, S64});
670   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
671     .customFor({S32, S64});
672 
673   if (ST.has16BitInsts()) {
674     if (ST.hasVOP3PInsts())
675       FPOpActions.legalFor({S16, V2S16});
676     else
677       FPOpActions.legalFor({S16});
678 
679     TrigActions.customFor({S16});
680     FDIVActions.customFor({S16});
681   }
682 
683   auto &MinNumMaxNum = getActionDefinitionsBuilder({
684       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
685 
686   if (ST.hasVOP3PInsts()) {
687     MinNumMaxNum.customFor(FPTypesPK16)
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampMaxNumElements(0, S16, 2)
690       .clampScalar(0, S16, S64)
691       .scalarize(0);
692   } else if (ST.has16BitInsts()) {
693     MinNumMaxNum.customFor(FPTypes16)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else {
697     MinNumMaxNum.customFor(FPTypesBase)
698       .clampScalar(0, S32, S64)
699       .scalarize(0);
700   }
701 
702   if (ST.hasVOP3PInsts())
703     FPOpActions.clampMaxNumElements(0, S16, 2);
704 
705   FPOpActions
706     .scalarize(0)
707     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
708 
709   TrigActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   FDIVActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   getActionDefinitionsBuilder({G_FNEG, G_FABS})
718     .legalFor(FPTypesPK16)
719     .clampMaxNumElements(0, S16, 2)
720     .scalarize(0)
721     .clampScalar(0, S16, S64);
722 
723   if (ST.has16BitInsts()) {
724     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
725       .legalFor({S32, S64, S16})
726       .scalarize(0)
727       .clampScalar(0, S16, S64);
728   } else {
729     getActionDefinitionsBuilder(G_FSQRT)
730       .legalFor({S32, S64})
731       .scalarize(0)
732       .clampScalar(0, S32, S64);
733 
734     if (ST.hasFractBug()) {
735       getActionDefinitionsBuilder(G_FFLOOR)
736         .customFor({S64})
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     } else {
741       getActionDefinitionsBuilder(G_FFLOOR)
742         .legalFor({S32, S64})
743         .scalarize(0)
744         .clampScalar(0, S32, S64);
745     }
746   }
747 
748   getActionDefinitionsBuilder(G_FPTRUNC)
749     .legalFor({{S32, S64}, {S16, S32}})
750     .scalarize(0)
751     .lower();
752 
753   getActionDefinitionsBuilder(G_FPEXT)
754     .legalFor({{S64, S32}, {S32, S16}})
755     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
756     .scalarize(0);
757 
758   getActionDefinitionsBuilder(G_FSUB)
759       // Use actual fsub instruction
760       .legalFor({S32})
761       // Must use fadd + fneg
762       .lowerFor({S64, S16, V2S16})
763       .scalarize(0)
764       .clampScalar(0, S32, S64);
765 
766   // Whether this is legal depends on the floating point mode for the function.
767   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
768   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
769     FMad.customFor({S32, S16});
770   else if (ST.hasMadMacF32Insts())
771     FMad.customFor({S32});
772   else if (ST.hasMadF16())
773     FMad.customFor({S16});
774   FMad.scalarize(0)
775       .lower();
776 
777   auto &FRem = getActionDefinitionsBuilder(G_FREM);
778   if (ST.has16BitInsts()) {
779     FRem.customFor({S16, S32, S64});
780   } else {
781     FRem.minScalar(0, S32)
782         .customFor({S32, S64});
783   }
784   FRem.scalarize(0);
785 
786   // TODO: Do we need to clamp maximum bitwidth?
787   getActionDefinitionsBuilder(G_TRUNC)
788     .legalIf(isScalar(0))
789     .legalFor({{V2S16, V2S32}})
790     .clampMaxNumElements(0, S16, 2)
791     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
792     // situations (like an invalid implicit use), we don't want to infinite loop
793     // in the legalizer.
794     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
795     .alwaysLegal();
796 
797   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
798     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
799                {S32, S1}, {S64, S1}, {S16, S1}})
800     .scalarize(0)
801     .clampScalar(0, S32, S64)
802     .widenScalarToNextPow2(1, 32);
803 
804   // TODO: Split s1->s64 during regbankselect for VALU.
805   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
806     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
807     .lowerFor({{S32, S64}})
808     .lowerIf(typeIs(1, S1))
809     .customFor({{S64, S64}});
810   if (ST.has16BitInsts())
811     IToFP.legalFor({{S16, S16}});
812   IToFP.clampScalar(1, S32, S64)
813        .minScalar(0, S32)
814        .scalarize(0)
815        .widenScalarToNextPow2(1);
816 
817   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
818     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
819     .customFor({{S64, S64}})
820     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
821   if (ST.has16BitInsts())
822     FPToI.legalFor({{S16, S16}});
823   else
824     FPToI.minScalar(1, S32);
825 
826   FPToI.minScalar(0, S32)
827        .widenScalarToNextPow2(0, 32)
828        .scalarize(0)
829        .lower();
830 
831   // Lower roundeven into G_FRINT
832   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
833     .scalarize(0)
834     .lower();
835 
836   if (ST.has16BitInsts()) {
837     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838       .legalFor({S16, S32, S64})
839       .clampScalar(0, S16, S64)
840       .scalarize(0);
841   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
842     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843       .legalFor({S32, S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   } else {
847     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
848       .legalFor({S32})
849       .customFor({S64})
850       .clampScalar(0, S32, S64)
851       .scalarize(0);
852   }
853 
854   getActionDefinitionsBuilder(G_PTR_ADD)
855     .legalIf(all(isPointer(0), sameSize(0, 1)))
856     .scalarize(0)
857     .scalarSameSizeAs(1, 0);
858 
859   getActionDefinitionsBuilder(G_PTRMASK)
860     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
861     .scalarSameSizeAs(1, 0)
862     .scalarize(0);
863 
864   auto &CmpBuilder =
865     getActionDefinitionsBuilder(G_ICMP)
866     // The compare output type differs based on the register bank of the output,
867     // so make both s1 and s32 legal.
868     //
869     // Scalar compares producing output in scc will be promoted to s32, as that
870     // is the allocatable register type that will be needed for the copy from
871     // scc. This will be promoted during RegBankSelect, and we assume something
872     // before that won't try to use s32 result types.
873     //
874     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
875     // bank.
876     .legalForCartesianProduct(
877       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
878     .legalForCartesianProduct(
879       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
880   if (ST.has16BitInsts()) {
881     CmpBuilder.legalFor({{S1, S16}});
882   }
883 
884   CmpBuilder
885     .widenScalarToNextPow2(1)
886     .clampScalar(1, S32, S64)
887     .scalarize(0)
888     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
889 
890   getActionDefinitionsBuilder(G_FCMP)
891     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
892     .widenScalarToNextPow2(1)
893     .clampScalar(1, S32, S64)
894     .scalarize(0);
895 
896   // FIXME: fpow has a selection pattern that should move to custom lowering.
897   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
898   if (ST.has16BitInsts())
899     Exp2Ops.legalFor({S32, S16});
900   else
901     Exp2Ops.legalFor({S32});
902   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
903   Exp2Ops.scalarize(0);
904 
905   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
906   if (ST.has16BitInsts())
907     ExpOps.customFor({{S32}, {S16}});
908   else
909     ExpOps.customFor({S32});
910   ExpOps.clampScalar(0, MinScalarFPTy, S32)
911         .scalarize(0);
912 
913   getActionDefinitionsBuilder(G_FPOWI)
914     .clampScalar(0, MinScalarFPTy, S32)
915     .lower();
916 
917   // The 64-bit versions produce 32-bit results, but only on the SALU.
918   getActionDefinitionsBuilder(G_CTPOP)
919     .legalFor({{S32, S32}, {S32, S64}})
920     .clampScalar(0, S32, S32)
921     .clampScalar(1, S32, S64)
922     .scalarize(0)
923     .widenScalarToNextPow2(0, 32)
924     .widenScalarToNextPow2(1, 32);
925 
926   // The hardware instructions return a different result on 0 than the generic
927   // instructions expect. The hardware produces -1, but these produce the
928   // bitwidth.
929   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
930     .scalarize(0)
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .widenScalarToNextPow2(0, 32)
934     .widenScalarToNextPow2(1, 32)
935     .lower();
936 
937   // The 64-bit versions produce 32-bit results, but only on the SALU.
938   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
939     .legalFor({{S32, S32}, {S32, S64}})
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .scalarize(0)
943     .widenScalarToNextPow2(0, 32)
944     .widenScalarToNextPow2(1, 32);
945 
946   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
947   // RegBankSelect.
948   getActionDefinitionsBuilder(G_BITREVERSE)
949     .legalFor({S32, S64})
950     .clampScalar(0, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0);
953 
954   if (ST.has16BitInsts()) {
955     getActionDefinitionsBuilder(G_BSWAP)
956       .legalFor({S16, S32, V2S16})
957       .clampMaxNumElements(0, S16, 2)
958       // FIXME: Fixing non-power-of-2 before clamp is workaround for
959       // narrowScalar limitation.
960       .widenScalarToNextPow2(0)
961       .clampScalar(0, S16, S32)
962       .scalarize(0);
963 
964     if (ST.hasVOP3PInsts()) {
965       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
966         .legalFor({S32, S16, V2S16})
967         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
968         .clampMaxNumElements(0, S16, 2)
969         .minScalar(0, S16)
970         .widenScalarToNextPow2(0)
971         .scalarize(0)
972         .lower();
973     } else {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
975         .legalFor({S32, S16})
976         .widenScalarToNextPow2(0)
977         .minScalar(0, S16)
978         .scalarize(0)
979         .lower();
980     }
981   } else {
982     // TODO: Should have same legality without v_perm_b32
983     getActionDefinitionsBuilder(G_BSWAP)
984       .legalFor({S32})
985       .lowerIf(scalarNarrowerThan(0, 32))
986       // FIXME: Fixing non-power-of-2 before clamp is workaround for
987       // narrowScalar limitation.
988       .widenScalarToNextPow2(0)
989       .maxScalar(0, S32)
990       .scalarize(0)
991       .lower();
992 
993     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
994       .legalFor({S32})
995       .minScalar(0, S32)
996       .widenScalarToNextPow2(0)
997       .scalarize(0)
998       .lower();
999   }
1000 
1001   getActionDefinitionsBuilder(G_INTTOPTR)
1002     // List the common cases
1003     .legalForCartesianProduct(AddrSpaces64, {S64})
1004     .legalForCartesianProduct(AddrSpaces32, {S32})
1005     .scalarize(0)
1006     // Accept any address space as long as the size matches
1007     .legalIf(sameSize(0, 1))
1008     .widenScalarIf(smallerThan(1, 0),
1009       [](const LegalityQuery &Query) {
1010         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1011       })
1012     .narrowScalarIf(largerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       });
1016 
1017   getActionDefinitionsBuilder(G_PTRTOINT)
1018     // List the common cases
1019     .legalForCartesianProduct(AddrSpaces64, {S64})
1020     .legalForCartesianProduct(AddrSpaces32, {S32})
1021     .scalarize(0)
1022     // Accept any address space as long as the size matches
1023     .legalIf(sameSize(0, 1))
1024     .widenScalarIf(smallerThan(0, 1),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027       })
1028     .narrowScalarIf(
1029       largerThan(0, 1),
1030       [](const LegalityQuery &Query) {
1031         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1032       });
1033 
1034   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1035     .scalarize(0)
1036     .custom();
1037 
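  // Return true if a load/store must be split: vector extloads, accesses that
  // are too wide for the address space, odd register counts, or insufficient
  // alignment.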
1038   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1039                                     bool IsLoad) -> bool {
1040     const LLT DstTy = Query.Types[0];
1041 
1042     // Split vector extloads.
1043     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1044     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1045 
1046     if (MemSize < DstTy.getSizeInBits())
1047       MemSize = std::max(MemSize, AlignBits);
1048 
1049     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1050       return true;
1051 
1052     const LLT PtrTy = Query.Types[1];
1053     unsigned AS = PtrTy.getAddressSpace();
1054     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1055       return true;
1056 
1057     // Catch weird sized loads that don't evenly divide into the access sizes
1058     // TODO: May be able to widen depending on alignment etc.
1059     unsigned NumRegs = (MemSize + 31) / 32;
1060     if (NumRegs == 3) {
1061       if (!ST.hasDwordx3LoadStores())
1062         return true;
1063     } else {
1064       // If the alignment allows, these should have been widened.
1065       if (!isPowerOf2_32(NumRegs))
1066         return true;
1067     }
1068 
1069     if (AlignBits < MemSize) {
1070       const SITargetLowering *TLI = ST.getTargetLowering();
1071       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1072                                                       Align(AlignBits / 8));
1073     }
1074 
1075     return false;
1076   };
1077 
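  // If unaligned buffer accesses are enabled, drop the alignment requirement
  // for global and constant address space accesses.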
1078   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1079   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1080   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1081 
1082   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1083   // LDS
1084   // TODO: Unsupported flat for SI.
1085 
1086   for (unsigned Op : {G_LOAD, G_STORE}) {
1087     const bool IsStore = Op == G_STORE;
1088 
1089     auto &Actions = getActionDefinitionsBuilder(Op);
1090     // Explicitly list some common cases.
1091     // TODO: Does this help compile time at all?
1092     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1093                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1094                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1095                                       {S64, GlobalPtr, 64, GlobalAlign32},
1096                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1097                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1098                                       {S32, GlobalPtr, 8, GlobalAlign8},
1099                                       {S32, GlobalPtr, 16, GlobalAlign16},
1100 
1101                                       {S32, LocalPtr, 32, 32},
1102                                       {S64, LocalPtr, 64, 32},
1103                                       {V2S32, LocalPtr, 64, 32},
1104                                       {S32, LocalPtr, 8, 8},
1105                                       {S32, LocalPtr, 16, 16},
1106                                       {V2S16, LocalPtr, 32, 32},
1107 
1108                                       {S32, PrivatePtr, 32, 32},
1109                                       {S32, PrivatePtr, 8, 8},
1110                                       {S32, PrivatePtr, 16, 16},
1111                                       {V2S16, PrivatePtr, 32, 32},
1112 
1113                                       {S32, ConstantPtr, 32, GlobalAlign32},
1114                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1115                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1116                                       {S64, ConstantPtr, 64, GlobalAlign32},
1117                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1118     Actions.legalIf(
1119       [=](const LegalityQuery &Query) -> bool {
1120         return isLoadStoreLegal(ST, Query);
1121       });
1122 
1123     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1124     // 64-bits.
1125     //
1126     // TODO: Should generalize bitcast action into coerce, which will also cover
1127     // inserting addrspacecasts.
1128     Actions.customIf(typeIs(1, Constant32Ptr));
1129 
1130     // Turn any illegal element vectors into something easier to deal
1131     // with. These will ultimately produce 32-bit scalar shifts to extract the
1132     // parts anyway.
1133     //
1134     // For odd 16-bit element vectors, prefer to split those into pieces with
1135     // 16-bit vector parts.
1136     Actions.bitcastIf(
1137       [=](const LegalityQuery &Query) -> bool {
1138         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1139                                           Query.MMODescrs[0].SizeInBits);
1140       }, bitcastToRegisterType(0));
1141 
1142     if (!IsStore) {
1143       // Widen suitably aligned loads by loading extra bytes. The standard
1144       // legalization actions can't properly express widening memory operands.
1145       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1146         return shouldWidenLoad(ST, Query, G_LOAD);
1147       });
1148     }
1149 
1150     // FIXME: load/store narrowing should be moved to lower action
1151     Actions
1152         .narrowScalarIf(
1153             [=](const LegalityQuery &Query) -> bool {
1154               return !Query.Types[0].isVector() &&
1155                      needToSplitMemOp(Query, Op == G_LOAD);
1156             },
1157             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1158               const LLT DstTy = Query.Types[0];
1159               const LLT PtrTy = Query.Types[1];
1160 
1161               const unsigned DstSize = DstTy.getSizeInBits();
1162               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1163 
1164               // Split extloads.
1165               if (DstSize > MemSize)
1166                 return std::make_pair(0, LLT::scalar(MemSize));
1167 
1168               if (!isPowerOf2_32(DstSize)) {
1169                 // We're probably decomposing an odd sized store. Try to split
1170                 // to the widest type. TODO: Account for alignment. As-is it
1171                 // should be OK, since the new parts will be further legalized.
1172                 unsigned FloorSize = PowerOf2Floor(DstSize);
1173                 return std::make_pair(0, LLT::scalar(FloorSize));
1174               }
1175 
1176               if (DstSize > 32 && (DstSize % 32 != 0)) {
1177                 // FIXME: Need a way to specify non-extload of larger size if
1178                 // suitably aligned.
1179                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1180               }
1181 
1182               unsigned MaxSize = maxSizeForAddrSpace(ST,
1183                                                      PtrTy.getAddressSpace(),
1184                                                      Op == G_LOAD);
1185               if (MemSize > MaxSize)
1186                 return std::make_pair(0, LLT::scalar(MaxSize));
1187 
1188               unsigned Align = Query.MMODescrs[0].AlignInBits;
1189               return std::make_pair(0, LLT::scalar(Align));
1190             })
1191         .fewerElementsIf(
1192             [=](const LegalityQuery &Query) -> bool {
1193               return Query.Types[0].isVector() &&
1194                      needToSplitMemOp(Query, Op == G_LOAD);
1195             },
1196             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1197               const LLT DstTy = Query.Types[0];
1198               const LLT PtrTy = Query.Types[1];
1199 
1200               LLT EltTy = DstTy.getElementType();
1201               unsigned MaxSize = maxSizeForAddrSpace(ST,
1202                                                      PtrTy.getAddressSpace(),
1203                                                      Op == G_LOAD);
1204 
1205               // FIXME: Handle widened to power of 2 results better. This ends
1206               // up scalarizing.
1207               // FIXME: 3 element stores scalarized on SI
1208 
1209               // Split if it's too large for the address space.
1210               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1211                 unsigned NumElts = DstTy.getNumElements();
1212                 unsigned EltSize = EltTy.getSizeInBits();
1213 
1214                 if (MaxSize % EltSize == 0) {
1215                   return std::make_pair(
1216                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1217                 }
1218 
1219                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1220 
1221                 // FIXME: Refine when odd breakdowns handled
1222                 // The scalars will need to be re-legalized.
1223                 if (NumPieces == 1 || NumPieces >= NumElts ||
1224                     NumElts % NumPieces != 0)
1225                   return std::make_pair(0, EltTy);
1226 
1227                 return std::make_pair(0,
1228                                       LLT::vector(NumElts / NumPieces, EltTy));
1229               }
1230 
1231               // FIXME: We could probably handle weird extending loads better.
1232               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1233               if (DstTy.getSizeInBits() > MemSize)
1234                 return std::make_pair(0, EltTy);
1235 
1236               unsigned EltSize = EltTy.getSizeInBits();
1237               unsigned DstSize = DstTy.getSizeInBits();
1238               if (!isPowerOf2_32(DstSize)) {
1239                 // We're probably decomposing an odd sized store. Try to split
1240                 // to the widest type. TODO: Account for alignment. As-is it
1241                 // should be OK, since the new parts will be further legalized.
1242                 unsigned FloorSize = PowerOf2Floor(DstSize);
1243                 return std::make_pair(
1244                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1245               }
1246 
1247               // Need to split because of alignment.
1248               unsigned Align = Query.MMODescrs[0].AlignInBits;
1249               if (EltSize > Align &&
1250                   (EltSize / Align < DstTy.getNumElements())) {
1251                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1252               }
1253 
1254               // May need relegalization for the scalars.
1255               return std::make_pair(0, EltTy);
1256             })
1257     .lowerIfMemSizeNotPow2()
1258     .minScalar(0, S32)
1259     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1260     .widenScalarToNextPow2(0)
1261     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1262     .lower();
1263   }
1264 
1265   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1266                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1267                                                   {S32, GlobalPtr, 16, 2 * 8},
1268                                                   {S32, LocalPtr, 8, 8},
1269                                                   {S32, LocalPtr, 16, 16},
1270                                                   {S32, PrivatePtr, 8, 8},
1271                                                   {S32, PrivatePtr, 16, 16},
1272                                                   {S32, ConstantPtr, 8, 8},
1273                                                   {S32, ConstantPtr, 16, 2 * 8}})
1274                        .legalIf(
1275                          [=](const LegalityQuery &Query) -> bool {
1276                            return isLoadStoreLegal(ST, Query);
1277                          });
1278 
1279   if (ST.hasFlatAddressSpace()) {
1280     ExtLoads.legalForTypesWithMemDesc(
1281         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1282   }
1283 
1284   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1285   // 64-bits.
1286   //
1287   // TODO: Should generalize bitcast action into coerce, which will also cover
1288   // inserting addrspacecasts.
1289   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1290 
1291   ExtLoads.clampScalar(0, S32, S32)
1292           .widenScalarToNextPow2(0)
1293           .unsupportedIfMemSizeNotPow2()
1294           .lower();
1295 
1296   auto &Atomics = getActionDefinitionsBuilder(
1297     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1298      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1299      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1300      G_ATOMICRMW_UMIN})
1301     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1302                {S64, GlobalPtr}, {S64, LocalPtr},
1303                {S32, RegionPtr}, {S64, RegionPtr}});
1304   if (ST.hasFlatAddressSpace()) {
1305     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1306   }
1307 
1308   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1309   if (ST.hasLDSFPAtomics()) {
1310     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1311     if (ST.hasGFX90AInsts())
1312       Atomic.legalFor({{S64, LocalPtr}});
1313   }
1314   if (ST.hasAtomicFaddInsts())
1315     Atomic.legalFor({{S32, GlobalPtr}});
1316 
1317   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1318   // demarshalling
1319   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1320     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1321                 {S32, FlatPtr}, {S64, FlatPtr}})
1322     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1323                {S32, RegionPtr}, {S64, RegionPtr}});
1324   // TODO: Pointer types, any 32-bit or 64-bit vector
1325 
1326   // Condition should be s32 for scalar, s1 for vector.
1327   getActionDefinitionsBuilder(G_SELECT)
1328     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1329           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1330           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1331     .clampScalar(0, S16, S64)
1332     .scalarize(1)
1333     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1334     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1335     .clampMaxNumElements(0, S32, 2)
1336     .clampMaxNumElements(0, LocalPtr, 2)
1337     .clampMaxNumElements(0, PrivatePtr, 2)
1338     .scalarize(0)
1339     .widenScalarToNextPow2(0)
1340     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1341 
1342   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1343   // be more flexible with the shift amount type.
1344   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1345     .legalFor({{S32, S32}, {S64, S32}});
1346   if (ST.has16BitInsts()) {
1347     if (ST.hasVOP3PInsts()) {
1348       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1349             .clampMaxNumElements(0, S16, 2);
1350     } else
1351       Shifts.legalFor({{S16, S16}});
1352 
1353     // TODO: Support 16-bit shift amounts for all types
1354     Shifts.widenScalarIf(
1355       [=](const LegalityQuery &Query) {
1356         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1357         // 32-bit amount.
1358         const LLT ValTy = Query.Types[0];
1359         const LLT AmountTy = Query.Types[1];
1360         return ValTy.getSizeInBits() <= 16 &&
1361                AmountTy.getSizeInBits() < 16;
1362       }, changeTo(1, S16));
1363     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1364     Shifts.clampScalar(1, S32, S32);
1365     Shifts.clampScalar(0, S16, S64);
1366     Shifts.widenScalarToNextPow2(0, 16);
1367 
1368     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1369       .minScalar(0, S16)
1370       .scalarize(0)
1371       .lower();
1372   } else {
1373     // Make sure we legalize the shift amount type first, as the general
1374     // expansion for the shifted type will produce much worse code if it hasn't
1375     // been truncated already.
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S32, S64);
1378     Shifts.widenScalarToNextPow2(0, 32);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S32)
1382       .scalarize(0)
1383       .lower();
1384   }
1385   Shifts.scalarize(0);
1386 
1387   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1388     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1389     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1390     unsigned IdxTypeIdx = 2;
1391 
1392     getActionDefinitionsBuilder(Op)
1393       .customIf([=](const LegalityQuery &Query) {
1394           const LLT EltTy = Query.Types[EltTypeIdx];
1395           const LLT VecTy = Query.Types[VecTypeIdx];
1396           const LLT IdxTy = Query.Types[IdxTypeIdx];
1397           const unsigned EltSize = EltTy.getSizeInBits();
1398           return (EltSize == 32 || EltSize == 64) &&
1399                   VecTy.getSizeInBits() % 32 == 0 &&
1400                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1401                   IdxTy.getSizeInBits() == 32;
1402         })
1403       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1404                  bitcastToVectorElement32(VecTypeIdx))
1405       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1406       .bitcastIf(
1407         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1408         [=](const LegalityQuery &Query) {
1409           // For > 64-bit element types, try to turn this into a 64-bit
1410           // element vector since we may be able to do better indexing
1411           // if this is scalar. If not, fall back to 32.
1412           const LLT EltTy = Query.Types[EltTypeIdx];
1413           const LLT VecTy = Query.Types[VecTypeIdx];
1414           const unsigned DstEltSize = EltTy.getSizeInBits();
1415           const unsigned VecSize = VecTy.getSizeInBits();
1416 
1417           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1418           return std::make_pair(
1419             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1420         })
1421       .clampScalar(EltTypeIdx, S32, S64)
1422       .clampScalar(VecTypeIdx, S32, S64)
1423       .clampScalar(IdxTypeIdx, S32, S32)
1424       .clampMaxNumElements(VecTypeIdx, S32, 32)
1425       // TODO: Clamp elements for 64-bit vectors?
1426       // It should only be necessary with variable indexes.
1427       // As a last resort, lower to the stack
1428       .lower();
1429   }
1430 
1431   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1432     .unsupportedIf([=](const LegalityQuery &Query) {
1433         const LLT &EltTy = Query.Types[1].getElementType();
1434         return Query.Types[0] != EltTy;
1435       });
1436 
1437   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1438     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1439     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1440 
1441     // FIXME: Doesn't handle extract of illegal sizes.
1442     getActionDefinitionsBuilder(Op)
1443       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1444       // FIXME: Multiples of 16 should not be legal.
1445       .legalIf([=](const LegalityQuery &Query) {
1446           const LLT BigTy = Query.Types[BigTyIdx];
1447           const LLT LitTy = Query.Types[LitTyIdx];
1448           return (BigTy.getSizeInBits() % 32 == 0) &&
1449                  (LitTy.getSizeInBits() % 16 == 0);
1450         })
1451       .widenScalarIf(
1452         [=](const LegalityQuery &Query) {
1453           const LLT BigTy = Query.Types[BigTyIdx];
1454           return (BigTy.getScalarSizeInBits() < 16);
1455         },
1456         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1457       .widenScalarIf(
1458         [=](const LegalityQuery &Query) {
1459           const LLT LitTy = Query.Types[LitTyIdx];
1460           return (LitTy.getScalarSizeInBits() < 16);
1461         },
1462         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1463       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1464       .widenScalarToNextPow2(BigTyIdx, 32);
1465 
1466   }
1467 
1468   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1469     .legalForCartesianProduct(AllS32Vectors, {S32})
1470     .legalForCartesianProduct(AllS64Vectors, {S64})
1471     .clampNumElements(0, V16S32, V32S32)
1472     .clampNumElements(0, V2S64, V16S64)
1473     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1474 
1475   if (ST.hasScalarPackInsts()) {
1476     BuildVector
1477       // FIXME: Should probably widen s1 vectors straight to s32
1478       .minScalarOrElt(0, S16)
1479       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1480       .minScalar(1, S32);
1481 
1482     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1483       .legalFor({V2S16, S32})
1484       .lower();
1485     BuildVector.minScalarOrElt(0, S32);
1486   } else {
1487     BuildVector.customFor({V2S16, S16});
1488     BuildVector.minScalarOrElt(0, S32);
1489 
1490     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1491       .customFor({V2S16, S32})
1492       .lower();
1493   }
1494 
1495   BuildVector.legalIf(isRegisterType(0));
1496 
1497   // FIXME: Clamp maximum size
1498   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1499     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1500     .clampMaxNumElements(0, S32, 32)
1501     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1502     .clampMaxNumElements(0, S16, 64);
1503 
1504   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1505   // pre-legalize.
1506   if (ST.hasVOP3PInsts()) {
1507     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1508       .customFor({V2S16, V2S16})
1509       .lower();
1510   } else
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1512 
1513   // Merge/Unmerge
1514   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1515     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1516     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1517 
1518     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1519       const LLT Ty = Query.Types[TypeIdx];
1520       if (Ty.isVector()) {
1521         const LLT &EltTy = Ty.getElementType();
1522         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1523           return true;
1524         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1525           return true;
1526       }
1527       return false;
1528     };
1529 
1530     auto &Builder = getActionDefinitionsBuilder(Op)
1531       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1532       .lowerFor({{S16, V2S16}})
1533       .lowerIf([=](const LegalityQuery &Query) {
1534           const LLT BigTy = Query.Types[BigTyIdx];
1535           return BigTy.getSizeInBits() == 32;
1536         })
1537       // Try to widen to s16 first for small types.
1538       // TODO: Only do this on targets with legal s16 shifts
1539       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1541       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1542       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1543                            elementTypeIs(1, S16)),
1544                        changeTo(1, V2S16))
1545       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1546       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1547       // valid.
1548       .clampScalar(LitTyIdx, S32, S512)
1549       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1550       // Break up vectors with weird elements into scalars
1551       .fewerElementsIf(
1552         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1553         scalarize(0))
1554       .fewerElementsIf(
1555         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1556         scalarize(1))
1557       .clampScalar(BigTyIdx, S32, MaxScalar);
1558 
1559     if (Op == G_MERGE_VALUES) {
1560       Builder.widenScalarIf(
1561         // TODO: Use 16-bit shifts if legal for 8-bit values?
1562         [=](const LegalityQuery &Query) {
1563           const LLT Ty = Query.Types[LitTyIdx];
1564           return Ty.getSizeInBits() < 32;
1565         },
1566         changeTo(LitTyIdx, S32));
1567     }
1568 
1569     Builder.widenScalarIf(
1570       [=](const LegalityQuery &Query) {
1571         const LLT Ty = Query.Types[BigTyIdx];
1572         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1573           Ty.getSizeInBits() % 16 != 0;
1574       },
1575       [=](const LegalityQuery &Query) {
1576         // Pick the next power of 2, or a multiple of 64 over 128,
1577         // whichever is smaller.
1578         const LLT &Ty = Query.Types[BigTyIdx];
1579         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1580         if (NewSizeInBits >= 256) {
1581           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1582           if (RoundedTo < NewSizeInBits)
1583             NewSizeInBits = RoundedTo;
1584         }
1585         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1586       })
1587       // Any vectors left are the wrong size. Scalarize them.
1588       .scalarize(0)
1589       .scalarize(1);
1590   }
1591 
1592   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1593   // RegBankSelect.
1594   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1595     .legalFor({{S32}, {S64}});
1596 
1597   if (ST.hasVOP3PInsts()) {
1598     SextInReg.lowerFor({{V2S16}})
1599       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1600       // get more vector shift opportunities, since we'll get those when
1601       // expanded.
1602       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1603   } else if (ST.has16BitInsts()) {
1604     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1605   } else {
1606     // Prefer to promote to s32 before lowering if we don't have 16-bit
1607     // shifts. This avoids a lot of intermediate truncate and extend operations.
1608     SextInReg.lowerFor({{S32}, {S64}});
1609   }
1610 
1611   SextInReg
1612     .scalarize(0)
1613     .clampScalar(0, S32, S64)
1614     .lower();
1615 
1616   // TODO: Only try to form v2s16 with legal packed instructions.
1617   getActionDefinitionsBuilder(G_FSHR)
1618     .legalFor({{S32, S32}})
1619     .lowerFor({{V2S16, V2S16}})
1620     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1621     .scalarize(0)
1622     .lower();
1623 
1624   if (ST.hasVOP3PInsts()) {
1625     getActionDefinitionsBuilder(G_FSHL)
1626       .lowerFor({{V2S16, V2S16}})
1627       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1628       .scalarize(0)
1629       .lower();
1630   } else {
1631     getActionDefinitionsBuilder(G_FSHL)
1632       .scalarize(0)
1633       .lower();
1634   }
1635 
1636   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1637     .legalFor({S64});
1638 
1639   getActionDefinitionsBuilder(G_FENCE)
1640     .alwaysLegal();
1641 
1642   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1643       .scalarize(0)
1644       .minScalar(0, S32)
1645       .lower();
1646 
1647   getActionDefinitionsBuilder({
1648       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1649       G_FCOPYSIGN,
1650 
1651       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1652       G_ATOMICRMW_NAND,
1653       G_ATOMICRMW_FSUB,
1654       G_READ_REGISTER,
1655       G_WRITE_REGISTER,
1656 
1657       G_SADDO, G_SSUBO,
1658 
1659       // TODO: Implement
1660       G_FMINIMUM, G_FMAXIMUM}).lower();
1661 
1662   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1663         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1664         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1665     .unsupported();
1666 
1667   computeTables();
1668   verify(*ST.getInstrInfo());
1669 }
1670 
1671 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1672                                          MachineInstr &MI) const {
1673   MachineIRBuilder &B = Helper.MIRBuilder;
1674   MachineRegisterInfo &MRI = *B.getMRI();
1675 
1676   switch (MI.getOpcode()) {
1677   case TargetOpcode::G_ADDRSPACE_CAST:
1678     return legalizeAddrSpaceCast(MI, MRI, B);
1679   case TargetOpcode::G_FRINT:
1680     return legalizeFrint(MI, MRI, B);
1681   case TargetOpcode::G_FCEIL:
1682     return legalizeFceil(MI, MRI, B);
1683   case TargetOpcode::G_FREM:
1684     return legalizeFrem(MI, MRI, B);
1685   case TargetOpcode::G_INTRINSIC_TRUNC:
1686     return legalizeIntrinsicTrunc(MI, MRI, B);
1687   case TargetOpcode::G_SITOFP:
1688     return legalizeITOFP(MI, MRI, B, true);
1689   case TargetOpcode::G_UITOFP:
1690     return legalizeITOFP(MI, MRI, B, false);
1691   case TargetOpcode::G_FPTOSI:
1692     return legalizeFPTOI(MI, MRI, B, true);
1693   case TargetOpcode::G_FPTOUI:
1694     return legalizeFPTOI(MI, MRI, B, false);
1695   case TargetOpcode::G_FMINNUM:
1696   case TargetOpcode::G_FMAXNUM:
1697   case TargetOpcode::G_FMINNUM_IEEE:
1698   case TargetOpcode::G_FMAXNUM_IEEE:
1699     return legalizeMinNumMaxNum(Helper, MI);
1700   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1701     return legalizeExtractVectorElt(MI, MRI, B);
1702   case TargetOpcode::G_INSERT_VECTOR_ELT:
1703     return legalizeInsertVectorElt(MI, MRI, B);
1704   case TargetOpcode::G_SHUFFLE_VECTOR:
1705     return legalizeShuffleVector(MI, MRI, B);
1706   case TargetOpcode::G_FSIN:
1707   case TargetOpcode::G_FCOS:
1708     return legalizeSinCos(MI, MRI, B);
1709   case TargetOpcode::G_GLOBAL_VALUE:
1710     return legalizeGlobalValue(MI, MRI, B);
1711   case TargetOpcode::G_LOAD:
1712   case TargetOpcode::G_SEXTLOAD:
1713   case TargetOpcode::G_ZEXTLOAD:
1714     return legalizeLoad(Helper, MI);
1715   case TargetOpcode::G_FMAD:
1716     return legalizeFMad(MI, MRI, B);
1717   case TargetOpcode::G_FDIV:
1718     return legalizeFDIV(MI, MRI, B);
1719   case TargetOpcode::G_UDIV:
1720   case TargetOpcode::G_UREM:
1721   case TargetOpcode::G_UDIVREM:
1722     return legalizeUnsignedDIV_REM(MI, MRI, B);
1723   case TargetOpcode::G_SDIV:
1724   case TargetOpcode::G_SREM:
1725   case TargetOpcode::G_SDIVREM:
1726     return legalizeSignedDIV_REM(MI, MRI, B);
1727   case TargetOpcode::G_ATOMIC_CMPXCHG:
1728     return legalizeAtomicCmpXChg(MI, MRI, B);
1729   case TargetOpcode::G_FLOG:
1730     return legalizeFlog(MI, B, numbers::ln2f);
1731   case TargetOpcode::G_FLOG10:
1732     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1733   case TargetOpcode::G_FEXP:
1734     return legalizeFExp(MI, B);
1735   case TargetOpcode::G_FPOW:
1736     return legalizeFPow(MI, B);
1737   case TargetOpcode::G_FFLOOR:
1738     return legalizeFFloor(MI, MRI, B);
1739   case TargetOpcode::G_BUILD_VECTOR:
1740     return legalizeBuildVector(MI, MRI, B);
1741   default:
1742     return false;
1743   }
1744 
1745   llvm_unreachable("expected switch to return");
1746 }
1747 
1748 Register AMDGPULegalizerInfo::getSegmentAperture(
1749   unsigned AS,
1750   MachineRegisterInfo &MRI,
1751   MachineIRBuilder &B) const {
1752   MachineFunction &MF = B.getMF();
1753   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1754   const LLT S32 = LLT::scalar(32);
1755 
1756   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1757 
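       // On subtargets with aperture registers, the aperture base is read with
       // S_GETREG_B32 from the MEM_BASES hardware register (ID_MEM_BASES below)
       // and shifted into the high bits; roughly, as a sketch of the sequence
       // built just below (not exact MIR):
       //   %getreg:sreg_32(s32) = S_GETREG_B32 <Encoding>
       //   %aperture:_(s32)     = G_SHL %getreg, (WidthM1 + 1)
       // Otherwise the aperture is loaded from the queue pointer further down.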
1758   if (ST.hasApertureRegs()) {
1759     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1760     // getreg.
1761     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1762         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1763         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1764     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1765         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1766         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1767     unsigned Encoding =
1768         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1769         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1770         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1771 
1772     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1773 
1774     B.buildInstr(AMDGPU::S_GETREG_B32)
1775       .addDef(GetReg)
1776       .addImm(Encoding);
1777     MRI.setType(GetReg, S32);
1778 
1779     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1780     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1781   }
1782 
1783   Register QueuePtr = MRI.createGenericVirtualRegister(
1784     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1785 
1786   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1787     return Register();
1788 
1789   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1790   // private_segment_aperture_base_hi.
1791   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1792 
1793   // TODO: can we be smarter about machine pointer info?
1794   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1795   MachineMemOperand *MMO = MF.getMachineMemOperand(
1796       PtrInfo,
1797       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1798           MachineMemOperand::MOInvariant,
1799       4, commonAlignment(Align(64), StructOffset));
1800 
1801   Register LoadAddr;
1802 
1803   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1804   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1805 }
1806 
1807 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1808   MachineInstr &MI, MachineRegisterInfo &MRI,
1809   MachineIRBuilder &B) const {
1810   MachineFunction &MF = B.getMF();
1811 
1812   const LLT S32 = LLT::scalar(32);
1813   Register Dst = MI.getOperand(0).getReg();
1814   Register Src = MI.getOperand(1).getReg();
1815 
1816   LLT DstTy = MRI.getType(Dst);
1817   LLT SrcTy = MRI.getType(Src);
1818   unsigned DestAS = DstTy.getAddressSpace();
1819   unsigned SrcAS = SrcTy.getAddressSpace();
1820 
1821   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1822   // vector element.
1823   assert(!DstTy.isVector());
1824 
1825   const AMDGPUTargetMachine &TM
1826     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1827 
1828   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1829     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1830     return true;
1831   }
1832 
1833   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1834     // Truncate.
1835     B.buildExtract(Dst, Src, 0);
1836     MI.eraseFromParent();
1837     return true;
1838   }
1839 
1840   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1841     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1842     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1843 
1844     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into a
1845     // pointer of another address space. Merge operands are required to be the
1846     // same type, but creating an extra ptrtoint would be kind of pointless.
1847     auto HighAddr = B.buildConstant(
1848       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1849     B.buildMerge(Dst, {Src, HighAddr});
1850     MI.eraseFromParent();
1851     return true;
1852   }
1853 
1854   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1855     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1856            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1857     unsigned NullVal = TM.getNullPointerValue(DestAS);
1858 
1859     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1860     auto FlatNull = B.buildConstant(SrcTy, 0);
1861 
1862     // Extract low 32-bits of the pointer.
1863     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1864 
1865     auto CmpRes =
1866         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1867     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1868 
1869     MI.eraseFromParent();
1870     return true;
1871   }
1872 
1873   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1874     return false;
1875 
1876   if (!ST.hasFlatAddressSpace())
1877     return false;
1878 
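       // A rough sketch of the LDS/private -> flat expansion built below
       // (not exact MIR):
       //   %isnonnull = G_ICMP ne %src, <segment null>
       //   %full      = G_MERGE_VALUES (G_PTRTOINT %src), %aperture
       //   %dst       = G_SELECT %isnonnull, %full, <flat null>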
1879   auto SegmentNull =
1880       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1881   auto FlatNull =
1882       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1883 
1884   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1885   if (!ApertureReg.isValid())
1886     return false;
1887 
1888   auto CmpRes =
1889       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1890 
1891   // Coerce the type of the low half of the result so we can use merge_values.
1892   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1893 
1894   // TODO: Should we allow mismatched types but matching sizes in merges to
1895   // avoid the ptrtoint?
1896   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1897   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1898 
1899   MI.eraseFromParent();
1900   return true;
1901 }
1902 
1903 bool AMDGPULegalizerInfo::legalizeFrint(
1904   MachineInstr &MI, MachineRegisterInfo &MRI,
1905   MachineIRBuilder &B) const {
1906   Register Src = MI.getOperand(1).getReg();
1907   LLT Ty = MRI.getType(Src);
1908   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1909 
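       // Round to nearest by adding and subtracting 2^52 with the sign of the
       // source; approximately (a sketch of the sequence built below):
       //   tmp = (src + copysign(2^52, src)) - copysign(2^52, src)
       //   dst = |src| > 0x1.fffffffffffffp+51 ? src : tmp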
1910   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1911   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1912 
1913   auto C1 = B.buildFConstant(Ty, C1Val);
1914   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1915 
1916   // TODO: Should this propagate fast-math-flags?
1917   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1918   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1919 
1920   auto C2 = B.buildFConstant(Ty, C2Val);
1921   auto Fabs = B.buildFAbs(Ty, Src);
1922 
1923   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1924   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1925   MI.eraseFromParent();
1926   return true;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeFceil(
1930   MachineInstr &MI, MachineRegisterInfo &MRI,
1931   MachineIRBuilder &B) const {
1932 
1933   const LLT S1 = LLT::scalar(1);
1934   const LLT S64 = LLT::scalar(64);
1935 
1936   Register Src = MI.getOperand(1).getReg();
1937   assert(MRI.getType(Src) == S64);
1938 
1939   // result = trunc(src)
1940   // if (src > 0.0 && src != result)
1941   //   result += 1.0
1942 
1943   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1944 
1945   const auto Zero = B.buildFConstant(S64, 0.0);
1946   const auto One = B.buildFConstant(S64, 1.0);
1947   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1948   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1949   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1950   auto Add = B.buildSelect(S64, And, One, Zero);
1951 
1952   // TODO: Should this propagate fast-math-flags?
1953   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1954   return true;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeFrem(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960     Register DstReg = MI.getOperand(0).getReg();
1961     Register Src0Reg = MI.getOperand(1).getReg();
1962     Register Src1Reg = MI.getOperand(2).getReg();
1963     auto Flags = MI.getFlags();
1964     LLT Ty = MRI.getType(DstReg);
1965 
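         // frem(x, y) == x - trunc(x / y) * y, built here as
         // fma(-trunc(x / y), y, x); a sketch of the sequence below.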
1966     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1967     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1968     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1969     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1970     MI.eraseFromParent();
1971     return true;
1972 }
1973 
1974 static MachineInstrBuilder extractF64Exponent(Register Hi,
1975                                               MachineIRBuilder &B) {
1976   const unsigned FractBits = 52;
1977   const unsigned ExpBits = 11;
1978   LLT S32 = LLT::scalar(32);
1979 
1980   auto Const0 = B.buildConstant(S32, FractBits - 32);
1981   auto Const1 = B.buildConstant(S32, ExpBits);
1982 
1983   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1984     .addUse(Hi)
1985     .addUse(Const0.getReg(0))
1986     .addUse(Const1.getReg(0));
1987 
1988   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B) const {
1994   const LLT S1 = LLT::scalar(1);
1995   const LLT S32 = LLT::scalar(32);
1996   const LLT S64 = LLT::scalar(64);
1997 
1998   Register Src = MI.getOperand(1).getReg();
1999   assert(MRI.getType(Src) == S64);
2000 
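       // Roughly: compute trunc(f64) with integer operations by masking off the
       // fraction bits below the binary point based on the unbiased exponent,
       // returning a signed zero when the exponent is negative and the source
       // itself when the exponent is greater than 51 (a sketch of the sequence
       // built below).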
2001   // TODO: Should this use extract since the low half is unused?
2002   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003   Register Hi = Unmerge.getReg(1);
2004 
2005   // Extract the upper half, since this is where we will find the sign and
2006   // exponent.
2007   auto Exp = extractF64Exponent(Hi, B);
2008 
2009   const unsigned FractBits = 52;
2010 
2011   // Extract the sign bit.
2012   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2013   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2014 
2015   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2016 
2017   const auto Zero32 = B.buildConstant(S32, 0);
2018 
2019   // Extend back to 64-bits.
2020   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2021 
2022   auto Shr = B.buildAShr(S64, FractMask, Exp);
2023   auto Not = B.buildNot(S64, Shr);
2024   auto Tmp0 = B.buildAnd(S64, Src, Not);
2025   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2026 
2027   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2028   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2029 
2030   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2031   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2032   MI.eraseFromParent();
2033   return true;
2034 }
2035 
2036 bool AMDGPULegalizerInfo::legalizeITOFP(
2037   MachineInstr &MI, MachineRegisterInfo &MRI,
2038   MachineIRBuilder &B, bool Signed) const {
2039 
2040   Register Dst = MI.getOperand(0).getReg();
2041   Register Src = MI.getOperand(1).getReg();
2042 
2043   const LLT S64 = LLT::scalar(64);
2044   const LLT S32 = LLT::scalar(32);
2045 
2046   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2047 
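       // Convert a 64-bit integer via its 32-bit halves; approximately:
       //   dst = ldexp((s|u)itofp(hi), 32) + uitofp(lo)
       // (a sketch of the sequence built below).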
2048   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2049 
2050   auto CvtHi = Signed ?
2051     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2052     B.buildUITOFP(S64, Unmerge.getReg(1));
2053 
2054   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2055 
2056   auto ThirtyTwo = B.buildConstant(S32, 32);
2057   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2058     .addUse(CvtHi.getReg(0))
2059     .addUse(ThirtyTwo.getReg(0));
2060 
2061   // TODO: Should this propagate fast-math-flags?
2062   B.buildFAdd(Dst, LdExp, CvtLo);
2063   MI.eraseFromParent();
2064   return true;
2065 }
2066 
2067 // TODO: Copied from DAG implementation. Verify logic and document how this
2068 // actually works.
2069 bool AMDGPULegalizerInfo::legalizeFPTOI(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B, bool Signed) const {
2072 
2073   Register Dst = MI.getOperand(0).getReg();
2074   Register Src = MI.getOperand(1).getReg();
2075 
2076   const LLT S64 = LLT::scalar(64);
2077   const LLT S32 = LLT::scalar(32);
2078 
2079   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2080 
2081   unsigned Flags = MI.getFlags();
2082 
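       // Split the conversion through the 32-bit halves; approximately
       // (K0 == 2^-32, K1 == -(2^32); a sketch of the sequence below):
       //   t   = trunc(src)
       //   hi  = floor(t * 2^-32)
       //   lo  = fma(hi, -(2^32), t)
       //   dst = merge(fptoui(lo), fpto(s|u)i(hi))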
2083   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2084   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2085   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2086 
2087   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2088   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2089   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2090 
2091   auto Hi = Signed ?
2092     B.buildFPTOSI(S32, FloorMul) :
2093     B.buildFPTOUI(S32, FloorMul);
2094   auto Lo = B.buildFPTOUI(S32, Fma);
2095 
2096   B.buildMerge(Dst, { Lo, Hi });
2097   MI.eraseFromParent();
2098 
2099   return true;
2100 }
2101 
2102 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2103                                                MachineInstr &MI) const {
2104   MachineFunction &MF = Helper.MIRBuilder.getMF();
2105   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2106 
2107   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2108                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2109 
2110   // With ieee_mode disabled, the instructions already have the correct
2111   // behavior for G_FMINNUM/G_FMAXNUM.
2112   if (!MFI->getMode().IEEE)
2113     return !IsIEEEOp;
2114 
2115   if (IsIEEEOp)
2116     return true;
2117 
2118   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2119 }
2120 
2121 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2122   MachineInstr &MI, MachineRegisterInfo &MRI,
2123   MachineIRBuilder &B) const {
2124   // TODO: Should move some of this into LegalizerHelper.
2125 
2126   // TODO: Promote dynamic indexing of s16 to s32
2127 
2128   // FIXME: Artifact combiner probably should have replaced the truncated
2129   // constant before this, so we shouldn't need
2130   // getConstantVRegValWithLookThrough.
2131   Optional<ValueAndVReg> MaybeIdxVal =
2132       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2133   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2134     return true;
2135   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2136 
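       // With a known constant index, the extract is folded to a G_EXTRACT at
       // bit offset IdxVal * EltSize; an out-of-bounds index becomes
       // G_IMPLICIT_DEF (see below).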
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Vec = MI.getOperand(1).getReg();
2139 
2140   LLT VecTy = MRI.getType(Vec);
2141   LLT EltTy = VecTy.getElementType();
2142   assert(EltTy == MRI.getType(Dst));
2143 
2144   if (IdxVal < VecTy.getNumElements())
2145     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2146   else
2147     B.buildUndef(Dst);
2148 
2149   MI.eraseFromParent();
2150   return true;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2154   MachineInstr &MI, MachineRegisterInfo &MRI,
2155   MachineIRBuilder &B) const {
2156   // TODO: Should move some of this into LegalizerHelper.
2157 
2158   // TODO: Promote dynamic indexing of s16 to s32
2159 
2160   // FIXME: Artifact combiner probably should have replaced the truncated
2161   // constant before this, so we shouldn't need
2162   // getConstantVRegValWithLookThrough.
2163   Optional<ValueAndVReg> MaybeIdxVal =
2164       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2165   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2166     return true;
2167 
2168   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2169   Register Dst = MI.getOperand(0).getReg();
2170   Register Vec = MI.getOperand(1).getReg();
2171   Register Ins = MI.getOperand(2).getReg();
2172 
2173   LLT VecTy = MRI.getType(Vec);
2174   LLT EltTy = VecTy.getElementType();
2175   assert(EltTy == MRI.getType(Ins));
2176 
2177   if (IdxVal < VecTy.getNumElements())
2178     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2179   else
2180     B.buildUndef(Dst);
2181 
2182   MI.eraseFromParent();
2183   return true;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2187   MachineInstr &MI, MachineRegisterInfo &MRI,
2188   MachineIRBuilder &B) const {
2189   const LLT V2S16 = LLT::vector(2, 16);
2190 
2191   Register Dst = MI.getOperand(0).getReg();
2192   Register Src0 = MI.getOperand(1).getReg();
2193   LLT DstTy = MRI.getType(Dst);
2194   LLT SrcTy = MRI.getType(Src0);
2195 
2196   if (SrcTy == V2S16 && DstTy == V2S16 &&
2197       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2198     return true;
2199 
2200   MachineIRBuilder HelperBuilder(MI);
2201   GISelObserverWrapper DummyObserver;
2202   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2203   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2204 }
2205 
2206 bool AMDGPULegalizerInfo::legalizeSinCos(
2207   MachineInstr &MI, MachineRegisterInfo &MRI,
2208   MachineIRBuilder &B) const {
2209 
2210   Register DstReg = MI.getOperand(0).getReg();
2211   Register SrcReg = MI.getOperand(1).getReg();
2212   LLT Ty = MRI.getType(DstReg);
2213   unsigned Flags = MI.getFlags();
2214 
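       // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi); on
       // subtargets with the reduced trig input range, the scaled value is also
       // passed through llvm.amdgcn.fract first (a sketch of the sequence below).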
2215   Register TrigVal;
2216   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2217   if (ST.hasTrigReducedRange()) {
2218     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2219     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2220       .addUse(MulVal.getReg(0))
2221       .setMIFlags(Flags).getReg(0);
2222   } else
2223     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2224 
2225   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2226     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2227   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2228     .addUse(TrigVal)
2229     .setMIFlags(Flags);
2230   MI.eraseFromParent();
2231   return true;
2232 }
2233 
2234 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2235                                                   MachineIRBuilder &B,
2236                                                   const GlobalValue *GV,
2237                                                   int64_t Offset,
2238                                                   unsigned GAFlags) const {
2239   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2240   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2241   // to the following code sequence:
2242   //
2243   // For constant address space:
2244   //   s_getpc_b64 s[0:1]
2245   //   s_add_u32 s0, s0, $symbol
2246   //   s_addc_u32 s1, s1, 0
2247   //
2248   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2249   //   a fixup or relocation is emitted to replace $symbol with a literal
2250   //   constant, which is a pc-relative offset from the encoding of the $symbol
2251   //   operand to the global variable.
2252   //
2253   // For global address space:
2254   //   s_getpc_b64 s[0:1]
2255   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2256   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2257   //
2258   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2259   //   fixups or relocations are emitted to replace $symbol@*@lo and
2260   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2261   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2262   //   operand to the global variable.
2263   //
2264   // What we want here is an offset from the value returned by s_getpc
2265   // (which is the address of the s_add_u32 instruction) to the global
2266   // variable, but since the encoding of $symbol starts 4 bytes after the start
2267   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2268   // small. This requires us to add 4 to the global variable offset in order to
2269   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2270   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2271   // instruction.
2272 
2273   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2274 
2275   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2276     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2277 
2278   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2279     .addDef(PCReg);
2280 
2281   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2282   if (GAFlags == SIInstrInfo::MO_NONE)
2283     MIB.addImm(0);
2284   else
2285     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2286 
2287   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2288 
2289   if (PtrTy.getSizeInBits() == 32)
2290     B.buildExtract(DstReg, PCReg, 0);
2291   return true;
2292 }
2293 
2294 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2295   MachineInstr &MI, MachineRegisterInfo &MRI,
2296   MachineIRBuilder &B) const {
2297   Register DstReg = MI.getOperand(0).getReg();
2298   LLT Ty = MRI.getType(DstReg);
2299   unsigned AS = Ty.getAddressSpace();
2300 
2301   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2302   MachineFunction &MF = B.getMF();
2303   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2304 
2305   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2306     if (!MFI->isModuleEntryFunction() &&
2307         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2308       const Function &Fn = MF.getFunction();
2309       DiagnosticInfoUnsupported BadLDSDecl(
2310         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2311         DS_Warning);
2312       Fn.getContext().diagnose(BadLDSDecl);
2313 
2314       // We currently don't have a way to correctly allocate LDS objects that
2315       // aren't directly associated with a kernel. We do force inlining of
2316       // functions that use local objects. However, if these dead functions are
2317       // not eliminated, we don't want a compile time error. Just emit a warning
2318       // and a trap, since there should be no callable path here.
2319       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2320       B.buildUndef(DstReg);
2321       MI.eraseFromParent();
2322       return true;
2323     }
2324 
2325     // TODO: We could emit code to handle the initialization somewhere.
2326     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2327       const SITargetLowering *TLI = ST.getTargetLowering();
2328       if (!TLI->shouldUseLDSConstAddress(GV)) {
2329         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2330         return true; // Leave in place;
2331       }
2332 
2333       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2334         Type *Ty = GV->getValueType();
2335         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2336         // zero-sized type in other languages to declare dynamic shared
2337         // memory whose size is not known at compile time. These variables are
2338         // allocated by the runtime and placed directly after the statically
2339         // allocated ones. They all share the same offset.
2340         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2341           // Adjust alignment for that dynamic shared memory array.
2342           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2343           LLT S32 = LLT::scalar(32);
2344           auto Sz =
2345               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2346           B.buildIntToPtr(DstReg, Sz);
2347           MI.eraseFromParent();
2348           return true;
2349         }
2350       }
2351 
2352       B.buildConstant(
2353           DstReg,
2354           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2355       MI.eraseFromParent();
2356       return true;
2357     }
2358 
2359     const Function &Fn = MF.getFunction();
2360     DiagnosticInfoUnsupported BadInit(
2361       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2362     Fn.getContext().diagnose(BadInit);
2363     return true;
2364   }
2365 
2366   const SITargetLowering *TLI = ST.getTargetLowering();
2367 
2368   if (TLI->shouldEmitFixup(GV)) {
2369     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2370     MI.eraseFromParent();
2371     return true;
2372   }
2373 
2374   if (TLI->shouldEmitPCReloc(GV)) {
2375     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2376     MI.eraseFromParent();
2377     return true;
2378   }
2379 
2380   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2381   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2382 
2383   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2384       MachinePointerInfo::getGOT(MF),
2385       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2386           MachineMemOperand::MOInvariant,
2387       8 /*Size*/, Align(8));
2388 
2389   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2390 
2391   if (Ty.getSizeInBits() == 32) {
2392     // Truncate if this is a 32-bit constant address.
2393     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2394     B.buildExtract(DstReg, Load, 0);
2395   } else
2396     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2397 
2398   MI.eraseFromParent();
2399   return true;
2400 }
2401 
2402 static LLT widenToNextPowerOf2(LLT Ty) {
2403   if (Ty.isVector())
2404     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2405   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2406 }
2407 
2408 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2409                                        MachineInstr &MI) const {
2410   MachineIRBuilder &B = Helper.MIRBuilder;
2411   MachineRegisterInfo &MRI = *B.getMRI();
2412   GISelChangeObserver &Observer = Helper.Observer;
2413 
2414   Register PtrReg = MI.getOperand(1).getReg();
2415   LLT PtrTy = MRI.getType(PtrReg);
2416   unsigned AddrSpace = PtrTy.getAddressSpace();
2417 
2418   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2419     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2420     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2421     Observer.changingInstr(MI);
2422     MI.getOperand(1).setReg(Cast.getReg(0));
2423     Observer.changedInstr(MI);
2424     return true;
2425   }
2426 
2427   if (MI.getOpcode() != AMDGPU::G_LOAD)
2428     return false;
2429 
2430   Register ValReg = MI.getOperand(0).getReg();
2431   LLT ValTy = MRI.getType(ValReg);
2432 
2433   MachineMemOperand *MMO = *MI.memoperands_begin();
2434   const unsigned ValSize = ValTy.getSizeInBits();
2435   const unsigned MemSize = 8 * MMO->getSize();
2436   const Align MemAlign = MMO->getAlign();
2437   const unsigned AlignInBits = 8 * MemAlign.value();
2438 
2439   // Widen non-power-of-2 loads to the alignment if needed
2440   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2441     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2442 
2443     // This was already the correct extending load result type, so just adjust
2444     // the memory type.
2445     if (WideMemSize == ValSize) {
2446       MachineFunction &MF = B.getMF();
2447 
2448       MachineMemOperand *WideMMO =
2449           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2450       Observer.changingInstr(MI);
2451       MI.setMemRefs(MF, {WideMMO});
2452       Observer.changedInstr(MI);
2453       return true;
2454     }
2455 
2456     // Don't bother handling an edge case that should probably never be produced.
2457     if (ValSize > WideMemSize)
2458       return false;
2459 
2460     LLT WideTy = widenToNextPowerOf2(ValTy);
2461 
2462     Register WideLoad;
2463     if (!WideTy.isVector()) {
2464       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2465       B.buildTrunc(ValReg, WideLoad).getReg(0);
2466     } else {
2467       // Extract the subvector.
2468 
2469       if (isRegisterType(ValTy)) {
2470         // If this a case where G_EXTRACT is legal, use it.
2471         // (e.g. <3 x s32> -> <4 x s32>)
2472         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2473         B.buildExtract(ValReg, WideLoad, 0);
2474       } else {
2475         // For cases where the widened type isn't a nice register value, unmerge
2476         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2477         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2478         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2479         B.setInsertPt(B.getMBB(), MI.getIterator());
2480         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2481       }
2482     }
2483 
2484     MI.eraseFromParent();
2485     return true;
2486   }
2487 
2488   return false;
2489 }
2490 
2491 bool AMDGPULegalizerInfo::legalizeFMad(
2492   MachineInstr &MI, MachineRegisterInfo &MRI,
2493   MachineIRBuilder &B) const {
2494   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2495   assert(Ty.isScalar());
2496 
2497   MachineFunction &MF = B.getMF();
2498   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2499 
2500   // TODO: Always legal with future ftz flag.
2501   // FIXME: Do we need just output?
2502   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2503     return true;
2504   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2505     return true;
2506 
2507   MachineIRBuilder HelperBuilder(MI);
2508   GISelObserverWrapper DummyObserver;
2509   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2510   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2514   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2515   Register DstReg = MI.getOperand(0).getReg();
2516   Register PtrReg = MI.getOperand(1).getReg();
2517   Register CmpVal = MI.getOperand(2).getReg();
2518   Register NewVal = MI.getOperand(3).getReg();
2519 
2520   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2521          "this should not have been custom lowered");
2522 
2523   LLT ValTy = MRI.getType(CmpVal);
2524   LLT VecTy = LLT::vector(2, ValTy);
2525 
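       // The target G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the
       // compare value packed into a two element vector; here the new value is
       // element 0 and the compare value is element 1 (see the G_BUILD_VECTOR
       // built below).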
2526   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2527 
2528   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2529     .addDef(DstReg)
2530     .addUse(PtrReg)
2531     .addUse(PackedVal)
2532     .setMemRefs(MI.memoperands());
2533 
2534   MI.eraseFromParent();
2535   return true;
2536 }
2537 
2538 bool AMDGPULegalizerInfo::legalizeFlog(
2539   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2540   Register Dst = MI.getOperand(0).getReg();
2541   Register Src = MI.getOperand(1).getReg();
2542   LLT Ty = B.getMRI()->getType(Dst);
2543   unsigned Flags = MI.getFlags();
2544 
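       // log_b(x) == log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG
       // and ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted. A sketch of the
       // lowering below:
       //   dst = fmul(flog2(src), Log2BaseInverted)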
2545   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2546   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2547 
2548   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2549   MI.eraseFromParent();
2550   return true;
2551 }
2552 
2553 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2554                                        MachineIRBuilder &B) const {
2555   Register Dst = MI.getOperand(0).getReg();
2556   Register Src = MI.getOperand(1).getReg();
2557   unsigned Flags = MI.getFlags();
2558   LLT Ty = B.getMRI()->getType(Dst);
2559 
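       // exp(x) == exp2(x * log2(e)); a sketch of the lowering below:
       //   dst = fexp2(fmul(src, log2e))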
2560   auto K = B.buildFConstant(Ty, numbers::log2e);
2561   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2562   B.buildFExp2(Dst, Mul, Flags);
2563   MI.eraseFromParent();
2564   return true;
2565 }
2566 
2567 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2568                                        MachineIRBuilder &B) const {
2569   Register Dst = MI.getOperand(0).getReg();
2570   Register Src0 = MI.getOperand(1).getReg();
2571   Register Src1 = MI.getOperand(2).getReg();
2572   unsigned Flags = MI.getFlags();
2573   LLT Ty = B.getMRI()->getType(Dst);
2574   const LLT S16 = LLT::scalar(16);
2575   const LLT S32 = LLT::scalar(32);
2576 
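       // pow(x, y) == exp2(log2(x) * y), using the legacy multiply intrinsic;
       // approximately (a sketch of the S32 and S16 paths below):
       //   dst = fexp2(amdgcn.fmul.legacy(flog2(x), y))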
2577   if (Ty == S32) {
2578     auto Log = B.buildFLog2(S32, Src0, Flags);
2579     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2580       .addUse(Log.getReg(0))
2581       .addUse(Src1)
2582       .setMIFlags(Flags);
2583     B.buildFExp2(Dst, Mul, Flags);
2584   } else if (Ty == S16) {
2585     // There's no f16 fmul_legacy, so we need to convert for it.
2586     auto Log = B.buildFLog2(S16, Src0, Flags);
2587     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2588     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2589     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2590       .addUse(Ext0.getReg(0))
2591       .addUse(Ext1.getReg(0))
2592       .setMIFlags(Flags);
2593 
2594     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2595   } else
2596     return false;
2597 
2598   MI.eraseFromParent();
2599   return true;
2600 }
2601 
2602 // Find a source register, ignoring any possible source modifiers.
2603 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2604   Register ModSrc = OrigSrc;
2605   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2606     ModSrc = SrcFNeg->getOperand(1).getReg();
2607     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2608       ModSrc = SrcFAbs->getOperand(1).getReg();
2609   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2610     ModSrc = SrcFAbs->getOperand(1).getReg();
2611   return ModSrc;
2612 }
2613 
2614 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2615                                          MachineRegisterInfo &MRI,
2616                                          MachineIRBuilder &B) const {
2617 
2618   const LLT S1 = LLT::scalar(1);
2619   const LLT S64 = LLT::scalar(64);
2620   Register Dst = MI.getOperand(0).getReg();
2621   Register OrigSrc = MI.getOperand(1).getReg();
2622   unsigned Flags = MI.getFlags();
2623   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2624          "this should not have been custom lowered");
2625 
2626   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2627   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2628   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2629   // V_FRACT bug is:
2630   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2631   //
2632   // Convert floor(x) to (x - fract(x))
2633 
2634   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2635     .addUse(OrigSrc)
2636     .setMIFlags(Flags);
2637 
2638   // Give source modifier matching some assistance before obscuring a foldable
2639   // pattern.
2640 
2641   // TODO: We can avoid the neg on the fract? The input sign to fract
2642   // shouldn't matter?
2643   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2644 
2645   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2646 
2647   Register Min = MRI.createGenericVirtualRegister(S64);
2648 
2649   // We don't need to concern ourselves with the snan handling difference, so
2650   // use the one which will directly select.
2651   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2652   if (MFI->getMode().IEEE)
2653     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2654   else
2655     B.buildFMinNum(Min, Fract, Const, Flags);
2656 
2657   Register CorrectedFract = Min;
2658   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2659     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2660     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2661   }
2662 
2663   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2664   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2665 
2666   MI.eraseFromParent();
2667   return true;
2668 }
2669 
2670 // Turn an illegal packed v2s16 build vector into bit operations.
2671 // TODO: This should probably be a bitcast action in LegalizerHelper.
2672 bool AMDGPULegalizerInfo::legalizeBuildVector(
2673   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2674   Register Dst = MI.getOperand(0).getReg();
2675   const LLT S32 = LLT::scalar(32);
2676   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2677 
2678   Register Src0 = MI.getOperand(1).getReg();
2679   Register Src1 = MI.getOperand(2).getReg();
2680   assert(MRI.getType(Src0) == LLT::scalar(16));
2681 
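       // A sketch of the expansion built below:
       //   %merge:_(s32)     = G_MERGE_VALUES %src0(s16), %src1(s16)
       //   %dst:_(<2 x s16>) = G_BITCAST %merge(s32)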
2682   auto Merge = B.buildMerge(S32, {Src0, Src1});
2683   B.buildBitcast(Dst, Merge);
2684 
2685   MI.eraseFromParent();
2686   return true;
2687 }
2688 
2689 // Check that this is a G_XOR x, -1
2690 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2691   if (MI.getOpcode() != TargetOpcode::G_XOR)
2692     return false;
2693   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2694   return ConstVal && *ConstVal == -1;
2695 }
2696 
2697 // Return the use branch instruction, otherwise null if the usage is invalid.
2698 static MachineInstr *
2699 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2700                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2701   Register CondDef = MI.getOperand(0).getReg();
2702   if (!MRI.hasOneNonDBGUse(CondDef))
2703     return nullptr;
2704 
2705   MachineBasicBlock *Parent = MI.getParent();
2706   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2707 
2708   if (isNot(MRI, *UseMI)) {
2709     Register NegatedCond = UseMI->getOperand(0).getReg();
2710     if (!MRI.hasOneNonDBGUse(NegatedCond))
2711       return nullptr;
2712 
2713     // We're deleting the def of this value, so we need to remove it.
2714     UseMI->eraseFromParent();
2715 
2716     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2717     Negated = true;
2718   }
2719 
2720   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2721     return nullptr;
2722 
2723   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2724   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2725   if (Next == Parent->end()) {
2726     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2727     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2728       return nullptr;
2729     UncondBrTarget = &*NextMBB;
2730   } else {
2731     if (Next->getOpcode() != AMDGPU::G_BR)
2732       return nullptr;
2733     Br = &*Next;
2734     UncondBrTarget = Br->getOperand(0).getMBB();
2735   }
2736 
2737   return UseMI;
2738 }
2739 
2740 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2741                                          const ArgDescriptor *Arg,
2742                                          const TargetRegisterClass *ArgRC,
2743                                          LLT ArgTy) const {
2744   MCRegister SrcReg = Arg->getRegister();
2745   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2746   assert(DstReg.isVirtual() && "Virtual register expected");
2747 
2748   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2749                                              ArgTy);
2750   if (Arg->isMasked()) {
2751     // TODO: Should we try to emit this once in the entry block?
2752     const LLT S32 = LLT::scalar(32);
2753     const unsigned Mask = Arg->getMask();
2754     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2755 
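         // A masked argument is recovered approximately as
         //   (LiveIn >> countTrailingZeros(Mask)) & (Mask >> countTrailingZeros(Mask))
         // using the G_LSHR/G_AND sequence built below.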
2756     Register AndMaskSrc = LiveIn;
2757 
2758     if (Shift != 0) {
2759       auto ShiftAmt = B.buildConstant(S32, Shift);
2760       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2761     }
2762 
2763     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2764   } else {
2765     B.buildCopy(DstReg, LiveIn);
2766   }
2767 
2768   return true;
2769 }
2770 
2771 bool AMDGPULegalizerInfo::loadInputValue(
2772     Register DstReg, MachineIRBuilder &B,
2773     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2774   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2775   const ArgDescriptor *Arg;
2776   const TargetRegisterClass *ArgRC;
2777   LLT ArgTy;
2778   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2779 
2780   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2781     return false; // TODO: Handle these
2782   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2783 }
2784 
2785 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2786     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2787     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2788   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2789     return false;
2790 
2791   MI.eraseFromParent();
2792   return true;
2793 }
2794 
2795 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2796                                        MachineRegisterInfo &MRI,
2797                                        MachineIRBuilder &B) const {
2798   Register Dst = MI.getOperand(0).getReg();
2799   LLT DstTy = MRI.getType(Dst);
2800   LLT S16 = LLT::scalar(16);
2801   LLT S32 = LLT::scalar(32);
2802   LLT S64 = LLT::scalar(64);
2803 
2804   if (DstTy == S16)
2805     return legalizeFDIV16(MI, MRI, B);
2806   if (DstTy == S32)
2807     return legalizeFDIV32(MI, MRI, B);
2808   if (DstTy == S64)
2809     return legalizeFDIV64(MI, MRI, B);
2810 
2811   return false;
2812 }
2813 
2814 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2815                                                         Register DstDivReg,
2816                                                         Register DstRemReg,
2817                                                         Register X,
2818                                                         Register Y) const {
2819   const LLT S1 = LLT::scalar(1);
2820   const LLT S32 = LLT::scalar(32);
2821 
2822   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2823   // algorithm used here.
2824 
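       // Roughly: start from a fixed-point reciprocal estimate of y derived from
       // V_RCP_IFLAG_F32, apply one Newton-Raphson style refinement, compute
       // q = umulh(x, z) and r = x - q * y, then apply up to two conditional
       // corrections to the quotient/remainder (see the sequence below).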
2825   // Initial estimate of inv(y).
2826   auto FloatY = B.buildUITOFP(S32, Y);
2827   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2828   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2829   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2830   auto Z = B.buildFPTOUI(S32, ScaledY);
2831 
2832   // One round of UNR (unsigned Newton-Raphson refinement).
2833   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2834   auto NegYZ = B.buildMul(S32, NegY, Z);
2835   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2836 
2837   // Quotient/remainder estimate.
2838   auto Q = B.buildUMulH(S32, X, Z);
2839   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2840 
2841   // First quotient/remainder refinement.
2842   auto One = B.buildConstant(S32, 1);
2843   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2844   if (DstDivReg)
2845     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2846   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2847 
2848   // Second quotient/remainder refinement.
2849   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2850   if (DstDivReg)
2851     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2852 
2853   if (DstRemReg)
2854     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2855 }
2856 
2857 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2858 //
2859 // Return lo, hi of result
2860 //
2861 // %cvt.lo = G_UITOFP Val.lo
2862 // %cvt.hi = G_UITOFP Val.hi
2863 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2864 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2865 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2866 // %mul2 = G_FMUL %mul1, 2**(-32)
2867 // %trunc = G_INTRINSIC_TRUNC %mul2
2868 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2869 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
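//
// The float constants used below are (approximately) powers of two:
// 0x4f800000 is 2^32, 0x5f7ffffc is just below 2^64, 0x2f800000 is 2^-32 and
// 0xcf800000 is -(2^32). The net result is an estimate of 2^64 / Val,
// returned as {low, high} 32-bit halves.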
2870 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2871                                                        Register Val) {
2872   const LLT S32 = LLT::scalar(32);
2873   auto Unmerge = B.buildUnmerge(S32, Val);
2874 
2875   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2876   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2877 
2878   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2879                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2880 
2881   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2882   auto Mul1 =
2883       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2884 
2885   // 2**(-32)
2886   auto Mul2 =
2887       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2888   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2889 
2890   // -(2**32)
2891   auto Mad2 = B.buildFMAD(S32, Trunc,
2892                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2893 
2894   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2895   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2896 
2897   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2898 }
2899 
2900 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2901                                                         Register DstDivReg,
2902                                                         Register DstRemReg,
2903                                                         Register Numer,
2904                                                         Register Denom) const {
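  // Estimate 2^64 / Denom, refine it with two Newton-Raphson style rounds
  // done on 32-bit halves, take the high 64 bits of Numer * estimate as the
  // quotient estimate, and then apply up to two conditional corrections to
  // the quotient and remainder, mirroring the 32-bit expansion above.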
2905   const LLT S32 = LLT::scalar(32);
2906   const LLT S64 = LLT::scalar(64);
2907   const LLT S1 = LLT::scalar(1);
2908   Register RcpLo, RcpHi;
2909 
2910   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2911 
2912   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2913 
2914   auto Zero64 = B.buildConstant(S64, 0);
2915   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2916 
2917   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2918   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2919 
2920   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2921   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2922   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2923 
2924   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2925   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2926   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2927   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2928 
2929   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2930   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2931   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2932   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2933   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2934 
2935   auto Zero32 = B.buildConstant(S32, 0);
2936   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2937   auto Add2_HiC =
2938       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2939   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2940   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2941 
2942   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2943   Register NumerLo = UnmergeNumer.getReg(0);
2944   Register NumerHi = UnmergeNumer.getReg(1);
2945 
2946   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2947   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2948   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2949   Register Mul3_Lo = UnmergeMul3.getReg(0);
2950   Register Mul3_Hi = UnmergeMul3.getReg(1);
2951   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2952   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2953   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2954   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2955 
2956   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2957   Register DenomLo = UnmergeDenom.getReg(0);
2958   Register DenomHi = UnmergeDenom.getReg(1);
2959 
2960   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2961   auto C1 = B.buildSExt(S32, CmpHi);
2962 
2963   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2964   auto C2 = B.buildSExt(S32, CmpLo);
2965 
2966   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2967   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2968 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we have 4 selects
  // after the potential endif to substitute for PHIs.
2972 
2973   // if C3 != 0 ...
2974   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2975   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2976   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2977   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2978 
2979   auto One64 = B.buildConstant(S64, 1);
2980   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2981 
2982   auto C4 =
2983       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2984   auto C5 =
2985       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2986   auto C6 = B.buildSelect(
2987       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2988 
2989   // if (C6 != 0)
2990   auto Add4 = B.buildAdd(S64, Add3, One64);
2991   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2992 
2993   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2994   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2995   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2996 
2997   // endif C6
2998   // endif C3
2999 
3000   if (DstDivReg) {
3001     auto Sel1 = B.buildSelect(
3002         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3003     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3004                   Sel1, MulHi3);
3005   }
3006 
3007   if (DstRemReg) {
3008     auto Sel2 = B.buildSelect(
3009         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3010     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3011                   Sel2, Sub1);
3012   }
3013 }
3014 
3015 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3016                                                   MachineRegisterInfo &MRI,
3017                                                   MachineIRBuilder &B) const {
3018   Register DstDivReg, DstRemReg;
3019   switch (MI.getOpcode()) {
3020   default:
3021     llvm_unreachable("Unexpected opcode!");
3022   case AMDGPU::G_UDIV: {
3023     DstDivReg = MI.getOperand(0).getReg();
3024     break;
3025   }
3026   case AMDGPU::G_UREM: {
3027     DstRemReg = MI.getOperand(0).getReg();
3028     break;
3029   }
3030   case AMDGPU::G_UDIVREM: {
3031     DstDivReg = MI.getOperand(0).getReg();
3032     DstRemReg = MI.getOperand(1).getReg();
3033     break;
3034   }
3035   }
3036 
3037   const LLT S64 = LLT::scalar(64);
3038   const LLT S32 = LLT::scalar(32);
3039   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3040   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3041   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3042   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3043 
3044   if (Ty == S32)
3045     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3046   else if (Ty == S64)
3047     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3048   else
3049     return false;
3050 
3051   MI.eraseFromParent();
3052   return true;
3053 }
3054 
3055 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3056                                                 MachineRegisterInfo &MRI,
3057                                                 MachineIRBuilder &B) const {
3058   const LLT S64 = LLT::scalar(64);
3059   const LLT S32 = LLT::scalar(32);
3060 
3061   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3062   if (Ty != S32 && Ty != S64)
3063     return false;
3064 
3065   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3066   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3067   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3068 
3069   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3070   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3071   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3072 
3073   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3074   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3075 
3076   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3077   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3078 
3079   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3080   switch (MI.getOpcode()) {
3081   default:
3082     llvm_unreachable("Unexpected opcode!");
3083   case AMDGPU::G_SDIV: {
3084     DstDivReg = MI.getOperand(0).getReg();
3085     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3086     break;
3087   }
3088   case AMDGPU::G_SREM: {
3089     DstRemReg = MI.getOperand(0).getReg();
3090     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3091     break;
3092   }
3093   case AMDGPU::G_SDIVREM: {
3094     DstDivReg = MI.getOperand(0).getReg();
3095     DstRemReg = MI.getOperand(1).getReg();
3096     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3097     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3098     break;
3099   }
3100   }
3101 
3102   if (Ty == S32)
3103     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3104   else
3105     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3106 
3107   if (DstDivReg) {
3108     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3109     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3110     B.buildSub(DstDivReg, SignXor, Sign);
3111   }
3112 
3113   if (DstRemReg) {
3114     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3115     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3116     B.buildSub(DstRemReg, SignXor, Sign);
3117   }
3118 
3119   MI.eraseFromParent();
3120   return true;
3121 }
3122 
3123 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3124                                                  MachineRegisterInfo &MRI,
3125                                                  MachineIRBuilder &B) const {
3126   Register Res = MI.getOperand(0).getReg();
3127   Register LHS = MI.getOperand(1).getReg();
3128   Register RHS = MI.getOperand(2).getReg();
3129   uint16_t Flags = MI.getFlags();
3130   LLT ResTy = MRI.getType(Res);
3131 
3132   const MachineFunction &MF = B.getMF();
3133   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3134                             MI.getFlag(MachineInstr::FmAfn);
3135 
3136   if (!AllowInaccurateRcp)
3137     return false;
3138 
3139   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3140     // 1 / x -> RCP(x)
3141     if (CLHS->isExactlyValue(1.0)) {
3142       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3143         .addUse(RHS)
3144         .setMIFlags(Flags);
3145 
3146       MI.eraseFromParent();
3147       return true;
3148     }
3149 
3150     // -1 / x -> RCP( FNEG(x) )
3151     if (CLHS->isExactlyValue(-1.0)) {
3152       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3153       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3154         .addUse(FNeg.getReg(0))
3155         .setMIFlags(Flags);
3156 
3157       MI.eraseFromParent();
3158       return true;
3159     }
3160   }
3161 
3162   // x / y -> x * (1.0 / y)
3163   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3164     .addUse(RHS)
3165     .setMIFlags(Flags);
3166   B.buildFMul(Res, LHS, RCP, Flags);
3167 
3168   MI.eraseFromParent();
3169   return true;
3170 }
3171 
3172 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3173                                                    MachineRegisterInfo &MRI,
3174                                                    MachineIRBuilder &B) const {
3175   Register Res = MI.getOperand(0).getReg();
3176   Register X = MI.getOperand(1).getReg();
3177   Register Y = MI.getOperand(2).getReg();
3178   uint16_t Flags = MI.getFlags();
3179   LLT ResTy = MRI.getType(Res);
3180 
3181   const MachineFunction &MF = B.getMF();
3182   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3183                             MI.getFlag(MachineInstr::FmAfn);
3184 
3185   if (!AllowInaccurateRcp)
3186     return false;
3187 
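  // Refine the hardware reciprocal estimate of 1/Y with two Newton-Raphson
  // steps (R = R + R * (1 - Y * R)), then form X * R and apply one final
  // fused correction with the residual X - Y * (X * R).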
3188   auto NegY = B.buildFNeg(ResTy, Y);
3189   auto One = B.buildFConstant(ResTy, 1.0);
3190 
3191   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3192     .addUse(Y)
3193     .setMIFlags(Flags);
3194 
3195   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3196   R = B.buildFMA(ResTy, Tmp0, R, R);
3197 
3198   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3199   R = B.buildFMA(ResTy, Tmp1, R, R);
3200 
3201   auto Ret = B.buildFMul(ResTy, X, R);
3202   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3203 
3204   B.buildFMA(Res, Tmp2, R, Ret);
3205   MI.eraseFromParent();
3206   return true;
3207 }
3208 
3209 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3210                                          MachineRegisterInfo &MRI,
3211                                          MachineIRBuilder &B) const {
3212   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3213     return true;
3214 
3215   Register Res = MI.getOperand(0).getReg();
3216   Register LHS = MI.getOperand(1).getReg();
3217   Register RHS = MI.getOperand(2).getReg();
3218 
3219   uint16_t Flags = MI.getFlags();
3220 
3221   LLT S16 = LLT::scalar(16);
3222   LLT S32 = LLT::scalar(32);
3223 
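  // f16 division is performed in f32: extend both operands, multiply the
  // numerator by the f32 reciprocal of the denominator, truncate back to f16,
  // and let amdgcn.div.fixup produce the final result for the special cases.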
3224   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3225   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3226 
3227   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3228     .addUse(RHSExt.getReg(0))
3229     .setMIFlags(Flags);
3230 
3231   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3232   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3233 
3234   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3235     .addUse(RDst.getReg(0))
3236     .addUse(RHS)
3237     .addUse(LHS)
3238     .setMIFlags(Flags);
3239 
3240   MI.eraseFromParent();
3241   return true;
3242 }
3243 
3244 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3245 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
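//
// With S_DENORM_MODE, the 4-bit immediate built below carries the FP32
// control in bits [1:0] and the FP64/FP16 control in bits [3:2]; otherwise
// the FP32 denorm control is written via S_SETREG as a 2-bit field at offset
// 4 of the MODE hardware register.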
3246 static void toggleSPDenormMode(bool Enable,
3247                                MachineIRBuilder &B,
3248                                const GCNSubtarget &ST,
3249                                AMDGPU::SIModeRegisterDefaults Mode) {
3250   // Set SP denorm mode to this value.
3251   unsigned SPDenormMode =
3252     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3253 
3254   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3256     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3257 
3258     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3259     B.buildInstr(AMDGPU::S_DENORM_MODE)
3260       .addImm(NewDenormModeValue);
3261 
3262   } else {
3263     // Select FP32 bit field in mode register.
3264     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3265                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3266                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3267 
3268     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3269       .addImm(SPDenormMode)
3270       .addImm(SPDenormModeBitField);
3271   }
3272 }
3273 
3274 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3275                                          MachineRegisterInfo &MRI,
3276                                          MachineIRBuilder &B) const {
3277   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3278     return true;
3279 
3280   Register Res = MI.getOperand(0).getReg();
3281   Register LHS = MI.getOperand(1).getReg();
3282   Register RHS = MI.getOperand(2).getReg();
3283   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3284   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3285 
3286   uint16_t Flags = MI.getFlags();
3287 
3288   LLT S32 = LLT::scalar(32);
3289   LLT S1 = LLT::scalar(1);
3290 
3291   auto One = B.buildFConstant(S32, 1.0f);
3292 
3293   auto DenominatorScaled =
3294     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3295       .addUse(LHS)
3296       .addUse(RHS)
3297       .addImm(0)
3298       .setMIFlags(Flags);
3299   auto NumeratorScaled =
3300     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3301       .addUse(LHS)
3302       .addUse(RHS)
3303       .addImm(1)
3304       .setMIFlags(Flags);
3305 
3306   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3307     .addUse(DenominatorScaled.getReg(0))
3308     .setMIFlags(Flags);
3309   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3310 
3311   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3312   // aren't modeled as reading it.
3313   if (!Mode.allFP32Denormals())
3314     toggleSPDenormMode(true, B, ST, Mode);
3315 
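  // Refine the scaled reciprocal and quotient with FMAs; amdgcn.div.fmas and
  // amdgcn.div.fixup below fold in the scale condition and fix up the special
  // cases (e.g. infinities and NaNs).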
3316   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3317   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3318   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3319   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3320   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3321   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3322 
3323   if (!Mode.allFP32Denormals())
3324     toggleSPDenormMode(false, B, ST, Mode);
3325 
3326   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3327     .addUse(Fma4.getReg(0))
3328     .addUse(Fma1.getReg(0))
3329     .addUse(Fma3.getReg(0))
3330     .addUse(NumeratorScaled.getReg(1))
3331     .setMIFlags(Flags);
3332 
3333   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3334     .addUse(Fmas.getReg(0))
3335     .addUse(RHS)
3336     .addUse(LHS)
3337     .setMIFlags(Flags);
3338 
3339   MI.eraseFromParent();
3340   return true;
3341 }
3342 
3343 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3344                                          MachineRegisterInfo &MRI,
3345                                          MachineIRBuilder &B) const {
3346   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3347     return true;
3348 
3349   Register Res = MI.getOperand(0).getReg();
3350   Register LHS = MI.getOperand(1).getReg();
3351   Register RHS = MI.getOperand(2).getReg();
3352 
3353   uint16_t Flags = MI.getFlags();
3354 
3355   LLT S64 = LLT::scalar(64);
3356   LLT S1 = LLT::scalar(1);
3357 
3358   auto One = B.buildFConstant(S64, 1.0);
3359 
3360   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3361     .addUse(LHS)
3362     .addUse(RHS)
3363     .addImm(0)
3364     .setMIFlags(Flags);
3365 
3366   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3367 
3368   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3369     .addUse(DivScale0.getReg(0))
3370     .setMIFlags(Flags);
3371 
3372   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3373   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3374   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3375 
3376   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3377     .addUse(LHS)
3378     .addUse(RHS)
3379     .addImm(1)
3380     .setMIFlags(Flags);
3381 
3382   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3383   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3384   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3385 
3386   Register Scale;
3387   if (!ST.hasUsableDivScaleConditionOutput()) {
3388     // Workaround a hardware bug on SI where the condition output from div_scale
3389     // is not usable.
3390 
3391     LLT S32 = LLT::scalar(32);
3392 
3393     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3394     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3395     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3396     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3397 
3398     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3399                               Scale1Unmerge.getReg(1));
3400     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3401                               Scale0Unmerge.getReg(1));
3402     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3403   } else {
3404     Scale = DivScale1.getReg(1);
3405   }
3406 
3407   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3408     .addUse(Fma4.getReg(0))
3409     .addUse(Fma3.getReg(0))
3410     .addUse(Mul.getReg(0))
3411     .addUse(Scale)
3412     .setMIFlags(Flags);
3413 
3414   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3415     .addUse(Fmas.getReg(0))
3416     .addUse(RHS)
3417     .addUse(LHS)
3418     .setMIFlags(Flags);
3419 
3420   MI.eraseFromParent();
3421   return true;
3422 }
3423 
3424 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3425                                                  MachineRegisterInfo &MRI,
3426                                                  MachineIRBuilder &B) const {
3427   Register Res = MI.getOperand(0).getReg();
3428   Register LHS = MI.getOperand(2).getReg();
3429   Register RHS = MI.getOperand(3).getReg();
3430   uint16_t Flags = MI.getFlags();
3431 
3432   LLT S32 = LLT::scalar(32);
3433   LLT S1 = LLT::scalar(1);
3434 
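  // 0x6f800000 encodes 2^96 and 0x2f800000 encodes 2^-32 as f32 bit patterns:
  // when |RHS| is huge, the denominator is pre-scaled by 2^-32 before taking
  // the reciprocal and the quotient is rescaled by the same factor afterwards,
  // which keeps the intermediate values in range.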
3435   auto Abs = B.buildFAbs(S32, RHS, Flags);
3436   const APFloat C0Val(1.0f);
3437 
3438   auto C0 = B.buildConstant(S32, 0x6f800000);
3439   auto C1 = B.buildConstant(S32, 0x2f800000);
3440   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3441 
3442   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3443   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3444 
3445   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3446 
3447   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3448     .addUse(Mul0.getReg(0))
3449     .setMIFlags(Flags);
3450 
3451   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3452 
3453   B.buildFMul(Res, Sel, Mul1, Flags);
3454 
3455   MI.eraseFromParent();
3456   return true;
3457 }
3458 
3459 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3460 // FIXME: Why do we handle this one but not other removed instructions?
3461 //
3462 // Reciprocal square root.  The clamp prevents infinite results, clamping
3463 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3464 // +-max_float.
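//
// The expansion below computes fmax(fmin(rsq(x), +max_float), -max_float),
// using the IEEE variants of min/max when the function runs in IEEE mode.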
3465 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3466                                                     MachineRegisterInfo &MRI,
3467                                                     MachineIRBuilder &B) const {
3468   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3469     return true;
3470 
3471   Register Dst = MI.getOperand(0).getReg();
3472   Register Src = MI.getOperand(2).getReg();
3473   auto Flags = MI.getFlags();
3474 
3475   LLT Ty = MRI.getType(Dst);
3476 
3477   const fltSemantics *FltSemantics;
3478   if (Ty == LLT::scalar(32))
3479     FltSemantics = &APFloat::IEEEsingle();
3480   else if (Ty == LLT::scalar(64))
3481     FltSemantics = &APFloat::IEEEdouble();
3482   else
3483     return false;
3484 
3485   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3486     .addUse(Src)
3487     .setMIFlags(Flags);
3488 
  // We don't need to concern ourselves with the sNaN handling difference, since
  // the rsq has already quieted (or not) any NaN input; just use the variant
  // that will directly select for the current IEEE mode.
3491   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3492   const bool UseIEEE = MFI->getMode().IEEE;
3493 
3494   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3495   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3496                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3497 
3498   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3499 
3500   if (UseIEEE)
3501     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3502   else
3503     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3504   MI.eraseFromParent();
3505   return true;
3506 }
3507 
3508 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3509   switch (IID) {
3510   case Intrinsic::amdgcn_ds_fadd:
3511     return AMDGPU::G_ATOMICRMW_FADD;
3512   case Intrinsic::amdgcn_ds_fmin:
3513     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3514   case Intrinsic::amdgcn_ds_fmax:
3515     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3516   default:
3517     llvm_unreachable("not a DS FP intrinsic");
3518   }
3519 }
3520 
3521 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3522                                                       MachineInstr &MI,
3523                                                       Intrinsic::ID IID) const {
3524   GISelChangeObserver &Observer = Helper.Observer;
3525   Observer.changingInstr(MI);
3526 
3527   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3528 
3529   // The remaining operands were used to set fields in the MemOperand on
3530   // construction.
3531   for (int I = 6; I > 3; --I)
3532     MI.RemoveOperand(I);
3533 
3534   MI.RemoveOperand(1); // Remove the intrinsic ID.
3535   Observer.changedInstr(MI);
3536   return true;
3537 }
3538 
3539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3540                                             MachineRegisterInfo &MRI,
3541                                             MachineIRBuilder &B) const {
3542   uint64_t Offset =
3543     ST.getTargetLowering()->getImplicitParameterOffset(
3544       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3545   LLT DstTy = MRI.getType(DstReg);
3546   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3547 
3548   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3549   if (!loadInputValue(KernargPtrReg, B,
3550                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3551     return false;
3552 
3553   // FIXME: This should be nuw
3554   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3555   return true;
3556 }
3557 
3558 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3559                                                  MachineRegisterInfo &MRI,
3560                                                  MachineIRBuilder &B) const {
3561   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3562   if (!MFI->isEntryFunction()) {
3563     return legalizePreloadedArgIntrin(MI, MRI, B,
3564                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3565   }
3566 
3567   Register DstReg = MI.getOperand(0).getReg();
3568   if (!getImplicitArgPtr(DstReg, MRI, B))
3569     return false;
3570 
3571   MI.eraseFromParent();
3572   return true;
3573 }
3574 
3575 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3576                                               MachineRegisterInfo &MRI,
3577                                               MachineIRBuilder &B,
3578                                               unsigned AddrSpace) const {
3579   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3580   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3581   Register Hi32 = Unmerge.getReg(1);
3582 
3583   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3584   MI.eraseFromParent();
3585   return true;
3586 }
3587 
3588 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3589 // offset (the offset that is included in bounds checking and swizzling, to be
3590 // split between the instruction's voffset and immoffset fields) and soffset
3591 // (the offset that is excluded from bounds checking and swizzling, to go in
3592 // the instruction's soffset field).  This function takes the first kind of
3593 // offset and figures out how to split it between voffset and immoffset.
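// Returns {voffset register, immediate offset, original total constant offset}.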
3594 std::tuple<Register, unsigned, unsigned>
3595 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3596                                         Register OrigOffset) const {
3597   const unsigned MaxImm = 4095;
3598   Register BaseReg;
3599   unsigned TotalConstOffset;
3600   const LLT S32 = LLT::scalar(32);
3601   MachineRegisterInfo &MRI = *B.getMRI();
3602 
3603   std::tie(BaseReg, TotalConstOffset) =
3604       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3605 
3606   unsigned ImmOffset = TotalConstOffset;
3607 
3608   // If BaseReg is a pointer, convert it to int.
3609   if (MRI.getType(BaseReg).isPointer())
3610     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3611 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field and fold the remaining
  // multiple of 4096 into the value that is copied/added for the voffset
  // field, so it stands more chance of being CSEd with the copy/add for
  // another similar load/store.
  // However, do not round down to a multiple of 4096 if that multiple is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
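  // Worked example: a constant offset of 8200 becomes ImmOffset = 8 with 8192
  // folded into the voffset; a constant offset of -8 becomes ImmOffset = 0
  // with the full -8 left in the voffset.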
3619   unsigned Overflow = ImmOffset & ~MaxImm;
3620   ImmOffset -= Overflow;
3621   if ((int32_t)Overflow < 0) {
3622     Overflow += ImmOffset;
3623     ImmOffset = 0;
3624   }
3625 
3626   if (Overflow != 0) {
3627     if (!BaseReg) {
3628       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3629     } else {
3630       auto OverflowVal = B.buildConstant(S32, Overflow);
3631       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3632     }
3633   }
3634 
3635   if (!BaseReg)
3636     BaseReg = B.buildConstant(S32, 0).getReg(0);
3637 
3638   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3639 }
3640 
3641 /// Handle register layout difference for f16 images for some subtargets.
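///
/// With unpacked D16 VMEM each 16-bit element is anyext'ed into its own 32-bit
/// register; with the image-store D16 bug the packed data is instead padded
/// with undef so that the number of dwords written matches the element count.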
3642 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3643                                              MachineRegisterInfo &MRI,
3644                                              Register Reg,
3645                                              bool ImageStore) const {
3646   const LLT S16 = LLT::scalar(16);
3647   const LLT S32 = LLT::scalar(32);
3648   LLT StoreVT = MRI.getType(Reg);
3649   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3650 
3651   if (ST.hasUnpackedD16VMem()) {
3652     auto Unmerge = B.buildUnmerge(S16, Reg);
3653 
3654     SmallVector<Register, 4> WideRegs;
3655     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3656       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3657 
3658     int NumElts = StoreVT.getNumElements();
3659 
3660     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3661   }
3662 
3663   if (ImageStore && ST.hasImageStoreD16Bug()) {
3664     if (StoreVT.getNumElements() == 2) {
3665       SmallVector<Register, 4> PackedRegs;
3666       Reg = B.buildBitcast(S32, Reg).getReg(0);
3667       PackedRegs.push_back(Reg);
3668       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3669       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3670     }
3671 
3672     if (StoreVT.getNumElements() == 3) {
3673       SmallVector<Register, 4> PackedRegs;
3674       auto Unmerge = B.buildUnmerge(S16, Reg);
3675       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3676         PackedRegs.push_back(Unmerge.getReg(I));
3677       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3678       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3679       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3680     }
3681 
3682     if (StoreVT.getNumElements() == 4) {
3683       SmallVector<Register, 4> PackedRegs;
3684       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3685       auto Unmerge = B.buildUnmerge(S32, Reg);
3686       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3687         PackedRegs.push_back(Unmerge.getReg(I));
3688       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3689       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3690     }
3691 
3692     llvm_unreachable("invalid data type");
3693   }
3694 
3695   return Reg;
3696 }
3697 
3698 Register AMDGPULegalizerInfo::fixStoreSourceType(
3699   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3700   MachineRegisterInfo *MRI = B.getMRI();
3701   LLT Ty = MRI->getType(VData);
3702 
3703   const LLT S16 = LLT::scalar(16);
3704 
3705   // Fixup illegal register types for i8 stores.
3706   if (Ty == LLT::scalar(8) || Ty == S16) {
3707     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3708     return AnyExt;
3709   }
3710 
3711   if (Ty.isVector()) {
3712     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3713       if (IsFormat)
3714         return handleD16VData(B, *MRI, VData);
3715     }
3716   }
3717 
3718   return VData;
3719 }
3720 
3721 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3722                                               MachineRegisterInfo &MRI,
3723                                               MachineIRBuilder &B,
3724                                               bool IsTyped,
3725                                               bool IsFormat) const {
3726   Register VData = MI.getOperand(1).getReg();
3727   LLT Ty = MRI.getType(VData);
3728   LLT EltTy = Ty.getScalarType();
3729   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3730   const LLT S32 = LLT::scalar(32);
3731 
3732   VData = fixStoreSourceType(B, VData, IsFormat);
3733   Register RSrc = MI.getOperand(2).getReg();
3734 
3735   MachineMemOperand *MMO = *MI.memoperands_begin();
3736   const int MemSize = MMO->getSize();
3737 
3738   unsigned ImmOffset;
3739   unsigned TotalOffset;
3740 
3741   // The typed intrinsics add an immediate after the registers.
3742   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3743 
3744   // The struct intrinsic variants add one additional operand over raw.
3745   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3746   Register VIndex;
3747   int OpOffset = 0;
3748   if (HasVIndex) {
3749     VIndex = MI.getOperand(3).getReg();
3750     OpOffset = 1;
3751   }
3752 
3753   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3754   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3755 
3756   unsigned Format = 0;
3757   if (IsTyped) {
3758     Format = MI.getOperand(5 + OpOffset).getImm();
3759     ++OpOffset;
3760   }
3761 
3762   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3763 
3764   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3765   if (TotalOffset != 0)
3766     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3767 
3768   unsigned Opc;
3769   if (IsTyped) {
3770     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3771                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3772   } else if (IsFormat) {
3773     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3774                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3775   } else {
3776     switch (MemSize) {
3777     case 1:
3778       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3779       break;
3780     case 2:
3781       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3782       break;
3783     default:
3784       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3785       break;
3786     }
3787   }
3788 
3789   if (!VIndex)
3790     VIndex = B.buildConstant(S32, 0).getReg(0);
3791 
3792   auto MIB = B.buildInstr(Opc)
3793     .addUse(VData)              // vdata
3794     .addUse(RSrc)               // rsrc
3795     .addUse(VIndex)             // vindex
3796     .addUse(VOffset)            // voffset
3797     .addUse(SOffset)            // soffset
3798     .addImm(ImmOffset);         // offset(imm)
3799 
3800   if (IsTyped)
3801     MIB.addImm(Format);
3802 
3803   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3804      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3805      .addMemOperand(MMO);
3806 
3807   MI.eraseFromParent();
3808   return true;
3809 }
3810 
3811 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3812                                              MachineRegisterInfo &MRI,
3813                                              MachineIRBuilder &B,
3814                                              bool IsFormat,
3815                                              bool IsTyped) const {
3816   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3817   MachineMemOperand *MMO = *MI.memoperands_begin();
3818   const int MemSize = MMO->getSize();
3819   const LLT S32 = LLT::scalar(32);
3820 
3821   Register Dst = MI.getOperand(0).getReg();
3822   Register RSrc = MI.getOperand(2).getReg();
3823 
3824   // The typed intrinsics add an immediate after the registers.
3825   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3826 
3827   // The struct intrinsic variants add one additional operand over raw.
3828   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3829   Register VIndex;
3830   int OpOffset = 0;
3831   if (HasVIndex) {
3832     VIndex = MI.getOperand(3).getReg();
3833     OpOffset = 1;
3834   }
3835 
3836   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3837   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3838 
3839   unsigned Format = 0;
3840   if (IsTyped) {
3841     Format = MI.getOperand(5 + OpOffset).getImm();
3842     ++OpOffset;
3843   }
3844 
3845   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3846   unsigned ImmOffset;
3847   unsigned TotalOffset;
3848 
3849   LLT Ty = MRI.getType(Dst);
3850   LLT EltTy = Ty.getScalarType();
3851   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3852   const bool Unpacked = ST.hasUnpackedD16VMem();
3853 
3854   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3855   if (TotalOffset != 0)
3856     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3857 
3858   unsigned Opc;
3859 
3860   if (IsTyped) {
3861     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3862                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3863   } else if (IsFormat) {
3864     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3865                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3866   } else {
3867     switch (MemSize) {
3868     case 1:
3869       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3870       break;
3871     case 2:
3872       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3873       break;
3874     default:
3875       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3876       break;
3877     }
3878   }
3879 
3880   Register LoadDstReg;
3881 
3882   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3883   LLT UnpackedTy = Ty.changeElementSize(32);
3884 
3885   if (IsExtLoad)
3886     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3887   else if (Unpacked && IsD16 && Ty.isVector())
3888     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3889   else
3890     LoadDstReg = Dst;
3891 
3892   if (!VIndex)
3893     VIndex = B.buildConstant(S32, 0).getReg(0);
3894 
3895   auto MIB = B.buildInstr(Opc)
3896     .addDef(LoadDstReg)         // vdata
3897     .addUse(RSrc)               // rsrc
3898     .addUse(VIndex)             // vindex
3899     .addUse(VOffset)            // voffset
3900     .addUse(SOffset)            // soffset
3901     .addImm(ImmOffset);         // offset(imm)
3902 
3903   if (IsTyped)
3904     MIB.addImm(Format);
3905 
3906   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3907      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3908      .addMemOperand(MMO);
3909 
3910   if (LoadDstReg != Dst) {
3911     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3912 
    // The result for an extending load was widened; truncate it back to the
    // original type.
3914     if (IsExtLoad)
3915       B.buildTrunc(Dst, LoadDstReg);
3916     else {
3917       // Repack to original 16-bit vector result
3918       // FIXME: G_TRUNC should work, but legalization currently fails
3919       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3920       SmallVector<Register, 4> Repack;
3921       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3922         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3923       B.buildMerge(Dst, Repack);
3924     }
3925   }
3926 
3927   MI.eraseFromParent();
3928   return true;
3929 }
3930 
3931 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3932                                                MachineIRBuilder &B,
3933                                                bool IsInc) const {
3934   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3935                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3936   B.buildInstr(Opc)
3937     .addDef(MI.getOperand(0).getReg())
3938     .addUse(MI.getOperand(2).getReg())
3939     .addUse(MI.getOperand(3).getReg())
3940     .cloneMemRefs(MI);
3941   MI.eraseFromParent();
3942   return true;
3943 }
3944 
3945 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3946   switch (IntrID) {
3947   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3948   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3949     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3950   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3951   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3952     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3953   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3954   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3955     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3956   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3957   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3958     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3959   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3960   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3961     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3962   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3963   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3964     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3965   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3966   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3967     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3968   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3969   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3970     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3971   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3972   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3973     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3974   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3975   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3976     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3977   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3978   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3979     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3980   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3981   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3982     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3983   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3984   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3985     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3986   case Intrinsic::amdgcn_buffer_atomic_fadd:
3987   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3988   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3989     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3990   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3991   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3992     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3993   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3994   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3995     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3996   default:
3997     llvm_unreachable("unhandled atomic opcode");
3998   }
3999 }
4000 
4001 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4002                                                MachineIRBuilder &B,
4003                                                Intrinsic::ID IID) const {
4004   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4005                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4006   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4007 
4008   Register Dst;
4009 
4010   int OpOffset = 0;
4011   if (HasReturn) {
4012     // A few FP atomics do not support return values.
4013     Dst = MI.getOperand(0).getReg();
4014   } else {
4015     OpOffset = -1;
4016   }
4017 
4018   Register VData = MI.getOperand(2 + OpOffset).getReg();
4019   Register CmpVal;
4020 
4021   if (IsCmpSwap) {
4022     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4023     ++OpOffset;
4024   }
4025 
4026   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4027   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4028 
4029   // The struct intrinsic variants add one additional operand over raw.
4030   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4031   Register VIndex;
4032   if (HasVIndex) {
4033     VIndex = MI.getOperand(4 + OpOffset).getReg();
4034     ++OpOffset;
4035   }
4036 
4037   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4038   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4039   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4040 
4041   MachineMemOperand *MMO = *MI.memoperands_begin();
4042 
4043   unsigned ImmOffset;
4044   unsigned TotalOffset;
4045   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4046   if (TotalOffset != 0)
4047     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4048 
4049   if (!VIndex)
4050     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4051 
4052   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4053 
4054   if (HasReturn)
4055     MIB.addDef(Dst);
4056 
4057   MIB.addUse(VData); // vdata
4058 
4059   if (IsCmpSwap)
4060     MIB.addReg(CmpVal);
4061 
4062   MIB.addUse(RSrc)               // rsrc
4063      .addUse(VIndex)             // vindex
4064      .addUse(VOffset)            // voffset
4065      .addUse(SOffset)            // soffset
4066      .addImm(ImmOffset)          // offset(imm)
4067      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4068      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4069      .addMemOperand(MMO);
4070 
4071   MI.eraseFromParent();
4072   return true;
4073 }
4074 
/// Pack the 16-bit address operands of \p MI into dword-sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
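///
/// Consecutive 16-bit components are packed two per dword; the final odd
/// coordinate and odd gradient components are paired with undef. Components
/// that must stay 32-bit (gradients when only the addresses are 16-bit, or
/// coordinates when only the gradients are 16-bit) are passed through
/// unpacked.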
4077 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4078                                       SmallVectorImpl<Register> &PackedAddrs,
4079                                       unsigned ArgOffset,
4080                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4081                                       bool IsA16, bool IsG16) {
4082   const LLT S16 = LLT::scalar(16);
4083   const LLT V2S16 = LLT::vector(2, 16);
4084   auto EndIdx = Intr->VAddrEnd;
4085 
4086   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4087     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4088     if (!SrcOp.isReg())
4089       continue; // _L to _LZ may have eliminated this.
4090 
4091     Register AddrReg = SrcOp.getReg();
4092 
4093     if (I < Intr->GradientStart) {
4094       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4095       PackedAddrs.push_back(AddrReg);
4096     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4097                (I >= Intr->CoordStart && !IsA16)) {
4098       // Handle any gradient or coordinate operands that should not be packed
4099       PackedAddrs.push_back(AddrReg);
4100     } else {
4101       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4102       // derivatives dx/dh and dx/dv are packed with undef.
4103       if (((I + 1) >= EndIdx) ||
4104           ((Intr->NumGradients / 2) % 2 == 1 &&
4105            (I == static_cast<unsigned>(Intr->GradientStart +
4106                                        (Intr->NumGradients / 2) - 1) ||
4107             I == static_cast<unsigned>(Intr->GradientStart +
4108                                        Intr->NumGradients - 1))) ||
4109           // Check for _L to _LZ optimization
4110           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4111         PackedAddrs.push_back(
4112             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4113                 .getReg(0));
4114       } else {
4115         PackedAddrs.push_back(
4116             B.buildBuildVector(
4117                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4118                 .getReg(0));
4119         ++I;
4120       }
4121     }
4122   }
4123 }
4124 
4125 /// Convert from separate vaddr components to a single vector address register,
4126 /// and replace the remaining operands with $noreg.
4127 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4128                                      int DimIdx, int NumVAddrs) {
4129   const LLT S32 = LLT::scalar(32);
4130 
4131   SmallVector<Register, 8> AddrRegs;
4132   for (int I = 0; I != NumVAddrs; ++I) {
4133     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4134     if (SrcOp.isReg()) {
4135       AddrRegs.push_back(SrcOp.getReg());
4136       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4137     }
4138   }
4139 
4140   int NumAddrRegs = AddrRegs.size();
4141   if (NumAddrRegs != 1) {
4142     // Round up to 8 elements for v5-v7
4143     // FIXME: Missing intermediate sized register classes and instructions.
4144     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4145       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4146       auto Undef = B.buildUndef(S32);
4147       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4148       NumAddrRegs = RoundedNumRegs;
4149     }
4150 
4151     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4152     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4153   }
4154 
4155   for (int I = 1; I != NumVAddrs; ++I) {
4156     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4157     if (SrcOp.isReg())
4158       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4159   }
4160 }
4161 
4162 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4163 ///
4164 /// Depending on the subtarget, load/store with 16-bit element data need to be
4165 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4166 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4167 /// registers.
4168 ///
4169 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4174 /// now unnecessary arguments with $noreg.
4175 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4176     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4177     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4178 
4179   const unsigned NumDefs = MI.getNumExplicitDefs();
4180   const unsigned ArgOffset = NumDefs + 1;
4181   bool IsTFE = NumDefs == 2;
4182   // We are only processing the operands of d16 image operations on subtargets
4183   // that use the unpacked register layout, or need to repack the TFE result.
4184 
4185   // TODO: Do we need to guard against already legalized intrinsics?
4186   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4187       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4188 
4189   MachineRegisterInfo *MRI = B.getMRI();
4190   const LLT S32 = LLT::scalar(32);
4191   const LLT S16 = LLT::scalar(16);
4192   const LLT V2S16 = LLT::vector(2, 16);
4193 
4194   unsigned DMask = 0;
4195 
4196   // Check for 16 bit addresses and pack if true.
4197   LLT GradTy =
4198       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4199   LLT AddrTy =
4200       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4201   const bool IsG16 = GradTy == S16;
4202   const bool IsA16 = AddrTy == S16;
4203 
4204   int DMaskLanes = 0;
4205   if (!BaseOpcode->Atomic) {
4206     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4207     if (BaseOpcode->Gather4) {
4208       DMaskLanes = 4;
4209     } else if (DMask != 0) {
4210       DMaskLanes = countPopulation(DMask);
4211     } else if (!IsTFE && !BaseOpcode->Store) {
4212       // If dmask is 0, this is a no-op load. This can be eliminated.
4213       B.buildUndef(MI.getOperand(0));
4214       MI.eraseFromParent();
4215       return true;
4216     }
4217   }
4218 
4219   Observer.changingInstr(MI);
4220   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4221 
4222   unsigned NewOpcode = NumDefs == 0 ?
4223     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4224 
4225   // Track that we legalized this
4226   MI.setDesc(B.getTII().get(NewOpcode));
4227 
  // Since TFE is on and dmask is 0, we still expect an error flag to be
  // returned. Force dmask to be at least 1; otherwise the instruction will
  // fail.
4230   if (IsTFE && DMask == 0) {
4231     DMask = 0x1;
4232     DMaskLanes = 1;
4233     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4234   }
4235 
4236   if (BaseOpcode->Atomic) {
4237     Register VData0 = MI.getOperand(2).getReg();
4238     LLT Ty = MRI->getType(VData0);
4239 
4240     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4241     if (Ty.isVector())
4242       return false;
4243 
4244     if (BaseOpcode->AtomicX2) {
4245       Register VData1 = MI.getOperand(3).getReg();
4246       // The two values are packed in one register.
4247       LLT PackedTy = LLT::vector(2, Ty);
4248       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4249       MI.getOperand(2).setReg(Concat.getReg(0));
4250       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4251     }
4252   }
4253 
4254   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4255 
4256   // Optimize _L to _LZ when _L is zero
4257   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4258           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4259     const ConstantFP *ConstantLod;
4260 
4261     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4262                  m_GFCst(ConstantLod))) {
4263       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4264         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4265         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4266             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4267                                                       Intr->Dim);
4268 
4269         // The starting indexes should remain in the same place.
4270         --CorrectedNumVAddrs;
4271 
4272         MI.getOperand(MI.getNumExplicitDefs())
4273             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4274         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4275         Intr = NewImageDimIntr;
4276       }
4277     }
4278   }
4279 
  // Optimize _mip away when 'lod' is zero.
4281   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4282     int64_t ConstantLod;
4283     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4284                  m_ICst(ConstantLod))) {
4285       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4288         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4289         --CorrectedNumVAddrs;
4290       }
4291     }
4292   }
4293 
4294   // Rewrite the addressing register layout before doing anything else.
4295   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control, so
    // both gradients and addresses must be 16-bit.
4298     return false;
4299   }
4300 
4301   if (IsA16 && !ST.hasA16()) {
4302     // A16 not supported
4303     return false;
4304   }
4305 
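  // With 16-bit addresses and/or gradients, pack the coordinate operands two
  // to a dword, then either keep them split for NSA or concatenate them into a
  // single vector register.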
4306   if (IsA16 || IsG16) {
4307     if (Intr->NumVAddrs > 1) {
4308       SmallVector<Register, 4> PackedRegs;
4309 
4310       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4311                                 IsG16);
4312 
4313       // See also below in the non-a16 branch
4314       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4315 
4316       if (!UseNSA && PackedRegs.size() > 1) {
4317         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4318         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4319         PackedRegs[0] = Concat.getReg(0);
4320         PackedRegs.resize(1);
4321       }
4322 
4323       const unsigned NumPacked = PackedRegs.size();
4324       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4325         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4326         if (!SrcOp.isReg()) {
4327           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4328           continue;
4329         }
4330 
4331         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4332 
4333         if (I - Intr->VAddrStart < NumPacked)
4334           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4335         else
4336           SrcOp.setReg(AMDGPU::NoRegister);
4337       }
4338     }
4339   } else {
4340     // If the register allocator cannot place the address registers contiguously
4341     // without introducing moves, then using the non-sequential address encoding
4342     // is always preferable, since it saves VALU instructions and is usually a
4343     // wash in terms of code size or even better.
4344     //
4345     // However, we currently have no way of hinting to the register allocator
4346     // that MIMG addresses should be placed contiguously when it is possible to
4347     // do so, so force non-NSA for the common 2-address case as a heuristic.
4348     //
4349     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4350     // allocation when possible.
4351     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4352 
4353     if (!UseNSA && Intr->NumVAddrs > 1)
4354       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4355                                Intr->NumVAddrs);
4356   }
4357 
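  // Append an immediate that records the 16-bit modes for selection:
  // bit 0 = A16, bit 1 = G16.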
4358   int Flags = 0;
4359   if (IsA16)
4360     Flags |= 1;
4361   if (IsG16)
4362     Flags |= 2;
4363   MI.addOperand(MachineOperand::CreateImm(Flags));
4364 
4365   if (BaseOpcode->Store) { // No TFE for stores?
4366     // TODO: Handle dmask trim
4367     Register VData = MI.getOperand(1).getReg();
4368     LLT Ty = MRI->getType(VData);
4369     if (!Ty.isVector() || Ty.getElementType() != S16)
4370       return true;
4371 
4372     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4373     if (RepackedReg != VData) {
4374       MI.getOperand(1).setReg(RepackedReg);
4375     }
4376 
4377     return true;
4378   }
4379 
4380   Register DstReg = MI.getOperand(0).getReg();
4381   LLT Ty = MRI->getType(DstReg);
4382   const LLT EltTy = Ty.getScalarType();
4383   const bool IsD16 = Ty.getScalarType() == S16;
4384   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4385 
4386   // Confirm that the return type is large enough for the dmask specified
4387   if (NumElts < DMaskLanes)
4388     return false;
4389 
4390   if (NumElts > 4 || DMaskLanes > 4)
4391     return false;
4392 
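  // Only the enabled dmask lanes are actually returned, so shrink the result
  // type to that many elements (a dmask of 0 still produces one element).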
4393   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4394   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4395 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4399   LLT RoundedTy;
4400 
  // S32 vector to cover all data, plus the TFE result element.
4402   LLT TFETy;
4403 
4404   // Register type to use for each loaded component. Will be S32 or V2S16.
4405   LLT RegTy;
4406 
4407   if (IsD16 && ST.hasUnpackedD16VMem()) {
4408     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4409     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4410     RegTy = S32;
4411   } else {
4412     unsigned EltSize = EltTy.getSizeInBits();
4413     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4414     unsigned RoundedSize = 32 * RoundedElts;
4415     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4416     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4417     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4418   }
4419 
4420   // The return type does not need adjustment.
4421   // TODO: Should we change s16 case to s32 or <2 x s16>?
4422   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4423     return true;
4424 
4425   Register Dst1Reg;
4426 
4427   // Insert after the instruction.
4428   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4429 
4430   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4431   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4432   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4433   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4434 
4435   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4436 
4437   MI.getOperand(0).setReg(NewResultReg);
4438 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4443 
4444   if (IsTFE) {
4445     Dst1Reg = MI.getOperand(1).getReg();
4446     if (MRI->getType(Dst1Reg) != S32)
4447       return false;
4448 
4449     // TODO: Make sure the TFE operand bit is set.
4450     MI.RemoveOperand(1);
4451 
4452     // Handle the easy case that requires no repack instructions.
4453     if (Ty == S32) {
4454       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4455       return true;
4456     }
4457   }
4458 
4459   // Now figure out how to copy the new result register back into the old
4460   // result.
4461   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4462 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4464 
4465   if (ResultNumRegs == 1) {
4466     assert(!IsTFE);
4467     ResultRegs[0] = NewResultReg;
4468   } else {
4469     // We have to repack into a new vector of some kind.
4470     for (int I = 0; I != NumDataRegs; ++I)
4471       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4472     B.buildUnmerge(ResultRegs, NewResultReg);
4473 
4474     // Drop the final TFE element to get the data part. The TFE result is
4475     // directly written to the right place already.
4476     if (IsTFE)
4477       ResultRegs.resize(NumDataRegs);
4478   }
4479 
  // For an s16 scalar result, the loaded value lives in an s32 register, so
  // truncate it to the destination regardless of packed vs. unpacked.
4482   if (IsD16 && !Ty.isVector()) {
4483     B.buildTrunc(DstReg, ResultRegs[0]);
4484     return true;
4485   }
4486 
4487   // Avoid a build/concat_vector of 1 entry.
4488   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4489     B.buildBitcast(DstReg, ResultRegs[0]);
4490     return true;
4491   }
4492 
4493   assert(Ty.isVector());
4494 
4495   if (IsD16) {
4496     // For packed D16 results with TFE enabled, all the data components are
4497     // S32. Cast back to the expected type.
4498     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4501     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4502       for (Register &Reg : ResultRegs)
4503         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4504     } else if (ST.hasUnpackedD16VMem()) {
4505       for (Register &Reg : ResultRegs)
4506         Reg = B.buildTrunc(S16, Reg).getReg(0);
4507     }
4508   }
4509 
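  // Append undef registers so the final build/concat_vectors covers the full
  // width of the original result type.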
4510   auto padWithUndef = [&](LLT Ty, int NumElts) {
4511     if (NumElts == 0)
4512       return;
4513     Register Undef = B.buildUndef(Ty).getReg(0);
4514     for (int I = 0; I != NumElts; ++I)
4515       ResultRegs.push_back(Undef);
4516   };
4517 
4518   // Pad out any elements eliminated due to the dmask.
4519   LLT ResTy = MRI->getType(ResultRegs[0]);
4520   if (!ResTy.isVector()) {
4521     padWithUndef(ResTy, NumElts - ResultRegs.size());
4522     B.buildBuildVector(DstReg, ResultRegs);
4523     return true;
4524   }
4525 
4526   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4527   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4528 
4529   // Deal with the one annoying legal case.
4530   const LLT V3S16 = LLT::vector(3, 16);
4531   if (Ty == V3S16) {
4532     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4533     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4534     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4535     return true;
4536   }
4537 
4538   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4539   B.buildConcatVectors(DstReg, ResultRegs);
4540   return true;
4541 }
4542 
4543 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4544   LegalizerHelper &Helper, MachineInstr &MI) const {
4545   MachineIRBuilder &B = Helper.MIRBuilder;
4546   GISelChangeObserver &Observer = Helper.Observer;
4547 
4548   Register Dst = MI.getOperand(0).getReg();
4549   LLT Ty = B.getMRI()->getType(Dst);
4550   unsigned Size = Ty.getSizeInBits();
4551   MachineFunction &MF = B.getMF();
4552 
4553   Observer.changingInstr(MI);
4554 
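  // If this result type is one we prefer to load through a different register
  // type (see shouldBitcastLoadStoreType), rewrite the destination through a
  // bitcast before swapping in the target opcode.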
4555   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4556     Ty = getBitcastRegisterType(Ty);
4557     Helper.bitcastDst(MI, Ty, 0);
4558     Dst = MI.getOperand(0).getReg();
4559     B.setInsertPt(B.getMBB(), MI);
4560   }
4561 
4562   // FIXME: We don't really need this intermediate instruction. The intrinsic
4563   // should be fixed to have a memory operand. Since it's readnone, we're not
4564   // allowed to add one.
4565   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4566   MI.RemoveOperand(1); // Remove intrinsic ID
4567 
4568   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4569   // TODO: Should this use datalayout alignment?
4570   const unsigned MemSize = (Size + 7) / 8;
4571   const Align MemAlign(4);
4572   MachineMemOperand *MMO = MF.getMachineMemOperand(
4573       MachinePointerInfo(),
4574       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4575           MachineMemOperand::MOInvariant,
4576       MemSize, MemAlign);
4577   MI.addMemOperand(MF, MMO);
4578 
4579   // There are no 96-bit result scalar loads, but widening to 128-bit should
4580   // always be legal. We may need to restore this to a 96-bit result if it turns
4581   // out this needs to be converted to a vector load during RegBankSelect.
4582   if (!isPowerOf2_32(Size)) {
4583     if (Ty.isVector())
4584       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4585     else
4586       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4587   }
4588 
4589   Observer.changedInstr(MI);
4590   return true;
4591 }
4592 
4593 // TODO: Move to selection
4594 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4595                                                 MachineRegisterInfo &MRI,
4596                                                 MachineIRBuilder &B) const {
4597   if (!ST.isTrapHandlerEnabled() ||
4598       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4599     return legalizeTrapEndpgm(MI, MRI, B);
4600 
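  // Pick the lowering based on the HSA ABI version: V2/V3 always pass the
  // queue pointer to the trap handler, while V4 can use the doorbell ID
  // directly when the subtarget supports it.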
4601   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4602     switch (*HsaAbiVer) {
4603     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4604     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4605       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4606     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4607       return ST.supportsGetDoorbellID() ?
4608           legalizeTrapHsa(MI, MRI, B) :
4609           legalizeTrapHsaQueuePtr(MI, MRI, B);
4610     }
4611   }
4612 
4613   llvm_unreachable("Unknown trap handler");
4614 }
4615 
4616 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4617     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4618   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4619   MI.eraseFromParent();
4620   return true;
4621 }
4622 
4623 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4624     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4625   // Pass queue pointer to trap handler as input, and insert trap instruction
4626   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4627   Register LiveIn =
4628     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4629   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4630     return false;
4631 
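  // The trap handler expects the queue pointer in SGPR0:SGPR1.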
4632   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4633   B.buildCopy(SGPR01, LiveIn);
4634   B.buildInstr(AMDGPU::S_TRAP)
4635       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4636       .addReg(SGPR01, RegState::Implicit);
4637 
4638   MI.eraseFromParent();
4639   return true;
4640 }
4641 
4642 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4643     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4644   B.buildInstr(AMDGPU::S_TRAP)
4645       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4646   MI.eraseFromParent();
4647   return true;
4648 }
4649 
4650 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4651     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4654   if (!ST.isTrapHandlerEnabled() ||
4655       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4656     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4657                                      "debugtrap handler not supported",
4658                                      MI.getDebugLoc(), DS_Warning);
4659     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4660     Ctx.diagnose(NoTrap);
4661   } else {
4662     // Insert debug-trap instruction
4663     B.buildInstr(AMDGPU::S_TRAP)
4664         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4665   }
4666 
4667   MI.eraseFromParent();
4668   return true;
4669 }
4670 
4671 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4672                                                MachineIRBuilder &B) const {
4673   MachineRegisterInfo &MRI = *B.getMRI();
4674   const LLT S16 = LLT::scalar(16);
4675   const LLT S32 = LLT::scalar(32);
4676 
4677   Register DstReg = MI.getOperand(0).getReg();
4678   Register NodePtr = MI.getOperand(2).getReg();
4679   Register RayExtent = MI.getOperand(3).getReg();
4680   Register RayOrigin = MI.getOperand(4).getReg();
4681   Register RayDir = MI.getOperand(5).getReg();
4682   Register RayInvDir = MI.getOperand(6).getReg();
4683   Register TDescr = MI.getOperand(7).getReg();
4684 
4685   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
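  // Select the NSA image opcode variant based on the node pointer width
  // (32 vs 64 bit) and whether the ray uses 16-bit components.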
4687   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4688                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4689                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4690                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4691 
4692   SmallVector<Register, 12> Ops;
4693   if (Is64) {
4694     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4695     Ops.push_back(Unmerge.getReg(0));
4696     Ops.push_back(Unmerge.getReg(1));
4697   } else {
4698     Ops.push_back(NodePtr);
4699   }
4700   Ops.push_back(RayExtent);
4701 
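  // Unmerge a <4 x s32> vector operand and push only its first three 32-bit
  // lanes; the fourth lane is dropped.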
4702   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4703     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4704     Ops.push_back(Unmerge.getReg(0));
4705     Ops.push_back(Unmerge.getReg(1));
4706     Ops.push_back(Unmerge.getReg(2));
4707   };
4708 
4709   packLanes(RayOrigin);
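  // For a16 rays, the direction and inverse direction are <4 x s16>; repack
  // the six used half components into three dwords.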
4710   if (IsA16) {
4711     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4712     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4713     Register R1 = MRI.createGenericVirtualRegister(S32);
4714     Register R2 = MRI.createGenericVirtualRegister(S32);
4715     Register R3 = MRI.createGenericVirtualRegister(S32);
4716     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4717     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4718     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4719     Ops.push_back(R1);
4720     Ops.push_back(R2);
4721     Ops.push_back(R3);
4722   } else {
4723     packLanes(RayDir);
4724     packLanes(RayInvDir);
4725   }
4726 
4727   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4728     .addDef(DstReg)
4729     .addImm(Opcode);
4730 
4731   for (Register R : Ops) {
4732     MIB.addUse(R);
4733   }
4734 
4735   MIB.addUse(TDescr)
4736      .addImm(IsA16 ? 1 : 0)
4737      .cloneMemRefs(MI);
4738 
4739   MI.eraseFromParent();
4740   return true;
4741 }
4742 
4743 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4744                                             MachineInstr &MI) const {
4745   MachineIRBuilder &B = Helper.MIRBuilder;
4746   MachineRegisterInfo &MRI = *B.getMRI();
4747 
  // For the control-flow intrinsics, replace the G_BRCOND user with the
  // exec-manipulating branch pseudos.
4749   auto IntrID = MI.getIntrinsicID();
4750   switch (IntrID) {
4751   case Intrinsic::amdgcn_if:
4752   case Intrinsic::amdgcn_else: {
4753     MachineInstr *Br = nullptr;
4754     MachineBasicBlock *UncondBrTarget = nullptr;
4755     bool Negated = false;
4756     if (MachineInstr *BrCond =
4757             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4758       const SIRegisterInfo *TRI
4759         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4760 
4761       Register Def = MI.getOperand(1).getReg();
4762       Register Use = MI.getOperand(3).getReg();
4763 
4764       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4765 
4766       if (Negated)
4767         std::swap(CondBrTarget, UncondBrTarget);
4768 
4769       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4770       if (IntrID == Intrinsic::amdgcn_if) {
4771         B.buildInstr(AMDGPU::SI_IF)
4772           .addDef(Def)
4773           .addUse(Use)
4774           .addMBB(UncondBrTarget);
4775       } else {
4776         B.buildInstr(AMDGPU::SI_ELSE)
4777             .addDef(Def)
4778             .addUse(Use)
4779             .addMBB(UncondBrTarget);
4780       }
4781 
4782       if (Br) {
4783         Br->getOperand(0).setMBB(CondBrTarget);
4784       } else {
4785         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4786         // since we're swapping branch targets it needs to be reinserted.
4787         // FIXME: IRTranslator should probably not do this
4788         B.buildBr(*CondBrTarget);
4789       }
4790 
4791       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4792       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4793       MI.eraseFromParent();
4794       BrCond->eraseFromParent();
4795       return true;
4796     }
4797 
4798     return false;
4799   }
4800   case Intrinsic::amdgcn_loop: {
4801     MachineInstr *Br = nullptr;
4802     MachineBasicBlock *UncondBrTarget = nullptr;
4803     bool Negated = false;
4804     if (MachineInstr *BrCond =
4805             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4806       const SIRegisterInfo *TRI
4807         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4808 
4809       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4810       Register Reg = MI.getOperand(2).getReg();
4811 
4812       if (Negated)
4813         std::swap(CondBrTarget, UncondBrTarget);
4814 
4815       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4816       B.buildInstr(AMDGPU::SI_LOOP)
4817         .addUse(Reg)
4818         .addMBB(UncondBrTarget);
4819 
4820       if (Br)
4821         Br->getOperand(0).setMBB(CondBrTarget);
4822       else
4823         B.buildBr(*CondBrTarget);
4824 
4825       MI.eraseFromParent();
4826       BrCond->eraseFromParent();
4827       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4828       return true;
4829     }
4830 
4831     return false;
4832   }
4833   case Intrinsic::amdgcn_kernarg_segment_ptr:
4834     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4835       // This only makes sense to call in a kernel, so just lower to null.
4836       B.buildConstant(MI.getOperand(0).getReg(), 0);
4837       MI.eraseFromParent();
4838       return true;
4839     }
4840 
4841     return legalizePreloadedArgIntrin(
4842       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4843   case Intrinsic::amdgcn_implicitarg_ptr:
4844     return legalizeImplicitArgPtr(MI, MRI, B);
4845   case Intrinsic::amdgcn_workitem_id_x:
4846     return legalizePreloadedArgIntrin(MI, MRI, B,
4847                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4848   case Intrinsic::amdgcn_workitem_id_y:
4849     return legalizePreloadedArgIntrin(MI, MRI, B,
4850                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4851   case Intrinsic::amdgcn_workitem_id_z:
4852     return legalizePreloadedArgIntrin(MI, MRI, B,
4853                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4854   case Intrinsic::amdgcn_workgroup_id_x:
4855     return legalizePreloadedArgIntrin(MI, MRI, B,
4856                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4857   case Intrinsic::amdgcn_workgroup_id_y:
4858     return legalizePreloadedArgIntrin(MI, MRI, B,
4859                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4860   case Intrinsic::amdgcn_workgroup_id_z:
4861     return legalizePreloadedArgIntrin(MI, MRI, B,
4862                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4863   case Intrinsic::amdgcn_dispatch_ptr:
4864     return legalizePreloadedArgIntrin(MI, MRI, B,
4865                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4866   case Intrinsic::amdgcn_queue_ptr:
4867     return legalizePreloadedArgIntrin(MI, MRI, B,
4868                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4869   case Intrinsic::amdgcn_implicit_buffer_ptr:
4870     return legalizePreloadedArgIntrin(
4871       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4872   case Intrinsic::amdgcn_dispatch_id:
4873     return legalizePreloadedArgIntrin(MI, MRI, B,
4874                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4875   case Intrinsic::amdgcn_fdiv_fast:
4876     return legalizeFDIVFastIntrin(MI, MRI, B);
4877   case Intrinsic::amdgcn_is_shared:
4878     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4879   case Intrinsic::amdgcn_is_private:
4880     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4881   case Intrinsic::amdgcn_wavefrontsize: {
4882     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4883     MI.eraseFromParent();
4884     return true;
4885   }
4886   case Intrinsic::amdgcn_s_buffer_load:
4887     return legalizeSBufferLoad(Helper, MI);
4888   case Intrinsic::amdgcn_raw_buffer_store:
4889   case Intrinsic::amdgcn_struct_buffer_store:
4890     return legalizeBufferStore(MI, MRI, B, false, false);
4891   case Intrinsic::amdgcn_raw_buffer_store_format:
4892   case Intrinsic::amdgcn_struct_buffer_store_format:
4893     return legalizeBufferStore(MI, MRI, B, false, true);
4894   case Intrinsic::amdgcn_raw_tbuffer_store:
4895   case Intrinsic::amdgcn_struct_tbuffer_store:
4896     return legalizeBufferStore(MI, MRI, B, true, true);
4897   case Intrinsic::amdgcn_raw_buffer_load:
4898   case Intrinsic::amdgcn_struct_buffer_load:
4899     return legalizeBufferLoad(MI, MRI, B, false, false);
4900   case Intrinsic::amdgcn_raw_buffer_load_format:
4901   case Intrinsic::amdgcn_struct_buffer_load_format:
4902     return legalizeBufferLoad(MI, MRI, B, true, false);
4903   case Intrinsic::amdgcn_raw_tbuffer_load:
4904   case Intrinsic::amdgcn_struct_tbuffer_load:
4905     return legalizeBufferLoad(MI, MRI, B, true, true);
4906   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4907   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4908   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4909   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4910   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4911   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4912   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4913   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4914   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4915   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4916   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4917   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4918   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4919   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4920   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4921   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4922   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4923   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4924   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4925   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4926   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4927   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4928   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4929   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4930   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4931   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4932   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4933   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4934   case Intrinsic::amdgcn_buffer_atomic_fadd:
4935   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4936   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4937   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4938   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4939     return legalizeBufferAtomic(MI, B, IntrID);
4940   case Intrinsic::amdgcn_atomic_inc:
4941     return legalizeAtomicIncDec(MI, B, true);
4942   case Intrinsic::amdgcn_atomic_dec:
4943     return legalizeAtomicIncDec(MI, B, false);
4944   case Intrinsic::trap:
4945     return legalizeTrapIntrinsic(MI, MRI, B);
4946   case Intrinsic::debugtrap:
4947     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4948   case Intrinsic::amdgcn_rsq_clamp:
4949     return legalizeRsqClampIntrinsic(MI, MRI, B);
4950   case Intrinsic::amdgcn_ds_fadd:
4951   case Intrinsic::amdgcn_ds_fmin:
4952   case Intrinsic::amdgcn_ds_fmax:
4953     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4954   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4955     return legalizeBVHIntrinsic(MI, B);
4956   default: {
4957     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4958             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4959       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4960     return true;
4961   }
4962   }
4963 
4964   return true;
4965 }
4966