1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
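// For example, <3 x s32> becomes <4 x s32> and <5 x s16> becomes <8 x s16>.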
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
// Round the scalar size in bits up to the next power of two.
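// For example, s24 becomes s32 and s65 becomes s128.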
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
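// Reduce the number of elements so each resulting piece fits in 64 bits. Used
// below for vectors wider than 64 bits, e.g. <4 x s32> (128 bits) becomes
// <2 x s32>.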
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
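// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).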
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
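// Compute the register-friendly type to bitcast a value of this size to: a
// scalar for sizes of 32 bits or less, otherwise a vector of 32-bit elements,
// e.g. <6 x s16> (96 bits) -> <3 x s32>.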
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
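// Bitcast to a vector of 32-bit elements; the total size must already be a
// multiple of 32, e.g. <4 x s16> (64 bits) -> <2 x s32>, s96 -> <3 x s32>.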
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
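// For example, an s64 load from a 16-bit memory access has its result
// narrowed to s32 by the rules below.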
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
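// This is the maximum memory access size (in bits) for a single load/store in
// the given address space; larger accesses are split up by the rules below,
// e.g. without DS128 a 128-bit local (LDS) access is broken into 64-bit
// pieces.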
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // Do not handle extending vector loads.
280   if (Ty.isVector() && MemSize != RegSize)
281     return false;
282 
283   // TODO: We should be able to widen loads if the alignment is high enough, but
284   // we also need to modify the memory access size.
285 #if 0
286   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
289 #endif
290 
291   // Only 1-byte and 2-byte to 32-bit extloads are valid.
292   if (MemSize != RegSize && RegSize != 32)
293     return false;
294 
295   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296     return false;
297 
298   switch (MemSize) {
299   case 8:
300   case 16:
301   case 32:
302   case 64:
303   case 128:
304     break;
305   case 96:
306     if (!ST.hasDwordx3LoadStores())
307       return false;
308     break;
309   case 256:
310   case 512:
311     // These may contextually need to be broken down.
312     break;
313   default:
314     return false;
315   }
316 
317   assert(RegSize >= MemSize);
318 
319   if (AlignBits < MemSize) {
320     const SITargetLowering *TLI = ST.getTargetLowering();
321     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322                                                  Align(AlignBits / 8)))
323       return false;
324   }
325 
326   return true;
327 }
328 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
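// For example, with the workaround an <8 x s16> load result ends up bitcast
// to <4 x s32>, and an s96 result to <3 x s32>.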
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334   if (EnableNewLegality)
335     return false;
336 
337   const unsigned Size = Ty.getSizeInBits();
338   if (Size <= 64)
339     return false;
340   if (!Ty.isVector())
341     return true;
342 
343   LLT EltTy = Ty.getElementType();
344   if (EltTy.isPointer())
345     return true;
346 
347   unsigned EltSize = EltTy.getSizeInBits();
348   return EltSize != 32 && EltSize != 64;
349 }
350 
351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352   const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360                                        const unsigned MemSizeInBits) {
361   const unsigned Size = Ty.getSizeInBits();
362   if (Size != MemSizeInBits)
363     return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
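/// For example, an s24 global load known to be 32-bit aligned may be widened
/// to a full 32-bit load.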
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375                             unsigned AlignInBits, unsigned AddrSpace,
376                             unsigned Opcode) {
377   // We don't want to widen cases that are naturally legal.
378   if (isPowerOf2_32(SizeInBits))
379     return false;
380 
381   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382   // end up widening these for a scalar load during RegBankSelect, since there
383   // aren't 96-bit scalar loads.
384   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385     return false;
386 
387   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388     return false;
389 
390   // A load is known dereferenceable up to the alignment, so it's legal to widen
391   // to it.
392   //
393   // TODO: Could check dereferenceable for less aligned cases.
394   unsigned RoundedSize = NextPowerOf2(SizeInBits);
395   if (AlignInBits < RoundedSize)
396     return false;
397 
398   // Do not widen if it would introduce a slow unaligned load.
399   const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402              RoundedSize, AddrSpace, Align(AlignInBits / 8),
403              MachineMemOperand::MOLoad, &Fast) &&
404          Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408                             unsigned Opcode) {
409   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410     return false;
411 
412   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413                          Query.MMODescrs[0].AlignInBits,
414                          Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
419   :  ST(ST_) {
420   using namespace TargetOpcode;
421 
422   auto GetAddrSpacePtr = [&TM](unsigned AS) {
423     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424   };
425 
426   const LLT S1 = LLT::scalar(1);
427   const LLT S8 = LLT::scalar(8);
428   const LLT S16 = LLT::scalar(16);
429   const LLT S32 = LLT::scalar(32);
430   const LLT S64 = LLT::scalar(64);
431   const LLT S128 = LLT::scalar(128);
432   const LLT S256 = LLT::scalar(256);
433   const LLT S512 = LLT::scalar(512);
434   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436   const LLT V2S8 = LLT::vector(2, 8);
437   const LLT V2S16 = LLT::vector(2, 16);
438   const LLT V4S16 = LLT::vector(4, 16);
439 
440   const LLT V2S32 = LLT::vector(2, 32);
441   const LLT V3S32 = LLT::vector(3, 32);
442   const LLT V4S32 = LLT::vector(4, 32);
443   const LLT V5S32 = LLT::vector(5, 32);
444   const LLT V6S32 = LLT::vector(6, 32);
445   const LLT V7S32 = LLT::vector(7, 32);
446   const LLT V8S32 = LLT::vector(8, 32);
447   const LLT V9S32 = LLT::vector(9, 32);
448   const LLT V10S32 = LLT::vector(10, 32);
449   const LLT V11S32 = LLT::vector(11, 32);
450   const LLT V12S32 = LLT::vector(12, 32);
451   const LLT V13S32 = LLT::vector(13, 32);
452   const LLT V14S32 = LLT::vector(14, 32);
453   const LLT V15S32 = LLT::vector(15, 32);
454   const LLT V16S32 = LLT::vector(16, 32);
455   const LLT V32S32 = LLT::vector(32, 32);
456 
457   const LLT V2S64 = LLT::vector(2, 64);
458   const LLT V3S64 = LLT::vector(3, 64);
459   const LLT V4S64 = LLT::vector(4, 64);
460   const LLT V5S64 = LLT::vector(5, 64);
461   const LLT V6S64 = LLT::vector(6, 64);
462   const LLT V7S64 = LLT::vector(7, 64);
463   const LLT V8S64 = LLT::vector(8, 64);
464   const LLT V16S64 = LLT::vector(16, 64);
465 
466   std::initializer_list<LLT> AllS32Vectors =
467     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469   std::initializer_list<LLT> AllS64Vectors =
470     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
480   const LLT CodePtr = FlatPtr;
481 
482   const std::initializer_list<LLT> AddrSpaces64 = {
483     GlobalPtr, ConstantPtr, FlatPtr
484   };
485 
486   const std::initializer_list<LLT> AddrSpaces32 = {
487     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488   };
489 
490   const std::initializer_list<LLT> FPTypesBase = {
491     S32, S64
492   };
493 
494   const std::initializer_list<LLT> FPTypes16 = {
495     S32, S64, S16
496   };
497 
498   const std::initializer_list<LLT> FPTypesPK16 = {
499     S32, S64, S16, V2S16
500   };
501 
502   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504   auto &LegacyInfo = getLegacyLegalizerInfo();
505   LegacyInfo.setAction({G_BRCOND, S1},
506                        LegacyLegalizeActions::Legal); // VCC branches
507   LegacyInfo.setAction({G_BRCOND, S32},
508                        LegacyLegalizeActions::Legal); // SCC branches
509 
510   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511   // elements for v3s16
512   getActionDefinitionsBuilder(G_PHI)
513     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514     .legalFor(AllS32Vectors)
515     .legalFor(AllS64Vectors)
516     .legalFor(AddrSpaces64)
517     .legalFor(AddrSpaces32)
518     .legalIf(isPointer(0))
519     .clampScalar(0, S16, S256)
520     .widenScalarToNextPow2(0, 32)
521     .clampMaxNumElements(0, S32, 16)
522     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523     .scalarize(0);
524 
525   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526     // Full set of gfx9 features.
527     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528       .legalFor({S32, S16, V2S16})
529       .clampScalar(0, S16, S32)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32);
533 
534     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535       .legalFor({S32, S16, V2S16}) // Clamp modifier
536       .minScalarOrElt(0, S16)
537       .clampMaxNumElements(0, S16, 2)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32)
540       .lower();
541   } else if (ST.has16BitInsts()) {
542     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543       .legalFor({S32, S16})
544       .clampScalar(0, S16, S32)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547 
548     // Technically the saturating operations require clamp bit support, but this
549     // was introduced at the same time as 16-bit operations.
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551       .legalFor({S32, S16}) // Clamp modifier
552       .minScalar(0, S16)
553       .scalarize(0)
554       .widenScalarToNextPow2(0, 16)
555       .lower();
556 
557     // We're just lowering this, but it helps get a better result to try to
558     // coerce to the desired type first.
559     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560       .minScalar(0, S16)
561       .scalarize(0)
562       .lower();
563   } else {
564     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565       .legalFor({S32})
566       .clampScalar(0, S32, S32)
567       .scalarize(0);
568 
569     if (ST.hasIntClamp()) {
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .legalFor({S32}) // Clamp modifier.
572         .scalarize(0)
573         .minScalarOrElt(0, S32)
574         .lower();
575     } else {
576       // Clamp bit support was added in VI, along with 16-bit operations.
577       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578         .minScalar(0, S32)
579         .scalarize(0)
580         .lower();
581     }
582 
583     // FIXME: DAG expansion gets better results. The widening uses the smaller
584     // range values and goes for the min/max lowering directly.
585     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586       .minScalar(0, S32)
587       .scalarize(0)
588       .lower();
589   }
590 
591   getActionDefinitionsBuilder(
592       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593       .customFor({S32, S64})
594       .clampScalar(0, S32, S64)
595       .widenScalarToNextPow2(0, 32)
596       .scalarize(0);
597 
598   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599                    .legalFor({S32})
600                    .maxScalarOrElt(0, S32);
601 
602   if (ST.hasVOP3PInsts()) {
603     Mulh
604       .clampMaxNumElements(0, S8, 2)
605       .lowerFor({V2S8});
606   }
607 
608   Mulh
609     .scalarize(0)
610     .lower();
611 
612   // Report legal for any types we can handle anywhere. For the cases only legal
613   // on the SALU, RegBankSelect will be able to re-legalize.
614   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616     .clampScalar(0, S32, S64)
617     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619     .widenScalarToNextPow2(0)
620     .scalarize(0);
621 
622   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624     .legalFor({{S32, S1}, {S32, S32}})
625     .minScalar(0, S32)
626     // TODO: .scalarize(0)
627     .lower();
628 
629   getActionDefinitionsBuilder(G_BITCAST)
630     // Don't worry about the size constraint.
631     .legalIf(all(isRegisterType(0), isRegisterType(1)))
632     .lower();
633 
634 
635   getActionDefinitionsBuilder(G_CONSTANT)
636     .legalFor({S1, S32, S64, S16, GlobalPtr,
637                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638     .legalIf(isPointer(0))
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(0);
641 
642   getActionDefinitionsBuilder(G_FCONSTANT)
643     .legalFor({S32, S64, S16})
644     .clampScalar(0, S16, S64);
645 
646   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647       .legalIf(isRegisterType(0))
648       // s1 and s16 are special cases because they have legal operations on
649       // them, but don't really occupy registers in the normal way.
650       .legalFor({S1, S16})
651       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652       .clampScalarOrElt(0, S32, MaxScalar)
653       .widenScalarToNextPow2(0, 32)
654       .clampMaxNumElements(0, S32, 16);
655 
656   LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657                        LegacyLegalizeActions::Legal);
658 
659   // If the amount is divergent, we have to do a wave reduction to get the
660   // maximum value, so this is expanded during RegBankSelect.
661   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662     .legalFor({{PrivatePtr, S32}});
663 
664   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665     .customIf(typeIsNot(0, PrivatePtr));
666 
667   LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668 
669   auto &FPOpActions = getActionDefinitionsBuilder(
670     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671     .legalFor({S32, S64});
672   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673     .customFor({S32, S64});
674   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675     .customFor({S32, S64});
676 
677   if (ST.has16BitInsts()) {
678     if (ST.hasVOP3PInsts())
679       FPOpActions.legalFor({S16, V2S16});
680     else
681       FPOpActions.legalFor({S16});
682 
683     TrigActions.customFor({S16});
684     FDIVActions.customFor({S16});
685   }
686 
687   auto &MinNumMaxNum = getActionDefinitionsBuilder({
688       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690   if (ST.hasVOP3PInsts()) {
691     MinNumMaxNum.customFor(FPTypesPK16)
692       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693       .clampMaxNumElements(0, S16, 2)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else if (ST.has16BitInsts()) {
697     MinNumMaxNum.customFor(FPTypes16)
698       .clampScalar(0, S16, S64)
699       .scalarize(0);
700   } else {
701     MinNumMaxNum.customFor(FPTypesBase)
702       .clampScalar(0, S32, S64)
703       .scalarize(0);
704   }
705 
706   if (ST.hasVOP3PInsts())
707     FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709   FPOpActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   TrigActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   FDIVActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   getActionDefinitionsBuilder({G_FNEG, G_FABS})
722     .legalFor(FPTypesPK16)
723     .clampMaxNumElements(0, S16, 2)
724     .scalarize(0)
725     .clampScalar(0, S16, S64);
726 
727   if (ST.has16BitInsts()) {
728     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729       .legalFor({S32, S64, S16})
730       .scalarize(0)
731       .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735       .scalarize(0)
736       .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741         .legalFor({S32, S64})
742         .scalarize(0)
743         .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     }
750   }
751 
752   getActionDefinitionsBuilder(G_FPTRUNC)
753     .legalFor({{S32, S64}, {S16, S32}})
754     .scalarize(0)
755     .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760     .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763       // Use actual fsub instruction
764       .legalFor({S32})
765       // Must use fadd + fneg
766       .lowerFor({S64, S16, V2S16})
767       .scalarize(0)
768       .clampScalar(0, S32, S64);
769 
770   // Whether this is legal depends on the floating point mode for the function.
771   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773     FMad.customFor({S32, S16});
774   else if (ST.hasMadMacF32Insts())
775     FMad.customFor({S32});
776   else if (ST.hasMadF16())
777     FMad.customFor({S16});
778   FMad.scalarize(0)
779       .lower();
780 
781   auto &FRem = getActionDefinitionsBuilder(G_FREM);
782   if (ST.has16BitInsts()) {
783     FRem.customFor({S16, S32, S64});
784   } else {
785     FRem.minScalar(0, S32)
786         .customFor({S32, S64});
787   }
788   FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793     .legalFor({{V2S16, V2S32}})
794     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop forever
    // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803                {S32, S1}, {S64, S1}, {S16, S1}})
804     .scalarize(0)
805     .clampScalar(0, S32, S64)
806     .widenScalarToNextPow2(1, 32);
807 
808   // TODO: Split s1->s64 during regbankselect for VALU.
809   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811     .lowerFor({{S32, S64}})
812     .lowerIf(typeIs(1, S1))
813     .customFor({{S64, S64}});
814   if (ST.has16BitInsts())
815     IToFP.legalFor({{S16, S16}});
816   IToFP.clampScalar(1, S32, S64)
817        .minScalar(0, S32)
818        .scalarize(0)
819        .widenScalarToNextPow2(1);
820 
821   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823     .customFor({{S64, S32}, {S64, S64}})
824     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825   if (ST.has16BitInsts())
826     FPToI.legalFor({{S16, S16}});
827   else
828     FPToI.minScalar(1, S32);
829 
830   FPToI.minScalar(0, S32)
831        .widenScalarToNextPow2(0, 32)
832        .scalarize(0)
833        .lower();
834 
835   // Lower roundeven into G_FRINT
836   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837     .scalarize(0)
838     .lower();
839 
840   if (ST.has16BitInsts()) {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S16, S32, S64})
843       .clampScalar(0, S16, S64)
844       .scalarize(0);
845   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S32, S64})
848       .clampScalar(0, S32, S64)
849       .scalarize(0);
850   } else {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32})
853       .customFor({S64})
854       .clampScalar(0, S32, S64)
855       .scalarize(0);
856   }
857 
858   getActionDefinitionsBuilder(G_PTR_ADD)
859     .legalIf(all(isPointer(0), sameSize(0, 1)))
860     .scalarize(0)
861     .scalarSameSizeAs(1, 0);
862 
863   getActionDefinitionsBuilder(G_PTRMASK)
864     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865     .scalarSameSizeAs(1, 0)
866     .scalarize(0);
867 
868   auto &CmpBuilder =
869     getActionDefinitionsBuilder(G_ICMP)
870     // The compare output type differs based on the register bank of the output,
871     // so make both s1 and s32 legal.
872     //
873     // Scalar compares producing output in scc will be promoted to s32, as that
874     // is the allocatable register type that will be needed for the copy from
875     // scc. This will be promoted during RegBankSelect, and we assume something
876     // before that won't try to use s32 result types.
877     //
878     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879     // bank.
880     .legalForCartesianProduct(
881       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882     .legalForCartesianProduct(
883       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884   if (ST.has16BitInsts()) {
885     CmpBuilder.legalFor({{S1, S16}});
886   }
887 
888   CmpBuilder
889     .widenScalarToNextPow2(1)
890     .clampScalar(1, S32, S64)
891     .scalarize(0)
892     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894   getActionDefinitionsBuilder(G_FCMP)
895     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0);
899 
900   // FIXME: fpow has a selection pattern that should move to custom lowering.
901   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902   if (ST.has16BitInsts())
903     Exp2Ops.legalFor({S32, S16});
904   else
905     Exp2Ops.legalFor({S32});
906   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907   Exp2Ops.scalarize(0);
908 
909   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910   if (ST.has16BitInsts())
911     ExpOps.customFor({{S32}, {S16}});
912   else
913     ExpOps.customFor({S32});
914   ExpOps.clampScalar(0, MinScalarFPTy, S32)
915         .scalarize(0);
916 
917   getActionDefinitionsBuilder(G_FPOWI)
918     .clampScalar(0, MinScalarFPTy, S32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder(G_CTPOP)
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   // The hardware instructions return a different result on 0 than the generic
931   // instructions expect. The hardware produces -1, but these produce the
932   // bitwidth.
933   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934     .scalarize(0)
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32)
939     .lower();
940 
941   // The 64-bit versions produce 32-bit results, but only on the SALU.
942   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943     .legalFor({{S32, S32}, {S32, S64}})
944     .clampScalar(0, S32, S32)
945     .clampScalar(1, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0, 32)
948     .widenScalarToNextPow2(1, 32);
949 
950   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951   // RegBankSelect.
952   getActionDefinitionsBuilder(G_BITREVERSE)
953     .legalFor({S32, S64})
954     .clampScalar(0, S32, S64)
955     .scalarize(0)
956     .widenScalarToNextPow2(0);
957 
958   if (ST.has16BitInsts()) {
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S16, S32, V2S16})
961       .clampMaxNumElements(0, S16, 2)
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .clampScalar(0, S16, S32)
966       .scalarize(0);
967 
968     if (ST.hasVOP3PInsts()) {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970         .legalFor({S32, S16, V2S16})
971         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972         .clampMaxNumElements(0, S16, 2)
973         .minScalar(0, S16)
974         .widenScalarToNextPow2(0)
975         .scalarize(0)
976         .lower();
977     } else {
978       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979         .legalFor({S32, S16})
980         .widenScalarToNextPow2(0)
981         .minScalar(0, S16)
982         .scalarize(0)
983         .lower();
984     }
985   } else {
986     // TODO: Should have same legality without v_perm_b32
987     getActionDefinitionsBuilder(G_BSWAP)
988       .legalFor({S32})
989       .lowerIf(scalarNarrowerThan(0, 32))
990       // FIXME: Fixing non-power-of-2 before clamp is workaround for
991       // narrowScalar limitation.
992       .widenScalarToNextPow2(0)
993       .maxScalar(0, S32)
994       .scalarize(0)
995       .lower();
996 
997     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998       .legalFor({S32})
999       .minScalar(0, S32)
1000       .widenScalarToNextPow2(0)
1001       .scalarize(0)
1002       .lower();
1003   }
1004 
1005   getActionDefinitionsBuilder(G_INTTOPTR)
1006     // List the common cases
1007     .legalForCartesianProduct(AddrSpaces64, {S64})
1008     .legalForCartesianProduct(AddrSpaces32, {S32})
1009     .scalarize(0)
1010     // Accept any address space as long as the size matches
1011     .legalIf(sameSize(0, 1))
1012     .widenScalarIf(smallerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       })
1016     .narrowScalarIf(largerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       });
1020 
1021   getActionDefinitionsBuilder(G_PTRTOINT)
1022     // List the common cases
1023     .legalForCartesianProduct(AddrSpaces64, {S64})
1024     .legalForCartesianProduct(AddrSpaces32, {S32})
1025     .scalarize(0)
1026     // Accept any address space as long as the size matches
1027     .legalIf(sameSize(0, 1))
1028     .widenScalarIf(smallerThan(0, 1),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031       })
1032     .narrowScalarIf(
1033       largerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       });
1037 
1038   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039     .scalarize(0)
1040     .custom();
1041 
1042   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043                                     bool IsLoad) -> bool {
1044     const LLT DstTy = Query.Types[0];
1045 
1046     // Split vector extloads.
1047     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050     if (MemSize < DstTy.getSizeInBits())
1051       MemSize = std::max(MemSize, AlignBits);
1052 
1053     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054       return true;
1055 
1056     const LLT PtrTy = Query.Types[1];
1057     unsigned AS = PtrTy.getAddressSpace();
1058     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059       return true;
1060 
1061     // Catch weird sized loads that don't evenly divide into the access sizes
1062     // TODO: May be able to widen depending on alignment etc.
1063     unsigned NumRegs = (MemSize + 31) / 32;
1064     if (NumRegs == 3) {
1065       if (!ST.hasDwordx3LoadStores())
1066         return true;
1067     } else {
1068       // If the alignment allows, these should have been widened.
1069       if (!isPowerOf2_32(NumRegs))
1070         return true;
1071     }
1072 
1073     if (AlignBits < MemSize) {
1074       const SITargetLowering *TLI = ST.getTargetLowering();
1075       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076                                                       Align(AlignBits / 8));
1077     }
1078 
1079     return false;
1080   };
1081 
1082   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085 
1086   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087   // LDS
1088   // TODO: Unsupported flat for SI.
1089 
1090   for (unsigned Op : {G_LOAD, G_STORE}) {
1091     const bool IsStore = Op == G_STORE;
1092 
1093     auto &Actions = getActionDefinitionsBuilder(Op);
1094     // Explicitly list some common cases.
1095     // TODO: Does this help compile time at all?
1096     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1098                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1099                                       {S64, GlobalPtr, 64, GlobalAlign32},
1100                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1101                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1102                                       {S32, GlobalPtr, 8, GlobalAlign8},
1103                                       {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105                                       {S32, LocalPtr, 32, 32},
1106                                       {S64, LocalPtr, 64, 32},
1107                                       {V2S32, LocalPtr, 64, 32},
1108                                       {S32, LocalPtr, 8, 8},
1109                                       {S32, LocalPtr, 16, 16},
1110                                       {V2S16, LocalPtr, 32, 32},
1111 
1112                                       {S32, PrivatePtr, 32, 32},
1113                                       {S32, PrivatePtr, 8, 8},
1114                                       {S32, PrivatePtr, 16, 16},
1115                                       {V2S16, PrivatePtr, 32, 32},
1116 
1117                                       {S32, ConstantPtr, 32, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1119                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1120                                       {S64, ConstantPtr, 64, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122     Actions.legalIf(
1123       [=](const LegalityQuery &Query) -> bool {
1124         return isLoadStoreLegal(ST, Query);
1125       });
1126 
1127     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64 bits.
1129     //
1130     // TODO: Should generalize bitcast action into coerce, which will also cover
1131     // inserting addrspacecasts.
1132     Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134     // Turn any illegal element vectors into something easier to deal
1135     // with. These will ultimately produce 32-bit scalar shifts to extract the
1136     // parts anyway.
1137     //
1138     // For odd 16-bit element vectors, prefer to split those into pieces with
1139     // 16-bit vector parts.
1140     Actions.bitcastIf(
1141       [=](const LegalityQuery &Query) -> bool {
1142         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143                                           Query.MMODescrs[0].SizeInBits);
1144       }, bitcastToRegisterType(0));
1145 
1146     if (!IsStore) {
1147       // Widen suitably aligned loads by loading extra bytes. The standard
1148       // legalization actions can't properly express widening memory operands.
1149       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150         return shouldWidenLoad(ST, Query, G_LOAD);
1151       });
1152     }
1153 
1154     // FIXME: load/store narrowing should be moved to lower action
1155     Actions
1156         .narrowScalarIf(
1157             [=](const LegalityQuery &Query) -> bool {
1158               return !Query.Types[0].isVector() &&
1159                      needToSplitMemOp(Query, Op == G_LOAD);
1160             },
1161             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162               const LLT DstTy = Query.Types[0];
1163               const LLT PtrTy = Query.Types[1];
1164 
1165               const unsigned DstSize = DstTy.getSizeInBits();
1166               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168               // Split extloads.
1169               if (DstSize > MemSize)
1170                 return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172               if (!isPowerOf2_32(DstSize)) {
1173                 // We're probably decomposing an odd sized store. Try to split
1174                 // to the widest type. TODO: Account for alignment. As-is it
1175                 // should be OK, since the new parts will be further legalized.
1176                 unsigned FloorSize = PowerOf2Floor(DstSize);
1177                 return std::make_pair(0, LLT::scalar(FloorSize));
1178               }
1179 
1180               if (DstSize > 32 && (DstSize % 32 != 0)) {
1181                 // FIXME: Need a way to specify non-extload of larger size if
1182                 // suitably aligned.
1183                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184               }
1185 
1186               unsigned MaxSize = maxSizeForAddrSpace(ST,
1187                                                      PtrTy.getAddressSpace(),
1188                                                      Op == G_LOAD);
1189               if (MemSize > MaxSize)
1190                 return std::make_pair(0, LLT::scalar(MaxSize));
1191 
1192               unsigned Align = Query.MMODescrs[0].AlignInBits;
1193               return std::make_pair(0, LLT::scalar(Align));
1194             })
1195         .fewerElementsIf(
1196             [=](const LegalityQuery &Query) -> bool {
1197               return Query.Types[0].isVector() &&
1198                      needToSplitMemOp(Query, Op == G_LOAD);
1199             },
1200             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201               const LLT DstTy = Query.Types[0];
1202               const LLT PtrTy = Query.Types[1];
1203 
1204               LLT EltTy = DstTy.getElementType();
1205               unsigned MaxSize = maxSizeForAddrSpace(ST,
1206                                                      PtrTy.getAddressSpace(),
1207                                                      Op == G_LOAD);
1208 
1209               // FIXME: Handle widened to power of 2 results better. This ends
1210               // up scalarizing.
1211               // FIXME: 3 element stores scalarized on SI
1212 
1213               // Split if it's too large for the address space.
1214               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215                 unsigned NumElts = DstTy.getNumElements();
1216                 unsigned EltSize = EltTy.getSizeInBits();
1217 
1218                 if (MaxSize % EltSize == 0) {
1219                   return std::make_pair(
1220                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221                 }
1222 
1223                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224 
1225                 // FIXME: Refine when odd breakdowns handled
1226                 // The scalars will need to be re-legalized.
1227                 if (NumPieces == 1 || NumPieces >= NumElts ||
1228                     NumElts % NumPieces != 0)
1229                   return std::make_pair(0, EltTy);
1230 
1231                 return std::make_pair(0,
1232                                       LLT::vector(NumElts / NumPieces, EltTy));
1233               }
1234 
1235               // FIXME: We could probably handle weird extending loads better.
1236               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237               if (DstTy.getSizeInBits() > MemSize)
1238                 return std::make_pair(0, EltTy);
1239 
1240               unsigned EltSize = EltTy.getSizeInBits();
1241               unsigned DstSize = DstTy.getSizeInBits();
1242               if (!isPowerOf2_32(DstSize)) {
1243                 // We're probably decomposing an odd sized store. Try to split
1244                 // to the widest type. TODO: Account for alignment. As-is it
1245                 // should be OK, since the new parts will be further legalized.
1246                 unsigned FloorSize = PowerOf2Floor(DstSize);
1247                 return std::make_pair(
1248                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249               }
1250 
1251               // Need to split because of alignment.
1252               unsigned Align = Query.MMODescrs[0].AlignInBits;
1253               if (EltSize > Align &&
1254                   (EltSize / Align < DstTy.getNumElements())) {
1255                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256               }
1257 
1258               // May need relegalization for the scalars.
1259               return std::make_pair(0, EltTy);
1260             })
1261     .lowerIfMemSizeNotPow2()
1262     .minScalar(0, S32)
1263     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264     .widenScalarToNextPow2(0)
1265     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266     .lower();
1267   }
1268 
1269   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271                                                   {S32, GlobalPtr, 16, 2 * 8},
1272                                                   {S32, LocalPtr, 8, 8},
1273                                                   {S32, LocalPtr, 16, 16},
1274                                                   {S32, PrivatePtr, 8, 8},
1275                                                   {S32, PrivatePtr, 16, 16},
1276                                                   {S32, ConstantPtr, 8, 8},
1277                                                   {S32, ConstantPtr, 16, 2 * 8}})
1278                        .legalIf(
1279                          [=](const LegalityQuery &Query) -> bool {
1280                            return isLoadStoreLegal(ST, Query);
1281                          });
1282 
1283   if (ST.hasFlatAddressSpace()) {
1284     ExtLoads.legalForTypesWithMemDesc(
1285         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286   }
1287 
1288   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64 bits.
1290   //
1291   // TODO: Should generalize bitcast action into coerce, which will also cover
1292   // inserting addrspacecasts.
1293   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294 
1295   ExtLoads.clampScalar(0, S32, S32)
1296           .widenScalarToNextPow2(0)
1297           .unsupportedIfMemSizeNotPow2()
1298           .lower();
1299 
1300   auto &Atomics = getActionDefinitionsBuilder(
1301     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304      G_ATOMICRMW_UMIN})
1305     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306                {S64, GlobalPtr}, {S64, LocalPtr},
1307                {S32, RegionPtr}, {S64, RegionPtr}});
1308   if (ST.hasFlatAddressSpace()) {
1309     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310   }
1311 
1312   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313   if (ST.hasLDSFPAtomics()) {
1314     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315     if (ST.hasGFX90AInsts())
1316       Atomic.legalFor({{S64, LocalPtr}});
1317   }
1318   if (ST.hasAtomicFaddInsts())
1319     Atomic.legalFor({{S32, GlobalPtr}});
1320 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1323   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325                 {S32, FlatPtr}, {S64, FlatPtr}})
1326     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327                {S32, RegionPtr}, {S64, RegionPtr}});
1328   // TODO: Pointer types, any 32-bit or 64-bit vector
1329 
1330   // Condition should be s32 for scalar, s1 for vector.
1331   getActionDefinitionsBuilder(G_SELECT)
1332     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335     .clampScalar(0, S16, S64)
1336     .scalarize(1)
1337     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339     .clampMaxNumElements(0, S32, 2)
1340     .clampMaxNumElements(0, LocalPtr, 2)
1341     .clampMaxNumElements(0, PrivatePtr, 2)
1342     .scalarize(0)
1343     .widenScalarToNextPow2(0)
1344     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 
1346   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347   // be more flexible with the shift amount type.
1348   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349     .legalFor({{S32, S32}, {S64, S32}});
1350   if (ST.has16BitInsts()) {
1351     if (ST.hasVOP3PInsts()) {
1352       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353             .clampMaxNumElements(0, S16, 2);
1354     } else
1355       Shifts.legalFor({{S16, S16}});
1356 
1357     // TODO: Support 16-bit shift amounts for all types
1358     Shifts.widenScalarIf(
1359       [=](const LegalityQuery &Query) {
1360         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361         // 32-bit amount.
1362         const LLT ValTy = Query.Types[0];
1363         const LLT AmountTy = Query.Types[1];
1364         return ValTy.getSizeInBits() <= 16 &&
1365                AmountTy.getSizeInBits() < 16;
1366       }, changeTo(1, S16));
1367     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368     Shifts.clampScalar(1, S32, S32);
1369     Shifts.clampScalar(0, S16, S64);
1370     Shifts.widenScalarToNextPow2(0, 16);
1371 
1372     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373       .minScalar(0, S16)
1374       .scalarize(0)
1375       .lower();
1376   } else {
1377     // Make sure we legalize the shift amount type first, as the general
1378     // expansion for the shifted type will produce much worse code if it hasn't
1379     // been truncated already.
1380     Shifts.clampScalar(1, S32, S32);
1381     Shifts.clampScalar(0, S32, S64);
1382     Shifts.widenScalarToNextPow2(0, 32);
1383 
1384     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385       .minScalar(0, S32)
1386       .scalarize(0)
1387       .lower();
1388   }
1389   Shifts.scalarize(0);
1390 
1391   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394     unsigned IdxTypeIdx = 2;
1395 
1396     getActionDefinitionsBuilder(Op)
1397       .customIf([=](const LegalityQuery &Query) {
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const LLT IdxTy = Query.Types[IdxTypeIdx];
1401           const unsigned EltSize = EltTy.getSizeInBits();
1402           return (EltSize == 32 || EltSize == 64) &&
1403                   VecTy.getSizeInBits() % 32 == 0 &&
1404                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1405                   IdxTy.getSizeInBits() == 32;
1406         })
1407       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408                  bitcastToVectorElement32(VecTypeIdx))
1409       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410       .bitcastIf(
1411         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412         [=](const LegalityQuery &Query) {
1413           // For > 64-bit element types, try to turn this into a 64-bit
1414           // element vector since we may be able to do better indexing
1415           // if this is scalar. If not, fall back to 32.
1416           const LLT EltTy = Query.Types[EltTypeIdx];
1417           const LLT VecTy = Query.Types[VecTypeIdx];
1418           const unsigned DstEltSize = EltTy.getSizeInBits();
1419           const unsigned VecSize = VecTy.getSizeInBits();
1420 
1421           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422           return std::make_pair(
1423             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424         })
1425       .clampScalar(EltTypeIdx, S32, S64)
1426       .clampScalar(VecTypeIdx, S32, S64)
1427       .clampScalar(IdxTypeIdx, S32, S32)
1428       .clampMaxNumElements(VecTypeIdx, S32, 32)
1429       // TODO: Clamp elements for 64-bit vectors?
1430       // It should only be necessary with variable indexes.
1431       // As a last resort, lower to the stack
1432       .lower();
1433   }
1434 
1435   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436     .unsupportedIf([=](const LegalityQuery &Query) {
1437         const LLT &EltTy = Query.Types[1].getElementType();
1438         return Query.Types[0] != EltTy;
1439       });
1440 
1441   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445     // FIXME: Doesn't handle extract of illegal sizes.
1446     getActionDefinitionsBuilder(Op)
1447       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448       // FIXME: Multiples of 16 should not be legal.
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (BigTy.getSizeInBits() % 32 == 0) &&
1453                  (LitTy.getSizeInBits() % 16 == 0);
1454         })
1455       .widenScalarIf(
1456         [=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           return (BigTy.getScalarSizeInBits() < 16);
1459         },
1460         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (LitTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468       .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470   }
1471 
1472   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473     .legalForCartesianProduct(AllS32Vectors, {S32})
1474     .legalForCartesianProduct(AllS64Vectors, {S64})
1475     .clampNumElements(0, V16S32, V32S32)
1476     .clampNumElements(0, V2S64, V16S64)
1477     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479   if (ST.hasScalarPackInsts()) {
1480     BuildVector
1481       // FIXME: Should probably widen s1 vectors straight to s32
1482       .minScalarOrElt(0, S16)
1483       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484       .minScalar(1, S32);
1485 
1486     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487       .legalFor({V2S16, S32})
1488       .lower();
1489     BuildVector.minScalarOrElt(0, S32);
1490   } else {
1491     BuildVector.customFor({V2S16, S16});
1492     BuildVector.minScalarOrElt(0, S32);
1493 
1494     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495       .customFor({V2S16, S32})
1496       .lower();
1497   }
1498 
1499   BuildVector.legalIf(isRegisterType(0));
1500 
1501   // FIXME: Clamp maximum size
1502   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504     .clampMaxNumElements(0, S32, 32)
1505     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506     .clampMaxNumElements(0, S16, 64);
1507 
1508   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509   // pre-legalize.
1510   if (ST.hasVOP3PInsts()) {
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512       .customFor({V2S16, V2S16})
1513       .lower();
1514   } else
1515     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517   // Merge/Unmerge
1518   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523       const LLT Ty = Query.Types[TypeIdx];
1524       if (Ty.isVector()) {
1525         const LLT &EltTy = Ty.getElementType();
1526         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527           return true;
1528         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529           return true;
1530       }
1531       return false;
1532     };
1533 
1534     auto &Builder = getActionDefinitionsBuilder(Op)
1535       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536       .lowerFor({{S16, V2S16}})
1537       .lowerIf([=](const LegalityQuery &Query) {
1538           const LLT BigTy = Query.Types[BigTyIdx];
1539           return BigTy.getSizeInBits() == 32;
1540         })
1541       // Try to widen to s16 first for small types.
1542       // TODO: Only do this on targets with legal s16 shifts
1543       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547                            elementTypeIs(1, S16)),
1548                        changeTo(1, V2S16))
1549       // Clamp the little scalar to s32-s512 and make it a power of 2. It's
1550       // not worth considering the multiples of 64 since 2*192 and 2*384 are
1551       // not valid.
1552       .clampScalar(LitTyIdx, S32, S512)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554       // Break up vectors with weird elements into scalars
1555       .fewerElementsIf(
1556         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557         scalarize(0))
1558       .fewerElementsIf(
1559         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560         scalarize(1))
1561       .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563     if (Op == G_MERGE_VALUES) {
1564       Builder.widenScalarIf(
1565         // TODO: Use 16-bit shifts if legal for 8-bit values?
1566         [=](const LegalityQuery &Query) {
1567           const LLT Ty = Query.Types[LitTyIdx];
1568           return Ty.getSizeInBits() < 32;
1569         },
1570         changeTo(LitTyIdx, S32));
1571     }
1572 
1573     Builder.widenScalarIf(
1574       [=](const LegalityQuery &Query) {
1575         const LLT Ty = Query.Types[BigTyIdx];
1576         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577           Ty.getSizeInBits() % 16 != 0;
1578       },
1579       [=](const LegalityQuery &Query) {
1580         // Pick the next power of 2, or a multiple of 64 over 128,
1581         // whichever is smaller.
1582         const LLT &Ty = Query.Types[BigTyIdx];
1583         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584         if (NewSizeInBits >= 256) {
1585           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586           if (RoundedTo < NewSizeInBits)
1587             NewSizeInBits = RoundedTo;
1588         }
1589         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590       })
1591       // Any vectors left are the wrong size. Scalarize them.
1592       .scalarize(0)
1593       .scalarize(1);
1594   }
1595 
1596   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597   // RegBankSelect.
1598   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599     .legalFor({{S32}, {S64}});
1600 
1601   if (ST.hasVOP3PInsts()) {
1602     SextInReg.lowerFor({{V2S16}})
1603       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604       // get more vector shift opportunities, since we'll get those when
1605       // expanded.
1606       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607   } else if (ST.has16BitInsts()) {
1608     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609   } else {
1610     // Prefer to promote to s32 before lowering if we don't have 16-bit
1611     // shifts. This avoids a lot of intermediate truncate and extend operations.
1612     SextInReg.lowerFor({{S32}, {S64}});
1613   }
1614 
1615   SextInReg
1616     .scalarize(0)
1617     .clampScalar(0, S32, S64)
1618     .lower();
1619 
1620   // TODO: Only try to form v2s16 with legal packed instructions.
1621   getActionDefinitionsBuilder(G_FSHR)
1622     .legalFor({{S32, S32}})
1623     .lowerFor({{V2S16, V2S16}})
1624     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625     .scalarize(0)
1626     .lower();
1627 
1628   if (ST.hasVOP3PInsts()) {
1629     getActionDefinitionsBuilder(G_FSHL)
1630       .lowerFor({{V2S16, V2S16}})
1631       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632       .scalarize(0)
1633       .lower();
1634   } else {
1635     getActionDefinitionsBuilder(G_FSHL)
1636       .scalarize(0)
1637       .lower();
1638   }
1639 
1640   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641     .legalFor({S64});
1642 
1643   getActionDefinitionsBuilder(G_FENCE)
1644     .alwaysLegal();
1645 
1646   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647       .scalarize(0)
1648       .minScalar(0, S32)
1649       .lower();
1650 
1651   getActionDefinitionsBuilder({
1652       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653       G_FCOPYSIGN,
1654 
1655       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656       G_ATOMICRMW_NAND,
1657       G_ATOMICRMW_FSUB,
1658       G_READ_REGISTER,
1659       G_WRITE_REGISTER,
1660 
1661       G_SADDO, G_SSUBO,
1662 
1663       // TODO: Implement
1664       G_FMINIMUM, G_FMAXIMUM}).lower();
1665 
1666   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669     .unsupported();
1670 
1671   getLegacyLegalizerInfo().computeTables();
1672   verify(*ST.getInstrInfo());
1673 }
1674 
1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676                                          MachineInstr &MI) const {
1677   MachineIRBuilder &B = Helper.MIRBuilder;
1678   MachineRegisterInfo &MRI = *B.getMRI();
1679 
1680   switch (MI.getOpcode()) {
1681   case TargetOpcode::G_ADDRSPACE_CAST:
1682     return legalizeAddrSpaceCast(MI, MRI, B);
1683   case TargetOpcode::G_FRINT:
1684     return legalizeFrint(MI, MRI, B);
1685   case TargetOpcode::G_FCEIL:
1686     return legalizeFceil(MI, MRI, B);
1687   case TargetOpcode::G_FREM:
1688     return legalizeFrem(MI, MRI, B);
1689   case TargetOpcode::G_INTRINSIC_TRUNC:
1690     return legalizeIntrinsicTrunc(MI, MRI, B);
1691   case TargetOpcode::G_SITOFP:
1692     return legalizeITOFP(MI, MRI, B, true);
1693   case TargetOpcode::G_UITOFP:
1694     return legalizeITOFP(MI, MRI, B, false);
1695   case TargetOpcode::G_FPTOSI:
1696     return legalizeFPTOI(MI, MRI, B, true);
1697   case TargetOpcode::G_FPTOUI:
1698     return legalizeFPTOI(MI, MRI, B, false);
1699   case TargetOpcode::G_FMINNUM:
1700   case TargetOpcode::G_FMAXNUM:
1701   case TargetOpcode::G_FMINNUM_IEEE:
1702   case TargetOpcode::G_FMAXNUM_IEEE:
1703     return legalizeMinNumMaxNum(Helper, MI);
1704   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705     return legalizeExtractVectorElt(MI, MRI, B);
1706   case TargetOpcode::G_INSERT_VECTOR_ELT:
1707     return legalizeInsertVectorElt(MI, MRI, B);
1708   case TargetOpcode::G_SHUFFLE_VECTOR:
1709     return legalizeShuffleVector(MI, MRI, B);
1710   case TargetOpcode::G_FSIN:
1711   case TargetOpcode::G_FCOS:
1712     return legalizeSinCos(MI, MRI, B);
1713   case TargetOpcode::G_GLOBAL_VALUE:
1714     return legalizeGlobalValue(MI, MRI, B);
1715   case TargetOpcode::G_LOAD:
1716   case TargetOpcode::G_SEXTLOAD:
1717   case TargetOpcode::G_ZEXTLOAD:
1718     return legalizeLoad(Helper, MI);
1719   case TargetOpcode::G_FMAD:
1720     return legalizeFMad(MI, MRI, B);
1721   case TargetOpcode::G_FDIV:
1722     return legalizeFDIV(MI, MRI, B);
1723   case TargetOpcode::G_UDIV:
1724   case TargetOpcode::G_UREM:
1725   case TargetOpcode::G_UDIVREM:
1726     return legalizeUnsignedDIV_REM(MI, MRI, B);
1727   case TargetOpcode::G_SDIV:
1728   case TargetOpcode::G_SREM:
1729   case TargetOpcode::G_SDIVREM:
1730     return legalizeSignedDIV_REM(MI, MRI, B);
1731   case TargetOpcode::G_ATOMIC_CMPXCHG:
1732     return legalizeAtomicCmpXChg(MI, MRI, B);
1733   case TargetOpcode::G_FLOG:
1734     return legalizeFlog(MI, B, numbers::ln2f);
1735   case TargetOpcode::G_FLOG10:
1736     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737   case TargetOpcode::G_FEXP:
1738     return legalizeFExp(MI, B);
1739   case TargetOpcode::G_FPOW:
1740     return legalizeFPow(MI, B);
1741   case TargetOpcode::G_FFLOOR:
1742     return legalizeFFloor(MI, MRI, B);
1743   case TargetOpcode::G_BUILD_VECTOR:
1744     return legalizeBuildVector(MI, MRI, B);
1745   default:
1746     return false;
1747   }
1748 
1749   llvm_unreachable("expected switch to return");
1750 }
1751 
1752 Register AMDGPULegalizerInfo::getSegmentAperture(
1753   unsigned AS,
1754   MachineRegisterInfo &MRI,
1755   MachineIRBuilder &B) const {
1756   MachineFunction &MF = B.getMF();
1757   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758   const LLT S32 = LLT::scalar(32);
1759 
1760   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761 
1762   if (ST.hasApertureRegs()) {
1763     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764     // getreg.
1765     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1767         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1768     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1769         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1770         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1771     unsigned Encoding =
1772         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1773         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1774         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775 
1776     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777 
1778     B.buildInstr(AMDGPU::S_GETREG_B32)
1779       .addDef(GetReg)
1780       .addImm(Encoding);
1781     MRI.setType(GetReg, S32);
1782 
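         // The value read back here holds only the significant upper bits of
         // the 32-bit aperture, so shift it left by the field width
         // (WidthM1 + 1) to form the aperture value, which is the high half
         // of the 64-bit segment base address.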
1783     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785   }
1786 
1787   Register QueuePtr = MRI.createGenericVirtualRegister(
1788     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789 
1790   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791     return Register();
1792 
1793   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794   // private_segment_aperture_base_hi.
1795   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796 
1797   // TODO: can we be smarter about machine pointer info?
1798   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799   MachineMemOperand *MMO = MF.getMachineMemOperand(
1800       PtrInfo,
1801       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802           MachineMemOperand::MOInvariant,
1803       4, commonAlignment(Align(64), StructOffset));
1804 
1805   Register LoadAddr;
1806 
1807   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809 }
1810 
1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1812   MachineInstr &MI, MachineRegisterInfo &MRI,
1813   MachineIRBuilder &B) const {
1814   MachineFunction &MF = B.getMF();
1815 
1816   const LLT S32 = LLT::scalar(32);
1817   Register Dst = MI.getOperand(0).getReg();
1818   Register Src = MI.getOperand(1).getReg();
1819 
1820   LLT DstTy = MRI.getType(Dst);
1821   LLT SrcTy = MRI.getType(Src);
1822   unsigned DestAS = DstTy.getAddressSpace();
1823   unsigned SrcAS = SrcTy.getAddressSpace();
1824 
1825   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826   // vector element.
1827   assert(!DstTy.isVector());
1828 
1829   const AMDGPUTargetMachine &TM
1830     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831 
1832   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834     return true;
1835   }
1836 
1837   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838     // Truncate.
1839     B.buildExtract(Dst, Src, 0);
1840     MI.eraseFromParent();
1841     return true;
1842   }
1843 
1844   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847 
1848     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1849     // another. Merge operands are required to be the same type, but creating an
1850     // extra ptrtoint would be kind of pointless.
1851     auto HighAddr = B.buildConstant(
1852       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853     B.buildMerge(Dst, {Src, HighAddr});
1854     MI.eraseFromParent();
1855     return true;
1856   }
1857 
1858   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
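         // Casting a flat pointer to LDS/private only needs the low 32 bits
         // of the address; the one special case is the flat null pointer,
         // which must map to the segment's null value rather than being
         // truncated.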
1861     unsigned NullVal = TM.getNullPointerValue(DestAS);
1862 
1863     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864     auto FlatNull = B.buildConstant(SrcTy, 0);
1865 
1866     // Extract low 32-bits of the pointer.
1867     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868 
1869     auto CmpRes =
1870         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872 
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878     return false;
1879 
1880   if (!ST.hasFlatAddressSpace())
1881     return false;
1882 
1883   auto SegmentNull =
1884       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885   auto FlatNull =
1886       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887 
1888   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889   if (!ApertureReg.isValid())
1890     return false;
1891 
1892   auto CmpRes =
1893       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894 
1895   // Coerce the type of the low half of the result so we can use merge_values.
1896   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897 
1898   // TODO: Should we allow mismatched types but matching sizes in merges to
1899   // avoid the ptrtoint?
1900   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902 
1903   MI.eraseFromParent();
1904   return true;
1905 }
1906 
1907 bool AMDGPULegalizerInfo::legalizeFrint(
1908   MachineInstr &MI, MachineRegisterInfo &MRI,
1909   MachineIRBuilder &B) const {
1910   Register Src = MI.getOperand(1).getReg();
1911   LLT Ty = MRI.getType(Src);
1912   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913 
1914   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
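       // This is the classic round-to-nearest trick: adding and then
       // subtracting copysign(2^52, Src) pushes the fractional bits out of
       // the double (the mantissa has 52 fraction bits), so Tmp2 below is Src
       // rounded to the nearest integer. Inputs with |Src| > C2 are already
       // integral and are passed through unchanged by the final select.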
1916 
1917   auto C1 = B.buildFConstant(Ty, C1Val);
1918   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919 
1920   // TODO: Should this propagate fast-math-flags?
1921   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923 
1924   auto C2 = B.buildFConstant(Ty, C2Val);
1925   auto Fabs = B.buildFAbs(Ty, Src);
1926 
1927   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929   MI.eraseFromParent();
1930   return true;
1931 }
1932 
1933 bool AMDGPULegalizerInfo::legalizeFceil(
1934   MachineInstr &MI, MachineRegisterInfo &MRI,
1935   MachineIRBuilder &B) const {
1936 
1937   const LLT S1 = LLT::scalar(1);
1938   const LLT S64 = LLT::scalar(64);
1939 
1940   Register Src = MI.getOperand(1).getReg();
1941   assert(MRI.getType(Src) == S64);
1942 
1943   // result = trunc(src)
1944   // if (src > 0.0 && src != result)
1945   //   result += 1.0
1946 
1947   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948 
1949   const auto Zero = B.buildFConstant(S64, 0.0);
1950   const auto One = B.buildFConstant(S64, 1.0);
1951   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1954   auto Add = B.buildSelect(S64, And, One, Zero);
1955 
1956   // TODO: Should this propagate fast-math-flags?
1957   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1958   return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeFrem(
1962   MachineInstr &MI, MachineRegisterInfo &MRI,
1963   MachineIRBuilder &B) const {
1964     Register DstReg = MI.getOperand(0).getReg();
1965     Register Src0Reg = MI.getOperand(1).getReg();
1966     Register Src1Reg = MI.getOperand(2).getReg();
1967     auto Flags = MI.getFlags();
1968     LLT Ty = MRI.getType(DstReg);
1969 
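         // frem(x, y) == x - trunc(x / y) * y; the multiply and subtract are
         // fused into a single fma with a negated quotient:
         // fma(-trunc(x / y), y, x).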
1970     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974     MI.eraseFromParent();
1975     return true;
1976 }
1977 
1978 static MachineInstrBuilder extractF64Exponent(Register Hi,
1979                                               MachineIRBuilder &B) {
1980   const unsigned FractBits = 52;
1981   const unsigned ExpBits = 11;
1982   LLT S32 = LLT::scalar(32);
1983 
1984   auto Const0 = B.buildConstant(S32, FractBits - 32);
1985   auto Const1 = B.buildConstant(S32, ExpBits);
1986 
1987   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988     .addUse(Hi)
1989     .addUse(Const0.getReg(0))
1990     .addUse(Const1.getReg(0));
1991 
1992   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B) const {
1998   const LLT S1 = LLT::scalar(1);
1999   const LLT S32 = LLT::scalar(32);
2000   const LLT S64 = LLT::scalar(64);
2001 
2002   Register Src = MI.getOperand(1).getReg();
2003   assert(MRI.getType(Src) == S64);
2004 
2005   // TODO: Should this use extract since the low half is unused?
2006   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007   Register Hi = Unmerge.getReg(1);
2008 
2009   // Extract the upper half, since this is where we will find the sign and
2010   // exponent.
2011   auto Exp = extractF64Exponent(Hi, B);
2012 
2013   const unsigned FractBits = 52;
2014 
2015   // Extract the sign bit.
2016   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018 
2019   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020 
2021   const auto Zero32 = B.buildConstant(S32, 0);
2022 
2023   // Extend back to 64-bits.
2024   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025 
2026   auto Shr = B.buildAShr(S64, FractMask, Exp);
2027   auto Not = B.buildNot(S64, Shr);
2028   auto Tmp0 = B.buildAnd(S64, Src, Not);
2029   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030 
2031   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033 
2034   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeITOFP(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B, bool Signed) const {
2043 
2044   Register Dst = MI.getOperand(0).getReg();
2045   Register Src = MI.getOperand(1).getReg();
2046 
2047   const LLT S64 = LLT::scalar(64);
2048   const LLT S32 = LLT::scalar(32);
2049 
2050   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051 
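       // Convert the 64-bit integer piecewise: convert the high half (signed
       // or unsigned as appropriate), scale it by 2^32 with ldexp, and add in
       // the low half converted as unsigned (it is always non-negative).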
2052   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053 
2054   auto CvtHi = Signed ?
2055     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056     B.buildUITOFP(S64, Unmerge.getReg(1));
2057 
2058   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059 
2060   auto ThirtyTwo = B.buildConstant(S32, 32);
2061   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062     .addUse(CvtHi.getReg(0))
2063     .addUse(ThirtyTwo.getReg(0));
2064 
2065   // TODO: Should this propagate fast-math-flags?
2066   B.buildFAdd(Dst, LdExp, CvtLo);
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
2071 // TODO: Copied from DAG implementation. Verify logic and document how this
2072 // actually works.
2073 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2074                                         MachineRegisterInfo &MRI,
2075                                         MachineIRBuilder &B,
2076                                         bool Signed) const {
2077 
2078   Register Dst = MI.getOperand(0).getReg();
2079   Register Src = MI.getOperand(1).getReg();
2080 
2081   const LLT S64 = LLT::scalar(64);
2082   const LLT S32 = LLT::scalar(32);
2083 
2084   const LLT SrcLT = MRI.getType(Src);
2085   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2086 
2087   unsigned Flags = MI.getFlags();
2088 
2089   // The basic idea of converting a floating point number into a pair of 32-bit
2090   // integers is illustrated as follows:
2091   //
2092   //     tf := trunc(val);
2093   //    hif := floor(tf * 2^-32);
2094   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2095   //     hi := fptoi(hif);
2096   //     lo := fptoi(lof);
2097   //
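       // For example (with an f64 source), val = 2^33 + 5.0 gives
       // tf = 2^33 + 5, hif = 2, lof = 5, so the merged result is
       // hi:lo = {2, 5}, i.e. 2 * 2^32 + 5.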
2098   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2099   MachineInstrBuilder Sign;
2100   if (Signed && SrcLT == S32) {
2101     // However, a 32-bit floating point number has only a 23-bit mantissa,
2102     // which is not enough to hold all the significant bits of `lof` if val
2103     // is negative. To avoid losing precision, we take the absolute value
2104     // after truncating and flip the result back based on the original
2105     // signedness.
2106     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2107     Trunc = B.buildFAbs(S32, Trunc, Flags);
2108   }
2109   MachineInstrBuilder K0, K1;
2110   if (SrcLT == S64) {
2111     K0 = B.buildFConstant(S64,
2112                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2113     K1 = B.buildFConstant(S64,
2114                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2115   } else {
2116     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2117     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2118   }
2119 
2120   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2121   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2122   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2123 
2124   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2125                                      : B.buildFPTOUI(S32, FloorMul);
2126   auto Lo = B.buildFPTOUI(S32, Fma);
2127 
2128   if (Signed && SrcLT == S32) {
2129     // Flip the result based on Sign, which is either all 0s or all 1s.
2130     Sign = B.buildMerge(S64, {Sign, Sign});
2131     // r := xor({lo, hi}, sign) - sign;
2132     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2133   } else
2134     B.buildMerge(Dst, {Lo, Hi});
2135   MI.eraseFromParent();
2136 
2137   return true;
2138 }
2139 
2140 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2141                                                MachineInstr &MI) const {
2142   MachineFunction &MF = Helper.MIRBuilder.getMF();
2143   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2144 
2145   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2146                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2147 
2148   // With ieee_mode disabled, the instructions have the correct behavior
2149   // already for G_FMINNUM/G_FMAXNUM
2150   if (!MFI->getMode().IEEE)
2151     return !IsIEEEOp;
2152 
2153   if (IsIEEEOp)
2154     return true;
2155 
2156   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2157 }
2158 
2159 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2160   MachineInstr &MI, MachineRegisterInfo &MRI,
2161   MachineIRBuilder &B) const {
2162   // TODO: Should move some of this into LegalizerHelper.
2163 
2164   // TODO: Promote dynamic indexing of s16 to s32
2165 
2166   // FIXME: Artifact combiner probably should have replaced the truncated
2167   // constant before this, so we shouldn't need
2168   // getConstantVRegValWithLookThrough.
2169   Optional<ValueAndVReg> MaybeIdxVal =
2170       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2171   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2172     return true;
2173   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2174 
2175   Register Dst = MI.getOperand(0).getReg();
2176   Register Vec = MI.getOperand(1).getReg();
2177 
2178   LLT VecTy = MRI.getType(Vec);
2179   LLT EltTy = VecTy.getElementType();
2180   assert(EltTy == MRI.getType(Dst));
2181 
2182   if (IdxVal < VecTy.getNumElements())
2183     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2184   else
2185     B.buildUndef(Dst);
2186 
2187   MI.eraseFromParent();
2188   return true;
2189 }
2190 
2191 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2192   MachineInstr &MI, MachineRegisterInfo &MRI,
2193   MachineIRBuilder &B) const {
2194   // TODO: Should move some of this into LegalizerHelper.
2195 
2196   // TODO: Promote dynamic indexing of s16 to s32
2197 
2198   // FIXME: Artifact combiner probably should have replaced the truncated
2199   // constant before this, so we shouldn't need
2200   // getConstantVRegValWithLookThrough.
2201   Optional<ValueAndVReg> MaybeIdxVal =
2202       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2203   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2204     return true;
2205 
2206   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2207   Register Dst = MI.getOperand(0).getReg();
2208   Register Vec = MI.getOperand(1).getReg();
2209   Register Ins = MI.getOperand(2).getReg();
2210 
2211   LLT VecTy = MRI.getType(Vec);
2212   LLT EltTy = VecTy.getElementType();
2213   assert(EltTy == MRI.getType(Ins));
2214 
2215   if (IdxVal < VecTy.getNumElements())
2216     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2217   else
2218     B.buildUndef(Dst);
2219 
2220   MI.eraseFromParent();
2221   return true;
2222 }
2223 
2224 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2225   MachineInstr &MI, MachineRegisterInfo &MRI,
2226   MachineIRBuilder &B) const {
2227   const LLT V2S16 = LLT::vector(2, 16);
2228 
2229   Register Dst = MI.getOperand(0).getReg();
2230   Register Src0 = MI.getOperand(1).getReg();
2231   LLT DstTy = MRI.getType(Dst);
2232   LLT SrcTy = MRI.getType(Src0);
2233 
2234   if (SrcTy == V2S16 && DstTy == V2S16 &&
2235       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2236     return true;
2237 
2238   MachineIRBuilder HelperBuilder(MI);
2239   GISelObserverWrapper DummyObserver;
2240   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2241   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2242 }
2243 
2244 bool AMDGPULegalizerInfo::legalizeSinCos(
2245   MachineInstr &MI, MachineRegisterInfo &MRI,
2246   MachineIRBuilder &B) const {
2247 
2248   Register DstReg = MI.getOperand(0).getReg();
2249   Register SrcReg = MI.getOperand(1).getReg();
2250   LLT Ty = MRI.getType(DstReg);
2251   unsigned Flags = MI.getFlags();
2252 
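       // The hardware sin/cos take an input pre-scaled by 1/(2*pi), i.e. they
       // operate in units of full turns, hence the multiply by 0.5 * inv_pi.
       // Subtargets with a reduced trig range additionally need the scaled
       // value wrapped into [0, 1) with amdgcn.fract first.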
2253   Register TrigVal;
2254   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2255   if (ST.hasTrigReducedRange()) {
2256     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2257     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2258       .addUse(MulVal.getReg(0))
2259       .setMIFlags(Flags).getReg(0);
2260   } else
2261     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2262 
2263   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2264     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2265   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2266     .addUse(TrigVal)
2267     .setMIFlags(Flags);
2268   MI.eraseFromParent();
2269   return true;
2270 }
2271 
2272 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2273                                                   MachineIRBuilder &B,
2274                                                   const GlobalValue *GV,
2275                                                   int64_t Offset,
2276                                                   unsigned GAFlags) const {
2277   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2278   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2279   // to the following code sequence:
2280   //
2281   // For constant address space:
2282   //   s_getpc_b64 s[0:1]
2283   //   s_add_u32 s0, s0, $symbol
2284   //   s_addc_u32 s1, s1, 0
2285   //
2286   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2287   //   a fixup or relocation is emitted to replace $symbol with a literal
2288   //   constant, which is a pc-relative offset from the encoding of the $symbol
2289   //   operand to the global variable.
2290   //
2291   // For global address space:
2292   //   s_getpc_b64 s[0:1]
2293   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2294   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2295   //
2296   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2297   //   fixups or relocations are emitted to replace $symbol@*@lo and
2298   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2299   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2300   //   operand to the global variable.
2301   //
2302   // What we want here is an offset from the value returned by s_getpc
2303   // (which is the address of the s_add_u32 instruction) to the global
2304   // variable, but since the encoding of $symbol starts 4 bytes after the start
2305   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2306   // small. This requires us to add 4 to the global variable offset in order to
2307   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2308   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2309   // instruction.
2310 
2311   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2312 
2313   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2314     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2315 
2316   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2317     .addDef(PCReg);
2318 
2319   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2320   if (GAFlags == SIInstrInfo::MO_NONE)
2321     MIB.addImm(0);
2322   else
2323     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2324 
2325   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2326 
2327   if (PtrTy.getSizeInBits() == 32)
2328     B.buildExtract(DstReg, PCReg, 0);
2329   return true;
2330 }
2331 
2332 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2333   MachineInstr &MI, MachineRegisterInfo &MRI,
2334   MachineIRBuilder &B) const {
2335   Register DstReg = MI.getOperand(0).getReg();
2336   LLT Ty = MRI.getType(DstReg);
2337   unsigned AS = Ty.getAddressSpace();
2338 
2339   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2340   MachineFunction &MF = B.getMF();
2341   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2342 
2343   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2344     if (!MFI->isModuleEntryFunction() &&
2345         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2346       const Function &Fn = MF.getFunction();
2347       DiagnosticInfoUnsupported BadLDSDecl(
2348         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2349         DS_Warning);
2350       Fn.getContext().diagnose(BadLDSDecl);
2351 
2352       // We currently don't have a way to correctly allocate LDS objects that
2353       // aren't directly associated with a kernel. We do force inlining of
2354       // functions that use local objects. However, if these dead functions are
2355       // not eliminated, we don't want a compile time error. Just emit a warning
2356       // and a trap, since there should be no callable path here.
2357       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2358       B.buildUndef(DstReg);
2359       MI.eraseFromParent();
2360       return true;
2361     }
2362 
2363     // TODO: We could emit code to handle the initialization somewhere.
2364     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2365       const SITargetLowering *TLI = ST.getTargetLowering();
2366       if (!TLI->shouldUseLDSConstAddress(GV)) {
2367         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2368         return true; // Leave in place;
2369       }
2370 
2371       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2372         Type *Ty = GV->getValueType();
2373         // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2374         // zero-sized type in other languages) to declare dynamic shared
2375         // memory whose size is not known at compile time. Such arrays are
2376         // allocated by the runtime and placed directly after the statically
2377         // allocated ones, so they all share the same offset.
2378         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2379           // Adjust alignment for that dynamic shared memory array.
2380           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2381           LLT S32 = LLT::scalar(32);
2382           auto Sz =
2383               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2384           B.buildIntToPtr(DstReg, Sz);
2385           MI.eraseFromParent();
2386           return true;
2387         }
2388       }
2389 
2390       B.buildConstant(
2391           DstReg,
2392           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2393       MI.eraseFromParent();
2394       return true;
2395     }
2396 
2397     const Function &Fn = MF.getFunction();
2398     DiagnosticInfoUnsupported BadInit(
2399       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2400     Fn.getContext().diagnose(BadInit);
2401     return true;
2402   }
2403 
2404   const SITargetLowering *TLI = ST.getTargetLowering();
2405 
2406   if (TLI->shouldEmitFixup(GV)) {
2407     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2408     MI.eraseFromParent();
2409     return true;
2410   }
2411 
2412   if (TLI->shouldEmitPCReloc(GV)) {
2413     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2414     MI.eraseFromParent();
2415     return true;
2416   }
2417 
2418   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2419   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2420 
2421   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2422       MachinePointerInfo::getGOT(MF),
2423       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2424           MachineMemOperand::MOInvariant,
2425       8 /*Size*/, Align(8));
2426 
2427   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2428 
2429   if (Ty.getSizeInBits() == 32) {
2430     // Truncate if this is a 32-bit constant address.
2431     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2432     B.buildExtract(DstReg, Load, 0);
2433   } else
2434     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2435 
2436   MI.eraseFromParent();
2437   return true;
2438 }
2439 
2440 static LLT widenToNextPowerOf2(LLT Ty) {
2441   if (Ty.isVector())
2442     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2443   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2444 }
2445 
2446 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2447                                        MachineInstr &MI) const {
2448   MachineIRBuilder &B = Helper.MIRBuilder;
2449   MachineRegisterInfo &MRI = *B.getMRI();
2450   GISelChangeObserver &Observer = Helper.Observer;
2451 
2452   Register PtrReg = MI.getOperand(1).getReg();
2453   LLT PtrTy = MRI.getType(PtrReg);
2454   unsigned AddrSpace = PtrTy.getAddressSpace();
2455 
2456   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2457     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2458     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2459     Observer.changingInstr(MI);
2460     MI.getOperand(1).setReg(Cast.getReg(0));
2461     Observer.changedInstr(MI);
2462     return true;
2463   }
2464 
2465   if (MI.getOpcode() != AMDGPU::G_LOAD)
2466     return false;
2467 
2468   Register ValReg = MI.getOperand(0).getReg();
2469   LLT ValTy = MRI.getType(ValReg);
2470 
2471   MachineMemOperand *MMO = *MI.memoperands_begin();
2472   const unsigned ValSize = ValTy.getSizeInBits();
2473   const unsigned MemSize = 8 * MMO->getSize();
2474   const Align MemAlign = MMO->getAlign();
2475   const unsigned AlignInBits = 8 * MemAlign.value();
2476 
2477   // Widen non-power-of-2 loads to the alignment if needed
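       // For example, a 96-bit <3 x s32> load may become a single 128-bit
       // load when the alignment allows it; the extra element is then dropped
       // with a G_EXTRACT (or an unmerge for cases like <3 x s16>).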
2478   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2479     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2480 
2481     // This was already the correct extending load result type, so just adjust
2482     // the memory type.
2483     if (WideMemSize == ValSize) {
2484       MachineFunction &MF = B.getMF();
2485 
2486       MachineMemOperand *WideMMO =
2487           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2488       Observer.changingInstr(MI);
2489       MI.setMemRefs(MF, {WideMMO});
2490       Observer.changedInstr(MI);
2491       return true;
2492     }
2493 
2494     // Don't bother with an edge case that should probably never be produced.
2495     if (ValSize > WideMemSize)
2496       return false;
2497 
2498     LLT WideTy = widenToNextPowerOf2(ValTy);
2499 
2500     Register WideLoad;
2501     if (!WideTy.isVector()) {
2502       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2503       B.buildTrunc(ValReg, WideLoad).getReg(0);
2504     } else {
2505       // Extract the subvector.
2506 
2507       if (isRegisterType(ValTy)) {
2508         // If this is a case where G_EXTRACT is legal, use it.
2509         // (e.g. <3 x s32> -> <4 x s32>)
2510         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2511         B.buildExtract(ValReg, WideLoad, 0);
2512       } else {
2513         // For cases where the widened type isn't a nice register value, unmerge
2514         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2515         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2516         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2517         B.setInsertPt(B.getMBB(), MI.getIterator());
2518         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2519       }
2520     }
2521 
2522     MI.eraseFromParent();
2523     return true;
2524   }
2525 
2526   return false;
2527 }
2528 
2529 bool AMDGPULegalizerInfo::legalizeFMad(
2530   MachineInstr &MI, MachineRegisterInfo &MRI,
2531   MachineIRBuilder &B) const {
2532   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2533   assert(Ty.isScalar());
2534 
2535   MachineFunction &MF = B.getMF();
2536   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2537 
2538   // TODO: Always legal with future ftz flag.
2539   // FIXME: Do we only need to check the output type?
2540   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2541     return true;
2542   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2543     return true;
2544 
2545   MachineIRBuilder HelperBuilder(MI);
2546   GISelObserverWrapper DummyObserver;
2547   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2548   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2549 }
2550 
2551 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2552   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2553   Register DstReg = MI.getOperand(0).getReg();
2554   Register PtrReg = MI.getOperand(1).getReg();
2555   Register CmpVal = MI.getOperand(2).getReg();
2556   Register NewVal = MI.getOperand(3).getReg();
2557 
2558   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2559          "this should not have been custom lowered");
2560 
2561   LLT ValTy = MRI.getType(CmpVal);
2562   LLT VecTy = LLT::vector(2, ValTy);
2563 
2564   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2565 
2566   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2567     .addDef(DstReg)
2568     .addUse(PtrReg)
2569     .addUse(PackedVal)
2570     .setMemRefs(MI.memoperands());
2571 
2572   MI.eraseFromParent();
2573   return true;
2574 }
2575 
2576 bool AMDGPULegalizerInfo::legalizeFlog(
2577   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2578   Register Dst = MI.getOperand(0).getReg();
2579   Register Src = MI.getOperand(1).getReg();
2580   LLT Ty = B.getMRI()->getType(Dst);
2581   unsigned Flags = MI.getFlags();
2582 
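       // log_b(x) == log2(x) * (1 / log2(b)) == log2(x) * (ln(2) / ln(b));
       // the caller passes that constant as Log2BaseInverted (ln(2) for
       // G_FLOG, ln(2) / ln(10) for G_FLOG10).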
2583   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2584   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2585 
2586   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2587   MI.eraseFromParent();
2588   return true;
2589 }
2590 
2591 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2592                                        MachineIRBuilder &B) const {
2593   Register Dst = MI.getOperand(0).getReg();
2594   Register Src = MI.getOperand(1).getReg();
2595   unsigned Flags = MI.getFlags();
2596   LLT Ty = B.getMRI()->getType(Dst);
2597 
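       // exp(x) == exp2(x * log2(e)), so scale the input by log2(e) and emit
       // a G_FEXP2.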
2598   auto K = B.buildFConstant(Ty, numbers::log2e);
2599   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2600   B.buildFExp2(Dst, Mul, Flags);
2601   MI.eraseFromParent();
2602   return true;
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2606                                        MachineIRBuilder &B) const {
2607   Register Dst = MI.getOperand(0).getReg();
2608   Register Src0 = MI.getOperand(1).getReg();
2609   Register Src1 = MI.getOperand(2).getReg();
2610   unsigned Flags = MI.getFlags();
2611   LLT Ty = B.getMRI()->getType(Dst);
2612   const LLT S16 = LLT::scalar(16);
2613   const LLT S32 = LLT::scalar(32);
2614 
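       // pow(x, y) == exp2(y * log2(x)). The intermediate multiply uses the
       // "legacy" multiply intrinsic, where +/-0 times anything is zero.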
2615   if (Ty == S32) {
2616     auto Log = B.buildFLog2(S32, Src0, Flags);
2617     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2618       .addUse(Log.getReg(0))
2619       .addUse(Src1)
2620       .setMIFlags(Flags);
2621     B.buildFExp2(Dst, Mul, Flags);
2622   } else if (Ty == S16) {
2623     // There's no f16 fmul_legacy, so we need to convert for it.
2624     auto Log = B.buildFLog2(S16, Src0, Flags);
2625     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2626     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2627     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2628       .addUse(Ext0.getReg(0))
2629       .addUse(Ext1.getReg(0))
2630       .setMIFlags(Flags);
2631 
2632     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2633   } else
2634     return false;
2635 
2636   MI.eraseFromParent();
2637   return true;
2638 }
2639 
2640 // Find a source register, ignoring any possible source modifiers.
2641 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2642   Register ModSrc = OrigSrc;
2643   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2644     ModSrc = SrcFNeg->getOperand(1).getReg();
2645     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2646       ModSrc = SrcFAbs->getOperand(1).getReg();
2647   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2648     ModSrc = SrcFAbs->getOperand(1).getReg();
2649   return ModSrc;
2650 }
2651 
2652 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2653                                          MachineRegisterInfo &MRI,
2654                                          MachineIRBuilder &B) const {
2655 
2656   const LLT S1 = LLT::scalar(1);
2657   const LLT S64 = LLT::scalar(64);
2658   Register Dst = MI.getOperand(0).getReg();
2659   Register OrigSrc = MI.getOperand(1).getReg();
2660   unsigned Flags = MI.getFlags();
2661   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2662          "this should not have been custom lowered");
2663 
2664   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2665   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2666   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2667   // V_FRACT bug is:
2668   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2669   //
2670   // Convert floor(x) to (x - fract(x))
2671 
2672   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2673     .addUse(OrigSrc)
2674     .setMIFlags(Flags);
2675 
2676   // Give source modifier matching some assistance before obscuring a foldable
2677   // pattern.
2678 
2679   // TODO: Can we avoid the neg on the fract? The input sign to fract
2680   // shouldn't matter.
2681   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2682 
2683   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2684 
2685   Register Min = MRI.createGenericVirtualRegister(S64);
2686 
2687   // We don't need to concern ourselves with the snan handling difference, so
2688   // use the one which will directly select.
2689   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2690   if (MFI->getMode().IEEE)
2691     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2692   else
2693     B.buildFMinNum(Min, Fract, Const, Flags);
2694 
2695   Register CorrectedFract = Min;
2696   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2697     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2698     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2699   }
2700 
2701   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2702   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2703 
2704   MI.eraseFromParent();
2705   return true;
2706 }
2707 
2708 // Turn an illegal packed v2s16 build vector into bit operations.
2709 // TODO: This should probably be a bitcast action in LegalizerHelper.
2710 bool AMDGPULegalizerInfo::legalizeBuildVector(
2711   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2712   Register Dst = MI.getOperand(0).getReg();
2713   const LLT S32 = LLT::scalar(32);
2714   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2715 
2716   Register Src0 = MI.getOperand(1).getReg();
2717   Register Src1 = MI.getOperand(2).getReg();
2718   assert(MRI.getType(Src0) == LLT::scalar(16));
2719 
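       // Pack the two halves with a 32-bit merge and bitcast back to
       // <2 x s16>: Src0 ends up in bits [15:0] and Src1 in bits [31:16].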
2720   auto Merge = B.buildMerge(S32, {Src0, Src1});
2721   B.buildBitcast(Dst, Merge);
2722 
2723   MI.eraseFromParent();
2724   return true;
2725 }
2726 
2727 // Check that this is a G_XOR x, -1
2728 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2729   if (MI.getOpcode() != TargetOpcode::G_XOR)
2730     return false;
2731   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2732   return ConstVal && *ConstVal == -1;
2733 }
2734 
2735 // Return the use branch instruction, otherwise null if the usage is invalid.
2736 static MachineInstr *
2737 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2738                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2739   Register CondDef = MI.getOperand(0).getReg();
2740   if (!MRI.hasOneNonDBGUse(CondDef))
2741     return nullptr;
2742 
2743   MachineBasicBlock *Parent = MI.getParent();
2744   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2745 
2746   if (isNot(MRI, *UseMI)) {
2747     Register NegatedCond = UseMI->getOperand(0).getReg();
2748     if (!MRI.hasOneNonDBGUse(NegatedCond))
2749       return nullptr;
2750 
2751     // We're deleting the def of this value, so we need to remove it.
2752     UseMI->eraseFromParent();
2753 
2754     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2755     Negated = true;
2756   }
2757 
2758   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2759     return nullptr;
2760 
2761   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2762   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2763   if (Next == Parent->end()) {
2764     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2765     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2766       return nullptr;
2767     UncondBrTarget = &*NextMBB;
2768   } else {
2769     if (Next->getOpcode() != AMDGPU::G_BR)
2770       return nullptr;
2771     Br = &*Next;
2772     UncondBrTarget = Br->getOperand(0).getMBB();
2773   }
2774 
2775   return UseMI;
2776 }
2777 
2778 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2779                                          const ArgDescriptor *Arg,
2780                                          const TargetRegisterClass *ArgRC,
2781                                          LLT ArgTy) const {
2782   MCRegister SrcReg = Arg->getRegister();
2783   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2784   assert(DstReg.isVirtual() && "Virtual register expected");
2785 
2786   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2787                                              ArgTy);
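       // A masked argument is a bitfield packed into a single physical
       // register; extract it by shifting the field down and masking off the
       // other fields.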
2788   if (Arg->isMasked()) {
2789     // TODO: Should we try to emit this once in the entry block?
2790     const LLT S32 = LLT::scalar(32);
2791     const unsigned Mask = Arg->getMask();
2792     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2793 
2794     Register AndMaskSrc = LiveIn;
2795 
2796     if (Shift != 0) {
2797       auto ShiftAmt = B.buildConstant(S32, Shift);
2798       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2799     }
2800 
2801     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2802   } else {
2803     B.buildCopy(DstReg, LiveIn);
2804   }
2805 
2806   return true;
2807 }
2808 
2809 bool AMDGPULegalizerInfo::loadInputValue(
2810     Register DstReg, MachineIRBuilder &B,
2811     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2812   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2813   const ArgDescriptor *Arg;
2814   const TargetRegisterClass *ArgRC;
2815   LLT ArgTy;
2816   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2817 
2818   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2819     return false; // TODO: Handle these
2820   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2821 }
2822 
2823 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2824     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2825     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2826   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2827     return false;
2828 
2829   MI.eraseFromParent();
2830   return true;
2831 }
2832 
2833 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2834                                        MachineRegisterInfo &MRI,
2835                                        MachineIRBuilder &B) const {
2836   Register Dst = MI.getOperand(0).getReg();
2837   LLT DstTy = MRI.getType(Dst);
2838   LLT S16 = LLT::scalar(16);
2839   LLT S32 = LLT::scalar(32);
2840   LLT S64 = LLT::scalar(64);
2841 
2842   if (DstTy == S16)
2843     return legalizeFDIV16(MI, MRI, B);
2844   if (DstTy == S32)
2845     return legalizeFDIV32(MI, MRI, B);
2846   if (DstTy == S64)
2847     return legalizeFDIV64(MI, MRI, B);
2848 
2849   return false;
2850 }
2851 
2852 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2853                                                         Register DstDivReg,
2854                                                         Register DstRemReg,
2855                                                         Register X,
2856                                                         Register Y) const {
2857   const LLT S1 = LLT::scalar(1);
2858   const LLT S32 = LLT::scalar(32);
2859 
2860   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2861   // algorithm used here.
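  // In short: estimate z ~= 2^32 / y with v_rcp_iflag_f32, refine it with one
  // Newton-Raphson step in 32-bit fixed point, take q = umulh(x, z) and
  // r = x - q * y, then apply the two conditional +1 / -y correction rounds
  // below.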
2862 
2863   // Initial estimate of inv(y).
2864   auto FloatY = B.buildUITOFP(S32, Y);
2865   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2866   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2867   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2868   auto Z = B.buildFPTOUI(S32, ScaledY);
2869 
2870   // One round of UNR.
2871   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2872   auto NegYZ = B.buildMul(S32, NegY, Z);
2873   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2874 
2875   // Quotient/remainder estimate.
2876   auto Q = B.buildUMulH(S32, X, Z);
2877   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2878 
2879   // First quotient/remainder refinement.
2880   auto One = B.buildConstant(S32, 1);
2881   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2882   if (DstDivReg)
2883     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2884   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2885 
2886   // Second quotient/remainder refinement.
2887   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2888   if (DstDivReg)
2889     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2890 
2891   if (DstRemReg)
2892     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2893 }
2894 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2896 //
2897 // Return lo, hi of result
2898 //
2899 // %cvt.lo = G_UITOFP Val.lo
2900 // %cvt.hi = G_UITOFP Val.hi
2901 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2902 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2903 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2904 // %mul2 = G_FMUL %mul1, 2**(-32)
2905 // %trunc = G_INTRINSIC_TRUNC %mul2
2906 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2907 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
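//
// The float constants below encode powers of two: 0x4f800000 is 2^32,
// 0x2f800000 is 2^-32, 0xcf800000 is -(2^32), and 0x5f7ffffc sits just below
// 2^64 (presumably kept slightly low so the estimate never overshoots).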
2908 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2909                                                        Register Val) {
2910   const LLT S32 = LLT::scalar(32);
2911   auto Unmerge = B.buildUnmerge(S32, Val);
2912 
2913   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2914   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2915 
2916   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2917                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2918 
2919   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2920   auto Mul1 =
2921       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2922 
2923   // 2**(-32)
2924   auto Mul2 =
2925       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2926   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2927 
2928   // -(2**32)
2929   auto Mad2 = B.buildFMAD(S32, Trunc,
2930                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2931 
2932   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2933   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2934 
2935   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2936 }
2937 
2938 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2939                                                         Register DstDivReg,
2940                                                         Register DstRemReg,
2941                                                         Register Numer,
2942                                                         Register Denom) const {
2943   const LLT S32 = LLT::scalar(32);
2944   const LLT S64 = LLT::scalar(64);
2945   const LLT S1 = LLT::scalar(1);
2946   Register RcpLo, RcpHi;
2947 
2948   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2949 
2950   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2951 
2952   auto Zero64 = B.buildConstant(S64, 0);
2953   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2954 
2955   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2956   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2957 
2958   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2959   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2960   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2961 
2962   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2963   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2964   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2965   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2966 
2967   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2968   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2969   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2970   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2971   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2972 
2973   auto Zero32 = B.buildConstant(S32, 0);
2974   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2975   auto Add2_HiC =
2976       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2977   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2978   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2979 
2980   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2981   Register NumerLo = UnmergeNumer.getReg(0);
2982   Register NumerHi = UnmergeNumer.getReg(1);
2983 
2984   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2985   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2986   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2987   Register Mul3_Lo = UnmergeMul3.getReg(0);
2988   Register Mul3_Hi = UnmergeMul3.getReg(1);
2989   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2990   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2991   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2992   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2993 
2994   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2995   Register DenomLo = UnmergeDenom.getReg(0);
2996   Register DenomHi = UnmergeDenom.getReg(1);
2997 
2998   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2999   auto C1 = B.buildSExt(S32, CmpHi);
3000 
3001   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3002   auto C2 = B.buildSExt(S32, CmpLo);
3003 
3004   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3005   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3006 
  // TODO: The code here and below could be enclosed in if/endif blocks.
  // Currently control flow is unconditional, and the 4 selects after the
  // potential endif substitute for PHIs.
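  // Each "if" below corresponds to one correction round: C3 decides whether
  // the first add-1 / subtract-denominator correction applies, and C6 whether
  // a second one does.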
3010 
3011   // if C3 != 0 ...
3012   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3013   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3014   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3015   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3016 
3017   auto One64 = B.buildConstant(S64, 1);
3018   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3019 
3020   auto C4 =
3021       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3022   auto C5 =
3023       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3024   auto C6 = B.buildSelect(
3025       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3026 
3027   // if (C6 != 0)
3028   auto Add4 = B.buildAdd(S64, Add3, One64);
3029   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3030 
3031   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3032   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3033   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3034 
3035   // endif C6
3036   // endif C3
3037 
3038   if (DstDivReg) {
3039     auto Sel1 = B.buildSelect(
3040         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3041     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3042                   Sel1, MulHi3);
3043   }
3044 
3045   if (DstRemReg) {
3046     auto Sel2 = B.buildSelect(
3047         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3048     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3049                   Sel2, Sub1);
3050   }
3051 }
3052 
3053 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3054                                                   MachineRegisterInfo &MRI,
3055                                                   MachineIRBuilder &B) const {
3056   Register DstDivReg, DstRemReg;
3057   switch (MI.getOpcode()) {
3058   default:
3059     llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV:
    DstDivReg = MI.getOperand(0).getReg();
    break;
  case AMDGPU::G_UREM:
    DstRemReg = MI.getOperand(0).getReg();
    break;
  case AMDGPU::G_UDIVREM:
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
3074 
3075   const LLT S64 = LLT::scalar(64);
3076   const LLT S32 = LLT::scalar(32);
3077   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3078   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3079   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3080   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3081 
3082   if (Ty == S32)
3083     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3084   else if (Ty == S64)
3085     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3086   else
3087     return false;
3088 
3089   MI.eraseFromParent();
3090   return true;
3091 }
3092 
3093 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3094                                                 MachineRegisterInfo &MRI,
3095                                                 MachineIRBuilder &B) const {
3096   const LLT S64 = LLT::scalar(64);
3097   const LLT S32 = LLT::scalar(32);
3098 
3099   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3100   if (Ty != S32 && Ty != S64)
3101     return false;
3102 
3103   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3104   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3105   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3106 
3107   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3108   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3109   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3110 
3111   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3112   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3113 
3114   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3115   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
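  // The add/xor pairs above compute absolute values: with Sign = X >> (bits-1)
  // being 0 or -1, (X + Sign) ^ Sign gives |X|. E.g. X = -5: Sign = -1,
  // (-5 + -1) ^ -1 = 5; for non-negative X, Sign = 0 and X is unchanged.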
3116 
3117   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3118   switch (MI.getOpcode()) {
3119   default:
3120     llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV:
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  case AMDGPU::G_SREM:
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  case AMDGPU::G_SDIVREM:
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
3138   }
3139 
3140   if (Ty == S32)
3141     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3142   else
3143     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
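  // Whichever of TmpDivReg/TmpRemReg were created now hold |LHS| / |RHS| and
  // |LHS| % |RHS|. Restore the signs below: the quotient takes
  // sign(LHS) ^ sign(RHS), the remainder takes sign(LHS); with S being 0 or
  // -1, (V ^ S) - S conditionally negates V.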
3144 
3145   if (DstDivReg) {
3146     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3147     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3148     B.buildSub(DstDivReg, SignXor, Sign);
3149   }
3150 
3151   if (DstRemReg) {
3152     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3153     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3154     B.buildSub(DstRemReg, SignXor, Sign);
3155   }
3156 
3157   MI.eraseFromParent();
3158   return true;
3159 }
3160 
3161 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3162                                                  MachineRegisterInfo &MRI,
3163                                                  MachineIRBuilder &B) const {
3164   Register Res = MI.getOperand(0).getReg();
3165   Register LHS = MI.getOperand(1).getReg();
3166   Register RHS = MI.getOperand(2).getReg();
3167   uint16_t Flags = MI.getFlags();
3168   LLT ResTy = MRI.getType(Res);
3169 
3170   const MachineFunction &MF = B.getMF();
3171   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3172                             MI.getFlag(MachineInstr::FmAfn);
3173 
3174   if (!AllowInaccurateRcp)
3175     return false;
3176 
3177   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3178     // 1 / x -> RCP(x)
3179     if (CLHS->isExactlyValue(1.0)) {
3180       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3181         .addUse(RHS)
3182         .setMIFlags(Flags);
3183 
3184       MI.eraseFromParent();
3185       return true;
3186     }
3187 
3188     // -1 / x -> RCP( FNEG(x) )
3189     if (CLHS->isExactlyValue(-1.0)) {
3190       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3191       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3192         .addUse(FNeg.getReg(0))
3193         .setMIFlags(Flags);
3194 
3195       MI.eraseFromParent();
3196       return true;
3197     }
3198   }
3199 
3200   // x / y -> x * (1.0 / y)
3201   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3202     .addUse(RHS)
3203     .setMIFlags(Flags);
3204   B.buildFMul(Res, LHS, RCP, Flags);
3205 
3206   MI.eraseFromParent();
3207   return true;
3208 }
3209 
3210 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3211                                                    MachineRegisterInfo &MRI,
3212                                                    MachineIRBuilder &B) const {
3213   Register Res = MI.getOperand(0).getReg();
3214   Register X = MI.getOperand(1).getReg();
3215   Register Y = MI.getOperand(2).getReg();
3216   uint16_t Flags = MI.getFlags();
3217   LLT ResTy = MRI.getType(Res);
3218 
3219   const MachineFunction &MF = B.getMF();
3220   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3221                             MI.getFlag(MachineInstr::FmAfn);
3222 
3223   if (!AllowInaccurateRcp)
3224     return false;
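  // Expand to a plain Newton-Raphson sequence: refine r ~= 1/y twice via
  // r' = r + r * (1 - y * r), then compute q = x * r and apply one residual
  // correction q + r * (x - y * q).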
3225 
3226   auto NegY = B.buildFNeg(ResTy, Y);
3227   auto One = B.buildFConstant(ResTy, 1.0);
3228 
3229   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3230     .addUse(Y)
3231     .setMIFlags(Flags);
3232 
3233   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3234   R = B.buildFMA(ResTy, Tmp0, R, R);
3235 
3236   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3237   R = B.buildFMA(ResTy, Tmp1, R, R);
3238 
3239   auto Ret = B.buildFMul(ResTy, X, R);
3240   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3241 
3242   B.buildFMA(Res, Tmp2, R, Ret);
3243   MI.eraseFromParent();
3244   return true;
3245 }
3246 
3247 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3248                                          MachineRegisterInfo &MRI,
3249                                          MachineIRBuilder &B) const {
3250   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3251     return true;
3252 
3253   Register Res = MI.getOperand(0).getReg();
3254   Register LHS = MI.getOperand(1).getReg();
3255   Register RHS = MI.getOperand(2).getReg();
3256 
3257   uint16_t Flags = MI.getFlags();
3258 
3259   LLT S16 = LLT::scalar(16);
3260   LLT S32 = LLT::scalar(32);
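  // f16 division is performed in f32: extend both operands, multiply the
  // numerator by rcp of the denominator, truncate back to f16, and let
  // amdgcn.div.fixup produce the final result.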
3261 
3262   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3263   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3264 
3265   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3266     .addUse(RHSExt.getReg(0))
3267     .setMIFlags(Flags);
3268 
3269   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3270   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3271 
3272   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3273     .addUse(RDst.getReg(0))
3274     .addUse(RHS)
3275     .addUse(LHS)
3276     .setMIFlags(Flags);
3277 
3278   MI.eraseFromParent();
3279   return true;
3280 }
3281 
3282 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3283 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3284 static void toggleSPDenormMode(bool Enable,
3285                                MachineIRBuilder &B,
3286                                const GCNSubtarget &ST,
3287                                AMDGPU::SIModeRegisterDefaults Mode) {
3288   // Set SP denorm mode to this value.
3289   unsigned SPDenormMode =
3290     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3291 
3292   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3294     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
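    // The S_DENORM_MODE immediate packs both fields: bits [1:0] are the FP32
    // denorm controls, bits [3:2] the FP64/FP16 controls.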
3295 
3296     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3297     B.buildInstr(AMDGPU::S_DENORM_MODE)
3298       .addImm(NewDenormModeValue);
3299 
3300   } else {
3301     // Select FP32 bit field in mode register.
3302     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3303                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3304                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
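    // i.e. hwreg(HW_REG_MODE, 4, 2): the 2-bit FP32 denorm field of the MODE
    // register.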
3305 
3306     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3307       .addImm(SPDenormMode)
3308       .addImm(SPDenormModeBitField);
3309   }
3310 }
3311 
3312 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3313                                          MachineRegisterInfo &MRI,
3314                                          MachineIRBuilder &B) const {
3315   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3316     return true;
3317 
3318   Register Res = MI.getOperand(0).getReg();
3319   Register LHS = MI.getOperand(1).getReg();
3320   Register RHS = MI.getOperand(2).getReg();
3321   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3322   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3323 
3324   uint16_t Flags = MI.getFlags();
3325 
3326   LLT S32 = LLT::scalar(32);
3327   LLT S1 = LLT::scalar(1);
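  // The expansion below follows the usual AMDGPU f32 division scheme:
  // div_scale pre-scales the operands, a Newton-Raphson style FMA chain
  // refines rcp of the scaled denominator, div_fmas applies the
  // scale-dependent correction using div_scale's condition output, and
  // div_fixup handles the remaining special cases.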
3328 
3329   auto One = B.buildFConstant(S32, 1.0f);
3330 
3331   auto DenominatorScaled =
3332     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3333       .addUse(LHS)
3334       .addUse(RHS)
3335       .addImm(0)
3336       .setMIFlags(Flags);
3337   auto NumeratorScaled =
3338     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3339       .addUse(LHS)
3340       .addUse(RHS)
3341       .addImm(1)
3342       .setMIFlags(Flags);
3343 
3344   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3345     .addUse(DenominatorScaled.getReg(0))
3346     .setMIFlags(Flags);
3347   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3348 
3349   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3350   // aren't modeled as reading it.
3351   if (!Mode.allFP32Denormals())
3352     toggleSPDenormMode(true, B, ST, Mode);
3353 
3354   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3355   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3356   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3357   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3358   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3359   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3360 
3361   if (!Mode.allFP32Denormals())
3362     toggleSPDenormMode(false, B, ST, Mode);
3363 
3364   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3365     .addUse(Fma4.getReg(0))
3366     .addUse(Fma1.getReg(0))
3367     .addUse(Fma3.getReg(0))
3368     .addUse(NumeratorScaled.getReg(1))
3369     .setMIFlags(Flags);
3370 
3371   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3372     .addUse(Fmas.getReg(0))
3373     .addUse(RHS)
3374     .addUse(LHS)
3375     .setMIFlags(Flags);
3376 
3377   MI.eraseFromParent();
3378   return true;
3379 }
3380 
3381 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3382                                          MachineRegisterInfo &MRI,
3383                                          MachineIRBuilder &B) const {
3384   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3385     return true;
3386 
3387   Register Res = MI.getOperand(0).getReg();
3388   Register LHS = MI.getOperand(1).getReg();
3389   Register RHS = MI.getOperand(2).getReg();
3390 
3391   uint16_t Flags = MI.getFlags();
3392 
3393   LLT S64 = LLT::scalar(64);
3394   LLT S1 = LLT::scalar(1);
3395 
3396   auto One = B.buildFConstant(S64, 1.0);
3397 
3398   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3399     .addUse(LHS)
3400     .addUse(RHS)
3401     .addImm(0)
3402     .setMIFlags(Flags);
3403 
3404   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3405 
3406   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3407     .addUse(DivScale0.getReg(0))
3408     .setMIFlags(Flags);
3409 
3410   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3411   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3412   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3413 
3414   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3415     .addUse(LHS)
3416     .addUse(RHS)
3417     .addImm(1)
3418     .setMIFlags(Flags);
3419 
3420   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3421   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3422   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3423 
3424   Register Scale;
3425   if (!ST.hasUsableDivScaleConditionOutput()) {
3426     // Workaround a hardware bug on SI where the condition output from div_scale
3427     // is not usable.
3428 
3429     LLT S32 = LLT::scalar(32);
3430 
3431     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3432     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3433     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3434     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3435 
3436     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3437                               Scale1Unmerge.getReg(1));
3438     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3439                               Scale0Unmerge.getReg(1));
3440     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3441   } else {
3442     Scale = DivScale1.getReg(1);
3443   }
3444 
3445   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3446     .addUse(Fma4.getReg(0))
3447     .addUse(Fma3.getReg(0))
3448     .addUse(Mul.getReg(0))
3449     .addUse(Scale)
3450     .setMIFlags(Flags);
3451 
3452   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3453     .addUse(Fmas.getReg(0))
3454     .addUse(RHS)
3455     .addUse(LHS)
3456     .setMIFlags(Flags);
3457 
3458   MI.eraseFromParent();
3459   return true;
3460 }
3461 
3462 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3463                                                  MachineRegisterInfo &MRI,
3464                                                  MachineIRBuilder &B) const {
3465   Register Res = MI.getOperand(0).getReg();
3466   Register LHS = MI.getOperand(2).getReg();
3467   Register RHS = MI.getOperand(3).getReg();
3468   uint16_t Flags = MI.getFlags();
3469 
3470   LLT S32 = LLT::scalar(32);
3471   LLT S1 = LLT::scalar(1);
3472 
3473   auto Abs = B.buildFAbs(S32, RHS, Flags);
3474   const APFloat C0Val(1.0f);
3475 
3476   auto C0 = B.buildConstant(S32, 0x6f800000);
3477   auto C1 = B.buildConstant(S32, 0x2f800000);
3478   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
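  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: for very large |RHS|,
  // pre-scale the denominator by 2^-32 (presumably to keep rcp's result out of
  // the denormal range) and scale the final product by the same factor, which
  // cancels out: Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS.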
3479 
3480   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3481   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3482 
3483   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3484 
3485   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3486     .addUse(Mul0.getReg(0))
3487     .setMIFlags(Flags);
3488 
3489   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3490 
3491   B.buildFMul(Res, Sel, Mul1, Flags);
3492 
3493   MI.eraseFromParent();
3494   return true;
3495 }
3496 
3497 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3498 // FIXME: Why do we handle this one but not other removed instructions?
3499 //
3500 // Reciprocal square root.  The clamp prevents infinite results, clamping
3501 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3502 // +-max_float.
3503 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3504                                                     MachineRegisterInfo &MRI,
3505                                                     MachineIRBuilder &B) const {
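  // Targets before Volcanic Islands still have the rsq_clamp instructions, so
  // leave the intrinsic alone for selection there.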
3506   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3507     return true;
3508 
3509   Register Dst = MI.getOperand(0).getReg();
3510   Register Src = MI.getOperand(2).getReg();
3511   auto Flags = MI.getFlags();
3512 
3513   LLT Ty = MRI.getType(Dst);
3514 
3515   const fltSemantics *FltSemantics;
3516   if (Ty == LLT::scalar(32))
3517     FltSemantics = &APFloat::IEEEsingle();
3518   else if (Ty == LLT::scalar(64))
3519     FltSemantics = &APFloat::IEEEdouble();
3520   else
3521     return false;
3522 
3523   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3524     .addUse(Src)
3525     .setMIFlags(Flags);
3526 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted the input (or not); just use the min/max
  // variant that will directly select.
3529   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3530   const bool UseIEEE = MFI->getMode().IEEE;
3531 
3532   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3533   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3534                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3535 
3536   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3537 
3538   if (UseIEEE)
3539     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3540   else
3541     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3542   MI.eraseFromParent();
3543   return true;
3544 }
3545 
3546 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3547   switch (IID) {
3548   case Intrinsic::amdgcn_ds_fadd:
3549     return AMDGPU::G_ATOMICRMW_FADD;
3550   case Intrinsic::amdgcn_ds_fmin:
3551     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3552   case Intrinsic::amdgcn_ds_fmax:
3553     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3554   default:
3555     llvm_unreachable("not a DS FP intrinsic");
3556   }
3557 }
3558 
3559 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3560                                                       MachineInstr &MI,
3561                                                       Intrinsic::ID IID) const {
3562   GISelChangeObserver &Observer = Helper.Observer;
3563   Observer.changingInstr(MI);
3564 
3565   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3566 
3567   // The remaining operands were used to set fields in the MemOperand on
3568   // construction.
3569   for (int I = 6; I > 3; --I)
3570     MI.RemoveOperand(I);
3571 
3572   MI.RemoveOperand(1); // Remove the intrinsic ID.
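  // What remains is (dst, ptr, value), matching the operand order of the
  // target atomic opcodes.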
3573   Observer.changedInstr(MI);
3574   return true;
3575 }
3576 
3577 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3578                                             MachineRegisterInfo &MRI,
3579                                             MachineIRBuilder &B) const {
3580   uint64_t Offset =
3581     ST.getTargetLowering()->getImplicitParameterOffset(
3582       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3583   LLT DstTy = MRI.getType(DstReg);
3584   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3585 
3586   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3587   if (!loadInputValue(KernargPtrReg, B,
3588                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3589     return false;
3590 
3591   // FIXME: This should be nuw
3592   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3593   return true;
3594 }
3595 
3596 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3597                                                  MachineRegisterInfo &MRI,
3598                                                  MachineIRBuilder &B) const {
3599   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3600   if (!MFI->isEntryFunction()) {
3601     return legalizePreloadedArgIntrin(MI, MRI, B,
3602                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3603   }
3604 
3605   Register DstReg = MI.getOperand(0).getReg();
3606   if (!getImplicitArgPtr(DstReg, MRI, B))
3607     return false;
3608 
3609   MI.eraseFromParent();
3610   return true;
3611 }
3612 
3613 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3614                                               MachineRegisterInfo &MRI,
3615                                               MachineIRBuilder &B,
3616                                               unsigned AddrSpace) const {
3617   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3618   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3619   Register Hi32 = Unmerge.getReg(1);
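  // A flat pointer lies in the queried segment iff the high 32 bits of its
  // address match that segment's aperture base.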
3620 
3621   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3622   MI.eraseFromParent();
3623   return true;
3624 }
3625 
3626 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3627 // offset (the offset that is included in bounds checking and swizzling, to be
3628 // split between the instruction's voffset and immoffset fields) and soffset
3629 // (the offset that is excluded from bounds checking and swizzling, to go in
3630 // the instruction's soffset field).  This function takes the first kind of
3631 // offset and figures out how to split it between voffset and immoffset.
3632 std::tuple<Register, unsigned, unsigned>
3633 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3634                                         Register OrigOffset) const {
3635   const unsigned MaxImm = 4095;
3636   Register BaseReg;
3637   unsigned TotalConstOffset;
3638   const LLT S32 = LLT::scalar(32);
3639   MachineRegisterInfo &MRI = *B.getMRI();
3640 
3641   std::tie(BaseReg, TotalConstOffset) =
3642       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3643 
3644   unsigned ImmOffset = TotalConstOffset;
3645 
3646   // If BaseReg is a pointer, convert it to int.
3647   if (MRI.getType(BaseReg).isPointer())
3648     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3649 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits there so that the value copied/added into the voffset field is
  // a multiple of 4096, and it stands a better chance of being CSEd with the
  // copy/add for another similar load/store.
  // However, do not do that rounding down if the resulting multiple of 4096 is
  // negative, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
3657   unsigned Overflow = ImmOffset & ~MaxImm;
3658   ImmOffset -= Overflow;
3659   if ((int32_t)Overflow < 0) {
3660     Overflow += ImmOffset;
3661     ImmOffset = 0;
3662   }
3663 
3664   if (Overflow != 0) {
3665     if (!BaseReg) {
3666       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3667     } else {
3668       auto OverflowVal = B.buildConstant(S32, Overflow);
3669       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3670     }
3671   }
3672 
3673   if (!BaseReg)
3674     BaseReg = B.buildConstant(S32, 0).getReg(0);
3675 
3676   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3677 }
3678 
3679 /// Handle register layout difference for f16 images for some subtargets.
3680 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3681                                              MachineRegisterInfo &MRI,
3682                                              Register Reg,
3683                                              bool ImageStore) const {
3684   const LLT S16 = LLT::scalar(16);
3685   const LLT S32 = LLT::scalar(32);
3686   LLT StoreVT = MRI.getType(Reg);
3687   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
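  // With unpacked D16, every 16-bit element occupies the low half of its own
  // 32-bit register, so <N x s16> data is any-extended element-wise to
  // <N x s32>.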
3688 
3689   if (ST.hasUnpackedD16VMem()) {
3690     auto Unmerge = B.buildUnmerge(S16, Reg);
3691 
3692     SmallVector<Register, 4> WideRegs;
3693     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3694       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3695 
3696     int NumElts = StoreVT.getNumElements();
3697 
3698     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3699   }
3700 
3701   if (ImageStore && ST.hasImageStoreD16Bug()) {
3702     if (StoreVT.getNumElements() == 2) {
3703       SmallVector<Register, 4> PackedRegs;
3704       Reg = B.buildBitcast(S32, Reg).getReg(0);
3705       PackedRegs.push_back(Reg);
3706       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3707       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3708     }
3709 
3710     if (StoreVT.getNumElements() == 3) {
3711       SmallVector<Register, 4> PackedRegs;
3712       auto Unmerge = B.buildUnmerge(S16, Reg);
3713       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3714         PackedRegs.push_back(Unmerge.getReg(I));
3715       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3716       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3717       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3718     }
3719 
3720     if (StoreVT.getNumElements() == 4) {
3721       SmallVector<Register, 4> PackedRegs;
3722       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3723       auto Unmerge = B.buildUnmerge(S32, Reg);
3724       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3725         PackedRegs.push_back(Unmerge.getReg(I));
3726       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3727       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3728     }
3729 
3730     llvm_unreachable("invalid data type");
3731   }
3732 
3733   return Reg;
3734 }
3735 
3736 Register AMDGPULegalizerInfo::fixStoreSourceType(
3737   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3738   MachineRegisterInfo *MRI = B.getMRI();
3739   LLT Ty = MRI->getType(VData);
3740 
3741   const LLT S16 = LLT::scalar(16);
3742 
  // Fixup illegal register types for i8 and i16 stores.
3744   if (Ty == LLT::scalar(8) || Ty == S16) {
3745     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3746     return AnyExt;
3747   }
3748 
3749   if (Ty.isVector()) {
3750     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3751       if (IsFormat)
3752         return handleD16VData(B, *MRI, VData);
3753     }
3754   }
3755 
3756   return VData;
3757 }
3758 
3759 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3760                                               MachineRegisterInfo &MRI,
3761                                               MachineIRBuilder &B,
3762                                               bool IsTyped,
3763                                               bool IsFormat) const {
3764   Register VData = MI.getOperand(1).getReg();
3765   LLT Ty = MRI.getType(VData);
3766   LLT EltTy = Ty.getScalarType();
3767   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3768   const LLT S32 = LLT::scalar(32);
3769 
3770   VData = fixStoreSourceType(B, VData, IsFormat);
3771   Register RSrc = MI.getOperand(2).getReg();
3772 
3773   MachineMemOperand *MMO = *MI.memoperands_begin();
3774   const int MemSize = MMO->getSize();
3775 
3776   unsigned ImmOffset;
3777   unsigned TotalOffset;
3778 
3779   // The typed intrinsics add an immediate after the registers.
3780   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3781 
3782   // The struct intrinsic variants add one additional operand over raw.
3783   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3784   Register VIndex;
3785   int OpOffset = 0;
3786   if (HasVIndex) {
3787     VIndex = MI.getOperand(3).getReg();
3788     OpOffset = 1;
3789   }
3790 
3791   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3792   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3793 
3794   unsigned Format = 0;
3795   if (IsTyped) {
3796     Format = MI.getOperand(5 + OpOffset).getImm();
3797     ++OpOffset;
3798   }
3799 
3800   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3801 
3802   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3803   if (TotalOffset != 0)
3804     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3805 
3806   unsigned Opc;
3807   if (IsTyped) {
3808     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3809                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3810   } else if (IsFormat) {
3811     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3812                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3813   } else {
3814     switch (MemSize) {
3815     case 1:
3816       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3817       break;
3818     case 2:
3819       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3820       break;
3821     default:
3822       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3823       break;
3824     }
3825   }
3826 
3827   if (!VIndex)
3828     VIndex = B.buildConstant(S32, 0).getReg(0);
3829 
3830   auto MIB = B.buildInstr(Opc)
3831     .addUse(VData)              // vdata
3832     .addUse(RSrc)               // rsrc
3833     .addUse(VIndex)             // vindex
3834     .addUse(VOffset)            // voffset
3835     .addUse(SOffset)            // soffset
3836     .addImm(ImmOffset);         // offset(imm)
3837 
3838   if (IsTyped)
3839     MIB.addImm(Format);
3840 
3841   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3842      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3843      .addMemOperand(MMO);
3844 
3845   MI.eraseFromParent();
3846   return true;
3847 }
3848 
3849 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3850                                              MachineRegisterInfo &MRI,
3851                                              MachineIRBuilder &B,
3852                                              bool IsFormat,
3853                                              bool IsTyped) const {
3854   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3855   MachineMemOperand *MMO = *MI.memoperands_begin();
3856   const int MemSize = MMO->getSize();
3857   const LLT S32 = LLT::scalar(32);
3858 
3859   Register Dst = MI.getOperand(0).getReg();
3860   Register RSrc = MI.getOperand(2).getReg();
3861 
3862   // The typed intrinsics add an immediate after the registers.
3863   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3864 
3865   // The struct intrinsic variants add one additional operand over raw.
3866   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3867   Register VIndex;
3868   int OpOffset = 0;
3869   if (HasVIndex) {
3870     VIndex = MI.getOperand(3).getReg();
3871     OpOffset = 1;
3872   }
3873 
3874   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3875   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3876 
3877   unsigned Format = 0;
3878   if (IsTyped) {
3879     Format = MI.getOperand(5 + OpOffset).getImm();
3880     ++OpOffset;
3881   }
3882 
3883   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3884   unsigned ImmOffset;
3885   unsigned TotalOffset;
3886 
3887   LLT Ty = MRI.getType(Dst);
3888   LLT EltTy = Ty.getScalarType();
3889   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3890   const bool Unpacked = ST.hasUnpackedD16VMem();
3891 
3892   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3893   if (TotalOffset != 0)
3894     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3895 
3896   unsigned Opc;
3897 
3898   if (IsTyped) {
3899     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3900                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3901   } else if (IsFormat) {
3902     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3903                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3904   } else {
3905     switch (MemSize) {
3906     case 1:
3907       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3908       break;
3909     case 2:
3910       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3911       break;
3912     default:
3913       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3914       break;
3915     }
3916   }
3917 
3918   Register LoadDstReg;
3919 
3920   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3921   LLT UnpackedTy = Ty.changeElementSize(32);
3922 
3923   if (IsExtLoad)
3924     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3925   else if (Unpacked && IsD16 && Ty.isVector())
3926     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3927   else
3928     LoadDstReg = Dst;
3929 
3930   if (!VIndex)
3931     VIndex = B.buildConstant(S32, 0).getReg(0);
3932 
3933   auto MIB = B.buildInstr(Opc)
3934     .addDef(LoadDstReg)         // vdata
3935     .addUse(RSrc)               // rsrc
3936     .addUse(VIndex)             // vindex
3937     .addUse(VOffset)            // voffset
3938     .addUse(SOffset)            // soffset
3939     .addImm(ImmOffset);         // offset(imm)
3940 
3941   if (IsTyped)
3942     MIB.addImm(Format);
3943 
3944   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3945      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3946      .addMemOperand(MMO);
3947 
3948   if (LoadDstReg != Dst) {
3949     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3950 
    // The result of an extending load was widened; truncate back to the
    // original type.
3952     if (IsExtLoad)
3953       B.buildTrunc(Dst, LoadDstReg);
3954     else {
3955       // Repack to original 16-bit vector result
3956       // FIXME: G_TRUNC should work, but legalization currently fails
3957       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3958       SmallVector<Register, 4> Repack;
3959       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3960         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3961       B.buildMerge(Dst, Repack);
3962     }
3963   }
3964 
3965   MI.eraseFromParent();
3966   return true;
3967 }
3968 
3969 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3970                                                MachineIRBuilder &B,
3971                                                bool IsInc) const {
3972   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3973                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3974   B.buildInstr(Opc)
3975     .addDef(MI.getOperand(0).getReg())
3976     .addUse(MI.getOperand(2).getReg())
3977     .addUse(MI.getOperand(3).getReg())
3978     .cloneMemRefs(MI);
3979   MI.eraseFromParent();
3980   return true;
3981 }
3982 
3983 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3984   switch (IntrID) {
3985   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3986   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3987     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3988   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3989   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3990     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3991   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3992   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3993     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3994   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3995   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3996     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3997   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3998   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3999     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4000   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4001   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4002     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4003   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4004   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4005     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4006   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4007   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4008     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4009   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4010   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4011     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4012   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4013   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4014     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4015   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4016   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4017     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4018   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4019   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4020     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4021   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4022   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4023     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4024   case Intrinsic::amdgcn_buffer_atomic_fadd:
4025   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4027     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4028   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4029   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4030     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4031   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4032   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4033     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4034   default:
4035     llvm_unreachable("unhandled atomic opcode");
4036   }
4037 }
4038 
4039 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4040                                                MachineIRBuilder &B,
4041                                                Intrinsic::ID IID) const {
4042   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4043                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4044   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4045 
4046   Register Dst;
4047 
4048   int OpOffset = 0;
4049   if (HasReturn) {
4050     // A few FP atomics do not support return values.
4051     Dst = MI.getOperand(0).getReg();
4052   } else {
4053     OpOffset = -1;
4054   }
4055 
4056   Register VData = MI.getOperand(2 + OpOffset).getReg();
4057   Register CmpVal;
4058 
4059   if (IsCmpSwap) {
4060     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4061     ++OpOffset;
4062   }
4063 
4064   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4065   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4066 
4067   // The struct intrinsic variants add one additional operand over raw.
4068   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4069   Register VIndex;
4070   if (HasVIndex) {
4071     VIndex = MI.getOperand(4 + OpOffset).getReg();
4072     ++OpOffset;
4073   }
4074 
4075   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4076   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4077   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4078 
4079   MachineMemOperand *MMO = *MI.memoperands_begin();
4080 
4081   unsigned ImmOffset;
4082   unsigned TotalOffset;
4083   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4084   if (TotalOffset != 0)
4085     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4086 
4087   if (!VIndex)
4088     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4089 
4090   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4091 
4092   if (HasReturn)
4093     MIB.addDef(Dst);
4094 
4095   MIB.addUse(VData); // vdata
4096 
4097   if (IsCmpSwap)
4098     MIB.addReg(CmpVal);
4099 
4100   MIB.addUse(RSrc)               // rsrc
4101      .addUse(VIndex)             // vindex
4102      .addUse(VOffset)            // voffset
4103      .addUse(SOffset)            // soffset
4104      .addImm(ImmOffset)          // offset(imm)
4105      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4106      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4107      .addMemOperand(MMO);
4108 
4109   MI.eraseFromParent();
4110   return true;
4111 }
4112 
4113 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4114 /// vector with s16 typed elements.
4115 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4116                                       SmallVectorImpl<Register> &PackedAddrs,
4117                                       unsigned ArgOffset,
4118                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4119                                       bool IsA16, bool IsG16) {
4120   const LLT S16 = LLT::scalar(16);
4121   const LLT V2S16 = LLT::vector(2, 16);
4122   auto EndIdx = Intr->VAddrEnd;
4123 
4124   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4125     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4126     if (!SrcOp.isReg())
4127       continue; // _L to _LZ may have eliminated this.
4128 
4129     Register AddrReg = SrcOp.getReg();
4130 
4131     if (I < Intr->GradientStart) {
4132       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4133       PackedAddrs.push_back(AddrReg);
4134     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4135                (I >= Intr->CoordStart && !IsA16)) {
4136       // Handle any gradient or coordinate operands that should not be packed
4137       PackedAddrs.push_back(AddrReg);
4138     } else {
4139       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4140       // derivatives dx/dh and dx/dv are packed with undef.
4141       if (((I + 1) >= EndIdx) ||
4142           ((Intr->NumGradients / 2) % 2 == 1 &&
4143            (I == static_cast<unsigned>(Intr->GradientStart +
4144                                        (Intr->NumGradients / 2) - 1) ||
4145             I == static_cast<unsigned>(Intr->GradientStart +
4146                                        Intr->NumGradients - 1))) ||
4147           // Check for _L to _LZ optimization
4148           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4149         PackedAddrs.push_back(
4150             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4151                 .getReg(0));
4152       } else {
4153         PackedAddrs.push_back(
4154             B.buildBuildVector(
4155                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4156                 .getReg(0));
4157         ++I;
4158       }
4159     }
4160   }
4161 }
4162 
4163 /// Convert from separate vaddr components to a single vector address register,
4164 /// and replace the remaining operands with $noreg.
4165 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4166                                      int DimIdx, int NumVAddrs) {
4167   const LLT S32 = LLT::scalar(32);
4168 
4169   SmallVector<Register, 8> AddrRegs;
4170   for (int I = 0; I != NumVAddrs; ++I) {
4171     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4172     if (SrcOp.isReg()) {
4173       AddrRegs.push_back(SrcOp.getReg());
4174       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4175     }
4176   }
4177 
4178   int NumAddrRegs = AddrRegs.size();
4179   if (NumAddrRegs != 1) {
4180     // Round up to 8 elements for v5-v7
4181     // FIXME: Missing intermediate sized register classes and instructions.
4182     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4183       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4184       auto Undef = B.buildUndef(S32);
4185       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4186       NumAddrRegs = RoundedNumRegs;
4187     }
4188 
4189     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4190     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4191   }
4192 
4193   for (int I = 1; I != NumVAddrs; ++I) {
4194     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4195     if (SrcOp.isReg())
4196       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4197   }
4198 }
4199 
4200 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4201 ///
4202 /// Depending on the subtarget, load/store with 16-bit element data need to be
4203 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4204 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4205 /// registers.
4206 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
4213 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4214     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4215     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4216 
4217   const unsigned NumDefs = MI.getNumExplicitDefs();
4218   const unsigned ArgOffset = NumDefs + 1;
4219   bool IsTFE = NumDefs == 2;
4220   // We are only processing the operands of d16 image operations on subtargets
4221   // that use the unpacked register layout, or need to repack the TFE result.
4222 
4223   // TODO: Do we need to guard against already legalized intrinsics?
4224   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4225       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4226 
4227   MachineRegisterInfo *MRI = B.getMRI();
4228   const LLT S32 = LLT::scalar(32);
4229   const LLT S16 = LLT::scalar(16);
4230   const LLT V2S16 = LLT::vector(2, 16);
4231 
4232   unsigned DMask = 0;
4233 
4234   // Check for 16 bit addresses and pack if true.
4235   LLT GradTy =
4236       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4237   LLT AddrTy =
4238       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4239   const bool IsG16 = GradTy == S16;
4240   const bool IsA16 = AddrTy == S16;
4241 
4242   int DMaskLanes = 0;
4243   if (!BaseOpcode->Atomic) {
4244     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4245     if (BaseOpcode->Gather4) {
4246       DMaskLanes = 4;
4247     } else if (DMask != 0) {
4248       DMaskLanes = countPopulation(DMask);
4249     } else if (!IsTFE && !BaseOpcode->Store) {
4250       // If dmask is 0, this is a no-op load. This can be eliminated.
4251       B.buildUndef(MI.getOperand(0));
4252       MI.eraseFromParent();
4253       return true;
4254     }
4255   }
4256 
4257   Observer.changingInstr(MI);
4258   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4259 
4260   unsigned NewOpcode = NumDefs == 0 ?
4261     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4262 
4263   // Track that we legalized this
4264   MI.setDesc(B.getTII().get(NewOpcode));
4265 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4268   if (IsTFE && DMask == 0) {
4269     DMask = 0x1;
4270     DMaskLanes = 1;
4271     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4272   }
4273 
4274   if (BaseOpcode->Atomic) {
4275     Register VData0 = MI.getOperand(2).getReg();
4276     LLT Ty = MRI->getType(VData0);
4277 
4278     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4279     if (Ty.isVector())
4280       return false;
4281 
4282     if (BaseOpcode->AtomicX2) {
4283       Register VData1 = MI.getOperand(3).getReg();
4284       // The two values are packed in one register.
4285       LLT PackedTy = LLT::vector(2, Ty);
4286       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4287       MI.getOperand(2).setReg(Concat.getReg(0));
4288       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4289     }
4290   }
4291 
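  // The number of vaddr operands expected after the LOD optimizations below;
  // this feeds the NSA vs. packed-address decision in the non-16-bit path.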
4292   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4293 
  // Optimize _L to _LZ when the constant LOD is zero (or negative).
4295   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4296           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4297     const ConstantFP *ConstantLod;
4298 
4299     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4300                  m_GFCst(ConstantLod))) {
4301       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4302         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4303         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4304             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4305                                                       Intr->Dim);
4306 
4307         // The starting indexes should remain in the same place.
4308         --CorrectedNumVAddrs;
4309 
4310         MI.getOperand(MI.getNumExplicitDefs())
4311             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4312         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4313         Intr = NewImageDimIntr;
4314       }
4315     }
4316   }
4317 
  // Optimize _mip away when 'lod' is zero.
4319   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4320     int64_t ConstantLod;
4321     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4322                  m_ICst(ConstantLod))) {
4323       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4326         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4327         --CorrectedNumVAddrs;
4328       }
4329     }
4330   }
4331 
4332   // Rewrite the addressing register layout before doing anything else.
4333   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but are tied to the A16 control, so
    // both gradients and addresses must be 16-bit.
4336     return false;
4337   }
4338 
4339   if (IsA16 && !ST.hasA16()) {
4340     // A16 not supported
4341     return false;
4342   }
4343 
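  // Pack the 16-bit address and gradient components into dwords. When the NSA
  // encoding is not used, the packed dwords are then concatenated into a
  // single contiguous vector operand.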
4344   if (IsA16 || IsG16) {
4345     if (Intr->NumVAddrs > 1) {
4346       SmallVector<Register, 4> PackedRegs;
4347 
4348       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4349                                 IsG16);
4350 
4351       // See also below in the non-a16 branch
4352       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4353 
4354       if (!UseNSA && PackedRegs.size() > 1) {
4355         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4356         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4357         PackedRegs[0] = Concat.getReg(0);
4358         PackedRegs.resize(1);
4359       }
4360 
4361       const unsigned NumPacked = PackedRegs.size();
4362       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4363         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4364         if (!SrcOp.isReg()) {
4365           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4366           continue;
4367         }
4368 
4369         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4370 
4371         if (I - Intr->VAddrStart < NumPacked)
4372           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4373         else
4374           SrcOp.setReg(AMDGPU::NoRegister);
4375       }
4376     }
4377   } else {
4378     // If the register allocator cannot place the address registers contiguously
4379     // without introducing moves, then using the non-sequential address encoding
4380     // is always preferable, since it saves VALU instructions and is usually a
4381     // wash in terms of code size or even better.
4382     //
4383     // However, we currently have no way of hinting to the register allocator
4384     // that MIMG addresses should be placed contiguously when it is possible to
4385     // do so, so force non-NSA for the common 2-address case as a heuristic.
4386     //
4387     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4388     // allocation when possible.
4389     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4390 
4391     if (!UseNSA && Intr->NumVAddrs > 1)
4392       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4393                                Intr->NumVAddrs);
4394   }
4395 
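  // Append a trailing immediate recording whether A16 (bit 0) and G16 (bit 1)
  // are in use, for later instruction selection.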
4396   int Flags = 0;
4397   if (IsA16)
4398     Flags |= 1;
4399   if (IsG16)
4400     Flags |= 2;
4401   MI.addOperand(MachineOperand::CreateImm(Flags));
4402 
4403   if (BaseOpcode->Store) { // No TFE for stores?
4404     // TODO: Handle dmask trim
4405     Register VData = MI.getOperand(1).getReg();
4406     LLT Ty = MRI->getType(VData);
4407     if (!Ty.isVector() || Ty.getElementType() != S16)
4408       return true;
4409 
4410     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4411     if (RepackedReg != VData) {
4412       MI.getOperand(1).setReg(RepackedReg);
4413     }
4414 
4415     return true;
4416   }
4417 
4418   Register DstReg = MI.getOperand(0).getReg();
4419   LLT Ty = MRI->getType(DstReg);
4420   const LLT EltTy = Ty.getScalarType();
4421   const bool IsD16 = Ty.getScalarType() == S16;
4422   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4423 
4424   // Confirm that the return type is large enough for the dmask specified
4425   if (NumElts < DMaskLanes)
4426     return false;
4427 
4428   if (NumElts > 4 || DMaskLanes > 4)
4429     return false;
4430 
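  // Compute the type the instruction actually produces: only DMaskLanes
  // components are written, regardless of the declared result type.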
4431   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4432   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4433 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4437   LLT RoundedTy;
4438 
  // S32 vector to cover all data, plus the TFE result element.
4440   LLT TFETy;
4441 
4442   // Register type to use for each loaded component. Will be S32 or V2S16.
4443   LLT RegTy;
4444 
4445   if (IsD16 && ST.hasUnpackedD16VMem()) {
4446     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4447     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4448     RegTy = S32;
4449   } else {
4450     unsigned EltSize = EltTy.getSizeInBits();
4451     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4452     unsigned RoundedSize = 32 * RoundedElts;
4453     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4454     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4455     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4456   }
4457 
4458   // The return type does not need adjustment.
4459   // TODO: Should we change s16 case to s32 or <2 x s16>?
4460   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4461     return true;
4462 
4463   Register Dst1Reg;
4464 
4465   // Insert after the instruction.
4466   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4467 
4468   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4469   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4470   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4471   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4472 
4473   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4474 
4475   MI.getOperand(0).setReg(NewResultReg);
4476 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
4479   // register, with one additional dword beyond the loaded data. Rewrite the
4480   // return type to use a single register result.
4481 
4482   if (IsTFE) {
4483     Dst1Reg = MI.getOperand(1).getReg();
4484     if (MRI->getType(Dst1Reg) != S32)
4485       return false;
4486 
4487     // TODO: Make sure the TFE operand bit is set.
4488     MI.RemoveOperand(1);
4489 
4490     // Handle the easy case that requires no repack instructions.
4491     if (Ty == S32) {
4492       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4493       return true;
4494     }
4495   }
4496 
4497   // Now figure out how to copy the new result register back into the old
4498   // result.
4499   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4500 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4502 
4503   if (ResultNumRegs == 1) {
4504     assert(!IsTFE);
4505     ResultRegs[0] = NewResultReg;
4506   } else {
4507     // We have to repack into a new vector of some kind.
4508     for (int I = 0; I != NumDataRegs; ++I)
4509       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4510     B.buildUnmerge(ResultRegs, NewResultReg);
4511 
4512     // Drop the final TFE element to get the data part. The TFE result is
4513     // directly written to the right place already.
4514     if (IsTFE)
4515       ResultRegs.resize(NumDataRegs);
4516   }
4517 
4518   // For an s16 scalar result, we form an s32 result with a truncate regardless
4519   // of packed vs. unpacked.
4520   if (IsD16 && !Ty.isVector()) {
4521     B.buildTrunc(DstReg, ResultRegs[0]);
4522     return true;
4523   }
4524 
4525   // Avoid a build/concat_vector of 1 entry.
4526   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4527     B.buildBitcast(DstReg, ResultRegs[0]);
4528     return true;
4529   }
4530 
4531   assert(Ty.isVector());
4532 
4533   if (IsD16) {
4534     // For packed D16 results with TFE enabled, all the data components are
4535     // S32. Cast back to the expected type.
4536     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4539     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4540       for (Register &Reg : ResultRegs)
4541         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4542     } else if (ST.hasUnpackedD16VMem()) {
4543       for (Register &Reg : ResultRegs)
4544         Reg = B.buildTrunc(S16, Reg).getReg(0);
4545     }
4546   }
4547 
4548   auto padWithUndef = [&](LLT Ty, int NumElts) {
4549     if (NumElts == 0)
4550       return;
4551     Register Undef = B.buildUndef(Ty).getReg(0);
4552     for (int I = 0; I != NumElts; ++I)
4553       ResultRegs.push_back(Undef);
4554   };
4555 
4556   // Pad out any elements eliminated due to the dmask.
4557   LLT ResTy = MRI->getType(ResultRegs[0]);
4558   if (!ResTy.isVector()) {
4559     padWithUndef(ResTy, NumElts - ResultRegs.size());
4560     B.buildBuildVector(DstReg, ResultRegs);
4561     return true;
4562   }
4563 
4564   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4565   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4566 
4567   // Deal with the one annoying legal case.
4568   const LLT V3S16 = LLT::vector(3, 16);
4569   if (Ty == V3S16) {
4570     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4571     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4572     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4573     return true;
4574   }
4575 
4576   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4577   B.buildConcatVectors(DstReg, ResultRegs);
4578   return true;
4579 }
4580 
4581 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4582   LegalizerHelper &Helper, MachineInstr &MI) const {
4583   MachineIRBuilder &B = Helper.MIRBuilder;
4584   GISelChangeObserver &Observer = Helper.Observer;
4585 
4586   Register Dst = MI.getOperand(0).getReg();
4587   LLT Ty = B.getMRI()->getType(Dst);
4588   unsigned Size = Ty.getSizeInBits();
4589   MachineFunction &MF = B.getMF();
4590 
4591   Observer.changingInstr(MI);
4592 
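  // If the result type is better handled as its dword-based equivalent,
  // bitcast the destination before rewriting the load.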
4593   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4594     Ty = getBitcastRegisterType(Ty);
4595     Helper.bitcastDst(MI, Ty, 0);
4596     Dst = MI.getOperand(0).getReg();
4597     B.setInsertPt(B.getMBB(), MI);
4598   }
4599 
4600   // FIXME: We don't really need this intermediate instruction. The intrinsic
4601   // should be fixed to have a memory operand. Since it's readnone, we're not
4602   // allowed to add one.
4603   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4604   MI.RemoveOperand(1); // Remove intrinsic ID
4605 
4606   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4607   // TODO: Should this use datalayout alignment?
4608   const unsigned MemSize = (Size + 7) / 8;
4609   const Align MemAlign(4);
4610   MachineMemOperand *MMO = MF.getMachineMemOperand(
4611       MachinePointerInfo(),
4612       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4613           MachineMemOperand::MOInvariant,
4614       MemSize, MemAlign);
4615   MI.addMemOperand(MF, MMO);
4616 
4617   // There are no 96-bit result scalar loads, but widening to 128-bit should
4618   // always be legal. We may need to restore this to a 96-bit result if it turns
4619   // out this needs to be converted to a vector load during RegBankSelect.
4620   if (!isPowerOf2_32(Size)) {
4621     if (Ty.isVector())
4622       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4623     else
4624       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4625   }
4626 
4627   Observer.changedInstr(MI);
4628   return true;
4629 }
4630 
4631 // TODO: Move to selection
4632 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4633                                                 MachineRegisterInfo &MRI,
4634                                                 MachineIRBuilder &B) const {
4635   if (!ST.isTrapHandlerEnabled() ||
4636       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4637     return legalizeTrapEndpgm(MI, MRI, B);
4638 
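  // HSA ABI V2 and V3 always route the queue pointer to the trap handler; V4
  // only needs it when the doorbell ID cannot be retrieved directly.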
4639   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4640     switch (*HsaAbiVer) {
4641     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4642     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4643       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4644     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4645       return ST.supportsGetDoorbellID() ?
4646           legalizeTrapHsa(MI, MRI, B) :
4647           legalizeTrapHsaQueuePtr(MI, MRI, B);
4648     }
4649   }
4650 
4651   llvm_unreachable("Unknown trap handler");
4652 }
4653 
4654 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4655     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4656   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4657   MI.eraseFromParent();
4658   return true;
4659 }
4660 
4661 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4662     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input, then insert the
  // trap instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4665   Register LiveIn =
4666     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4667   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4668     return false;
4669 
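  // The trap handler expects the queue pointer to be passed in SGPR0_SGPR1.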
4670   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4671   B.buildCopy(SGPR01, LiveIn);
4672   B.buildInstr(AMDGPU::S_TRAP)
4673       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4674       .addReg(SGPR01, RegState::Implicit);
4675 
4676   MI.eraseFromParent();
4677   return true;
4678 }
4679 
4680 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4681     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4682   B.buildInstr(AMDGPU::S_TRAP)
4683       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4684   MI.eraseFromParent();
4685   return true;
4686 }
4687 
4688 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4689     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4692   if (!ST.isTrapHandlerEnabled() ||
4693       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4694     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4695                                      "debugtrap handler not supported",
4696                                      MI.getDebugLoc(), DS_Warning);
4697     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4698     Ctx.diagnose(NoTrap);
4699   } else {
4700     // Insert debug-trap instruction
4701     B.buildInstr(AMDGPU::S_TRAP)
4702         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4703   }
4704 
4705   MI.eraseFromParent();
4706   return true;
4707 }
4708 
4709 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4710                                                MachineIRBuilder &B) const {
4711   MachineRegisterInfo &MRI = *B.getMRI();
4712   const LLT S16 = LLT::scalar(16);
4713   const LLT S32 = LLT::scalar(32);
4714 
4715   Register DstReg = MI.getOperand(0).getReg();
4716   Register NodePtr = MI.getOperand(2).getReg();
4717   Register RayExtent = MI.getOperand(3).getReg();
4718   Register RayOrigin = MI.getOperand(4).getReg();
4719   Register RayDir = MI.getOperand(5).getReg();
4720   Register RayInvDir = MI.getOperand(6).getReg();
4721   Register TDescr = MI.getOperand(7).getReg();
4722 
4723   if (!ST.hasGFX10_AEncoding()) {
4724     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4725                                         "intrinsic not supported on subtarget",
4726                                         MI.getDebugLoc());
4727     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4728     return false;
4729   }
4730 
4731   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4733   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4734                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4735                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4736                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4737 
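  // Build the flat vaddr operand list: the node pointer (split into dwords if
  // 64-bit), the ray extent, then the ray origin, direction, and inverse
  // direction lanes packed as dwords.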
4738   SmallVector<Register, 12> Ops;
4739   if (Is64) {
4740     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4741     Ops.push_back(Unmerge.getReg(0));
4742     Ops.push_back(Unmerge.getReg(1));
4743   } else {
4744     Ops.push_back(NodePtr);
4745   }
4746   Ops.push_back(RayExtent);
4747 
4748   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4749     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4750     Ops.push_back(Unmerge.getReg(0));
4751     Ops.push_back(Unmerge.getReg(1));
4752     Ops.push_back(Unmerge.getReg(2));
4753   };
4754 
4755   packLanes(RayOrigin);
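  // With 16-bit rays, the direction and inverse direction components are
  // packed pairwise into three 32-bit registers.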
4756   if (IsA16) {
4757     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4758     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4759     Register R1 = MRI.createGenericVirtualRegister(S32);
4760     Register R2 = MRI.createGenericVirtualRegister(S32);
4761     Register R3 = MRI.createGenericVirtualRegister(S32);
4762     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4763     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4764     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4765     Ops.push_back(R1);
4766     Ops.push_back(R2);
4767     Ops.push_back(R3);
4768   } else {
4769     packLanes(RayDir);
4770     packLanes(RayInvDir);
4771   }
4772 
4773   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4774     .addDef(DstReg)
4775     .addImm(Opcode);
4776 
4777   for (Register R : Ops) {
4778     MIB.addUse(R);
4779   }
4780 
4781   MIB.addUse(TDescr)
4782      .addImm(IsA16 ? 1 : 0)
4783      .cloneMemRefs(MI);
4784 
4785   MI.eraseFromParent();
4786   return true;
4787 }
4788 
4789 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4790                                             MachineInstr &MI) const {
4791   MachineIRBuilder &B = Helper.MIRBuilder;
4792   MachineRegisterInfo &MRI = *B.getMRI();
4793 
  // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
4795   auto IntrID = MI.getIntrinsicID();
4796   switch (IntrID) {
4797   case Intrinsic::amdgcn_if:
4798   case Intrinsic::amdgcn_else: {
4799     MachineInstr *Br = nullptr;
4800     MachineBasicBlock *UncondBrTarget = nullptr;
4801     bool Negated = false;
4802     if (MachineInstr *BrCond =
4803             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4804       const SIRegisterInfo *TRI
4805         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4806 
4807       Register Def = MI.getOperand(1).getReg();
4808       Register Use = MI.getOperand(3).getReg();
4809 
4810       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4811 
4812       if (Negated)
4813         std::swap(CondBrTarget, UncondBrTarget);
4814 
4815       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4816       if (IntrID == Intrinsic::amdgcn_if) {
4817         B.buildInstr(AMDGPU::SI_IF)
4818           .addDef(Def)
4819           .addUse(Use)
4820           .addMBB(UncondBrTarget);
4821       } else {
4822         B.buildInstr(AMDGPU::SI_ELSE)
4823             .addDef(Def)
4824             .addUse(Use)
4825             .addMBB(UncondBrTarget);
4826       }
4827 
4828       if (Br) {
4829         Br->getOperand(0).setMBB(CondBrTarget);
4830       } else {
4831         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4832         // since we're swapping branch targets it needs to be reinserted.
4833         // FIXME: IRTranslator should probably not do this
4834         B.buildBr(*CondBrTarget);
4835       }
4836 
4837       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4838       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4839       MI.eraseFromParent();
4840       BrCond->eraseFromParent();
4841       return true;
4842     }
4843 
4844     return false;
4845   }
4846   case Intrinsic::amdgcn_loop: {
4847     MachineInstr *Br = nullptr;
4848     MachineBasicBlock *UncondBrTarget = nullptr;
4849     bool Negated = false;
4850     if (MachineInstr *BrCond =
4851             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4852       const SIRegisterInfo *TRI
4853         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4854 
4855       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4856       Register Reg = MI.getOperand(2).getReg();
4857 
4858       if (Negated)
4859         std::swap(CondBrTarget, UncondBrTarget);
4860 
4861       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4862       B.buildInstr(AMDGPU::SI_LOOP)
4863         .addUse(Reg)
4864         .addMBB(UncondBrTarget);
4865 
4866       if (Br)
4867         Br->getOperand(0).setMBB(CondBrTarget);
4868       else
4869         B.buildBr(*CondBrTarget);
4870 
4871       MI.eraseFromParent();
4872       BrCond->eraseFromParent();
4873       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4874       return true;
4875     }
4876 
4877     return false;
4878   }
4879   case Intrinsic::amdgcn_kernarg_segment_ptr:
4880     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4881       // This only makes sense to call in a kernel, so just lower to null.
4882       B.buildConstant(MI.getOperand(0).getReg(), 0);
4883       MI.eraseFromParent();
4884       return true;
4885     }
4886 
4887     return legalizePreloadedArgIntrin(
4888       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4889   case Intrinsic::amdgcn_implicitarg_ptr:
4890     return legalizeImplicitArgPtr(MI, MRI, B);
4891   case Intrinsic::amdgcn_workitem_id_x:
4892     return legalizePreloadedArgIntrin(MI, MRI, B,
4893                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4894   case Intrinsic::amdgcn_workitem_id_y:
4895     return legalizePreloadedArgIntrin(MI, MRI, B,
4896                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4897   case Intrinsic::amdgcn_workitem_id_z:
4898     return legalizePreloadedArgIntrin(MI, MRI, B,
4899                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4900   case Intrinsic::amdgcn_workgroup_id_x:
4901     return legalizePreloadedArgIntrin(MI, MRI, B,
4902                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4903   case Intrinsic::amdgcn_workgroup_id_y:
4904     return legalizePreloadedArgIntrin(MI, MRI, B,
4905                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4906   case Intrinsic::amdgcn_workgroup_id_z:
4907     return legalizePreloadedArgIntrin(MI, MRI, B,
4908                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4909   case Intrinsic::amdgcn_dispatch_ptr:
4910     return legalizePreloadedArgIntrin(MI, MRI, B,
4911                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4912   case Intrinsic::amdgcn_queue_ptr:
4913     return legalizePreloadedArgIntrin(MI, MRI, B,
4914                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4915   case Intrinsic::amdgcn_implicit_buffer_ptr:
4916     return legalizePreloadedArgIntrin(
4917       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4918   case Intrinsic::amdgcn_dispatch_id:
4919     return legalizePreloadedArgIntrin(MI, MRI, B,
4920                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4921   case Intrinsic::amdgcn_fdiv_fast:
4922     return legalizeFDIVFastIntrin(MI, MRI, B);
4923   case Intrinsic::amdgcn_is_shared:
4924     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4925   case Intrinsic::amdgcn_is_private:
4926     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4927   case Intrinsic::amdgcn_wavefrontsize: {
4928     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4929     MI.eraseFromParent();
4930     return true;
4931   }
4932   case Intrinsic::amdgcn_s_buffer_load:
4933     return legalizeSBufferLoad(Helper, MI);
4934   case Intrinsic::amdgcn_raw_buffer_store:
4935   case Intrinsic::amdgcn_struct_buffer_store:
4936     return legalizeBufferStore(MI, MRI, B, false, false);
4937   case Intrinsic::amdgcn_raw_buffer_store_format:
4938   case Intrinsic::amdgcn_struct_buffer_store_format:
4939     return legalizeBufferStore(MI, MRI, B, false, true);
4940   case Intrinsic::amdgcn_raw_tbuffer_store:
4941   case Intrinsic::amdgcn_struct_tbuffer_store:
4942     return legalizeBufferStore(MI, MRI, B, true, true);
4943   case Intrinsic::amdgcn_raw_buffer_load:
4944   case Intrinsic::amdgcn_struct_buffer_load:
4945     return legalizeBufferLoad(MI, MRI, B, false, false);
4946   case Intrinsic::amdgcn_raw_buffer_load_format:
4947   case Intrinsic::amdgcn_struct_buffer_load_format:
4948     return legalizeBufferLoad(MI, MRI, B, true, false);
4949   case Intrinsic::amdgcn_raw_tbuffer_load:
4950   case Intrinsic::amdgcn_struct_tbuffer_load:
4951     return legalizeBufferLoad(MI, MRI, B, true, true);
4952   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4953   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4954   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4955   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4956   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4957   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4958   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4959   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4960   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4961   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4962   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4963   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4964   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4965   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4966   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4967   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4968   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4969   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4970   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4971   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4972   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4973   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4974   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4975   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4976   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4977   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4978   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4979   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4980   case Intrinsic::amdgcn_buffer_atomic_fadd:
4981   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4982   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4985     return legalizeBufferAtomic(MI, B, IntrID);
4986   case Intrinsic::amdgcn_atomic_inc:
4987     return legalizeAtomicIncDec(MI, B, true);
4988   case Intrinsic::amdgcn_atomic_dec:
4989     return legalizeAtomicIncDec(MI, B, false);
4990   case Intrinsic::trap:
4991     return legalizeTrapIntrinsic(MI, MRI, B);
4992   case Intrinsic::debugtrap:
4993     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4994   case Intrinsic::amdgcn_rsq_clamp:
4995     return legalizeRsqClampIntrinsic(MI, MRI, B);
4996   case Intrinsic::amdgcn_ds_fadd:
4997   case Intrinsic::amdgcn_ds_fmin:
4998   case Intrinsic::amdgcn_ds_fmax:
4999     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5000   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5001     return legalizeBVHIntrinsic(MI, B);
5002   default: {
5003     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5004             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5005       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5006     return true;
5007   }
5008   }
5009 
5010   return true;
5011 }
5012