1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
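// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.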
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
// Round the scalar size in bits up to the next power of two.
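// e.g. s48 -> s64, s96 -> s128.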
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
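// Widen the vector by one element, e.g. <3 x s16> -> <4 x s16>.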
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
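// Reduce the number of elements so each piece fits in 64 bits,
// e.g. <4 x s32> -> <2 x s32>.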
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements to reach the next multiple of 32
// bits.
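// e.g. <3 x s8> (24 bits) -> <4 x s8>, <5 x s16> (80 bits) -> <6 x s16>.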
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
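  // Otherwise use 32-bit elements, e.g. <6 x s16> -> <3 x s32>.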
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 // If we have a truncating store or an extending load with a data size larger
226 // than 32-bits, we need to reduce to a 32-bit type.
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249     // Treat constant and global as identical. SMRD loads are sometimes usable for
250     // global loads (ideally constant address space should be eliminated)
251     // depending on the context. Legality cannot be context dependent, but
252     // RegBankSelect can split the load as necessary depending on the pointer
253     // register bank/uniformity and if the memory is invariant or not written in a
254     // kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // Do not handle extending vector loads.
280   if (Ty.isVector() && MemSize != RegSize)
281     return false;
282 
283   // TODO: We should be able to widen loads if the alignment is high enough, but
284   // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
#endif
290 
291   // Only 1-byte and 2-byte to 32-bit extloads are valid.
292   if (MemSize != RegSize && RegSize != 32)
293     return false;
294 
295   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296     return false;
297 
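  // Only sizes the hardware can access directly are accepted here; anything
  // else is rejected and handled by the splitting/widening rules.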
298   switch (MemSize) {
299   case 8:
300   case 16:
301   case 32:
302   case 64:
303   case 128:
304     break;
305   case 96:
306     if (!ST.hasDwordx3LoadStores())
307       return false;
308     break;
309   case 256:
310   case 512:
311     // These may contextually need to be broken down.
312     break;
313   default:
314     return false;
315   }
316 
317   assert(RegSize >= MemSize);
318 
319   if (AlignBits < MemSize) {
320     const SITargetLowering *TLI = ST.getTargetLowering();
321     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322                                                  Align(AlignBits / 8)))
323       return false;
324   }
325 
326   return true;
327 }
328 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334   if (EnableNewLegality)
335     return false;
336 
337   const unsigned Size = Ty.getSizeInBits();
338   if (Size <= 64)
339     return false;
340   if (!Ty.isVector())
341     return true;
342 
343   LLT EltTy = Ty.getElementType();
344   if (EltTy.isPointer())
345     return true;
346 
347   unsigned EltSize = EltTy.getSizeInBits();
348   return EltSize != 32 && EltSize != 64;
349 }
350 
351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352   const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360                                        const unsigned MemSizeInBits) {
361   const unsigned Size = Ty.getSizeInBits();
362   if (Size != MemSizeInBits)
363     return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
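/// For example, a 96-bit load on a subtarget without 96-bit memory operations
/// can be widened to a 128-bit load if the pointer is 128-bit aligned.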
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375                             unsigned AlignInBits, unsigned AddrSpace,
376                             unsigned Opcode) {
377   // We don't want to widen cases that are naturally legal.
378   if (isPowerOf2_32(SizeInBits))
379     return false;
380 
381   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382   // end up widening these for a scalar load during RegBankSelect, since there
383   // aren't 96-bit scalar loads.
384   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385     return false;
386 
387   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388     return false;
389 
390   // A load is known dereferenceable up to the alignment, so it's legal to widen
391   // to it.
392   //
393   // TODO: Could check dereferenceable for less aligned cases.
394   unsigned RoundedSize = NextPowerOf2(SizeInBits);
395   if (AlignInBits < RoundedSize)
396     return false;
397 
398   // Do not widen if it would introduce a slow unaligned load.
399   const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402              RoundedSize, AddrSpace, Align(AlignInBits / 8),
403              MachineMemOperand::MOLoad, &Fast) &&
404          Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408                             unsigned Opcode) {
409   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410     return false;
411 
412   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413                          Query.MMODescrs[0].AlignInBits,
414                          Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
419   :  ST(ST_) {
420   using namespace TargetOpcode;
421 
422   auto GetAddrSpacePtr = [&TM](unsigned AS) {
423     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424   };
425 
426   const LLT S1 = LLT::scalar(1);
427   const LLT S8 = LLT::scalar(8);
428   const LLT S16 = LLT::scalar(16);
429   const LLT S32 = LLT::scalar(32);
430   const LLT S64 = LLT::scalar(64);
431   const LLT S128 = LLT::scalar(128);
432   const LLT S256 = LLT::scalar(256);
433   const LLT S512 = LLT::scalar(512);
434   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436   const LLT V2S8 = LLT::vector(2, 8);
437   const LLT V2S16 = LLT::vector(2, 16);
438   const LLT V4S16 = LLT::vector(4, 16);
439 
440   const LLT V2S32 = LLT::vector(2, 32);
441   const LLT V3S32 = LLT::vector(3, 32);
442   const LLT V4S32 = LLT::vector(4, 32);
443   const LLT V5S32 = LLT::vector(5, 32);
444   const LLT V6S32 = LLT::vector(6, 32);
445   const LLT V7S32 = LLT::vector(7, 32);
446   const LLT V8S32 = LLT::vector(8, 32);
447   const LLT V9S32 = LLT::vector(9, 32);
448   const LLT V10S32 = LLT::vector(10, 32);
449   const LLT V11S32 = LLT::vector(11, 32);
450   const LLT V12S32 = LLT::vector(12, 32);
451   const LLT V13S32 = LLT::vector(13, 32);
452   const LLT V14S32 = LLT::vector(14, 32);
453   const LLT V15S32 = LLT::vector(15, 32);
454   const LLT V16S32 = LLT::vector(16, 32);
455   const LLT V32S32 = LLT::vector(32, 32);
456 
457   const LLT V2S64 = LLT::vector(2, 64);
458   const LLT V3S64 = LLT::vector(3, 64);
459   const LLT V4S64 = LLT::vector(4, 64);
460   const LLT V5S64 = LLT::vector(5, 64);
461   const LLT V6S64 = LLT::vector(6, 64);
462   const LLT V7S64 = LLT::vector(7, 64);
463   const LLT V8S64 = LLT::vector(8, 64);
464   const LLT V16S64 = LLT::vector(16, 64);
465 
466   std::initializer_list<LLT> AllS32Vectors =
467     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469   std::initializer_list<LLT> AllS64Vectors =
470     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
480   const LLT CodePtr = FlatPtr;
481 
482   const std::initializer_list<LLT> AddrSpaces64 = {
483     GlobalPtr, ConstantPtr, FlatPtr
484   };
485 
486   const std::initializer_list<LLT> AddrSpaces32 = {
487     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488   };
489 
490   const std::initializer_list<LLT> FPTypesBase = {
491     S32, S64
492   };
493 
494   const std::initializer_list<LLT> FPTypes16 = {
495     S32, S64, S16
496   };
497 
498   const std::initializer_list<LLT> FPTypesPK16 = {
499     S32, S64, S16, V2S16
500   };
501 
502   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504   auto &LegacyInfo = getLegacyLegalizerInfo();
505   LegacyInfo.setAction({G_BRCOND, S1},
506                        LegacyLegalizeActions::Legal); // VCC branches
507   LegacyInfo.setAction({G_BRCOND, S32},
508                        LegacyLegalizeActions::Legal); // SCC branches
509 
510   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511   // elements for v3s16
512   getActionDefinitionsBuilder(G_PHI)
513     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514     .legalFor(AllS32Vectors)
515     .legalFor(AllS64Vectors)
516     .legalFor(AddrSpaces64)
517     .legalFor(AddrSpaces32)
518     .legalIf(isPointer(0))
519     .clampScalar(0, S16, S256)
520     .widenScalarToNextPow2(0, 32)
521     .clampMaxNumElements(0, S32, 16)
522     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523     .scalarize(0);
524 
525   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526     // Full set of gfx9 features.
527     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528       .legalFor({S32, S16, V2S16})
529       .clampScalar(0, S16, S32)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32);
533 
534     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535       .legalFor({S32, S16, V2S16}) // Clamp modifier
536       .minScalarOrElt(0, S16)
537       .clampMaxNumElements(0, S16, 2)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32)
540       .lower();
541   } else if (ST.has16BitInsts()) {
542     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543       .legalFor({S32, S16})
544       .clampScalar(0, S16, S32)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547 
    // Technically the saturating operations require clamp bit support, but
    // the clamp bit was introduced at the same time as 16-bit operations.
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551       .legalFor({S32, S16}) // Clamp modifier
552       .minScalar(0, S16)
553       .scalarize(0)
554       .widenScalarToNextPow2(0, 16)
555       .lower();
556 
557     // We're just lowering this, but it helps get a better result to try to
558     // coerce to the desired type first.
559     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560       .minScalar(0, S16)
561       .scalarize(0)
562       .lower();
563   } else {
564     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565       .legalFor({S32})
566       .clampScalar(0, S32, S32)
567       .scalarize(0);
568 
569     if (ST.hasIntClamp()) {
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .legalFor({S32}) // Clamp modifier.
572         .scalarize(0)
573         .minScalarOrElt(0, S32)
574         .lower();
575     } else {
576       // Clamp bit support was added in VI, along with 16-bit operations.
577       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578         .minScalar(0, S32)
579         .scalarize(0)
580         .lower();
581     }
582 
583     // FIXME: DAG expansion gets better results. The widening uses the smaller
584     // range values and goes for the min/max lowering directly.
585     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586       .minScalar(0, S32)
587       .scalarize(0)
588       .lower();
589   }
590 
591   getActionDefinitionsBuilder(
592       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593       .customFor({S32, S64})
594       .clampScalar(0, S32, S64)
595       .widenScalarToNextPow2(0, 32)
596       .scalarize(0);
597 
598   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599                    .legalFor({S32})
600                    .maxScalarOrElt(0, S32);
601 
602   if (ST.hasVOP3PInsts()) {
603     Mulh
604       .clampMaxNumElements(0, S8, 2)
605       .lowerFor({V2S8});
606   }
607 
608   Mulh
609     .scalarize(0)
610     .lower();
611 
612   // Report legal for any types we can handle anywhere. For the cases only legal
613   // on the SALU, RegBankSelect will be able to re-legalize.
614   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616     .clampScalar(0, S32, S64)
617     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619     .widenScalarToNextPow2(0)
620     .scalarize(0);
621 
622   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624     .legalFor({{S32, S1}, {S32, S32}})
625     .minScalar(0, S32)
626     // TODO: .scalarize(0)
627     .lower();
628 
629   getActionDefinitionsBuilder(G_BITCAST)
630     // Don't worry about the size constraint.
631     .legalIf(all(isRegisterType(0), isRegisterType(1)))
632     .lower();
633 
634 
635   getActionDefinitionsBuilder(G_CONSTANT)
636     .legalFor({S1, S32, S64, S16, GlobalPtr,
637                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638     .legalIf(isPointer(0))
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(0);
641 
642   getActionDefinitionsBuilder(G_FCONSTANT)
643     .legalFor({S32, S64, S16})
644     .clampScalar(0, S16, S64);
645 
646   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647       .legalIf(isRegisterType(0))
648       // s1 and s16 are special cases because they have legal operations on
649       // them, but don't really occupy registers in the normal way.
650       .legalFor({S1, S16})
651       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652       .clampScalarOrElt(0, S32, MaxScalar)
653       .widenScalarToNextPow2(0, 32)
654       .clampMaxNumElements(0, S32, 16);
655 
656   LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657                        LegacyLegalizeActions::Legal);
658 
659   // If the amount is divergent, we have to do a wave reduction to get the
660   // maximum value, so this is expanded during RegBankSelect.
661   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662     .legalFor({{PrivatePtr, S32}});
663 
664   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665     .customIf(typeIsNot(0, PrivatePtr));
666 
667   LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668 
669   auto &FPOpActions = getActionDefinitionsBuilder(
670     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671     .legalFor({S32, S64});
672   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673     .customFor({S32, S64});
674   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675     .customFor({S32, S64});
676 
677   if (ST.has16BitInsts()) {
678     if (ST.hasVOP3PInsts())
679       FPOpActions.legalFor({S16, V2S16});
680     else
681       FPOpActions.legalFor({S16});
682 
683     TrigActions.customFor({S16});
684     FDIVActions.customFor({S16});
685   }
686 
687   auto &MinNumMaxNum = getActionDefinitionsBuilder({
688       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690   if (ST.hasVOP3PInsts()) {
691     MinNumMaxNum.customFor(FPTypesPK16)
692       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693       .clampMaxNumElements(0, S16, 2)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else if (ST.has16BitInsts()) {
697     MinNumMaxNum.customFor(FPTypes16)
698       .clampScalar(0, S16, S64)
699       .scalarize(0);
700   } else {
701     MinNumMaxNum.customFor(FPTypesBase)
702       .clampScalar(0, S32, S64)
703       .scalarize(0);
704   }
705 
706   if (ST.hasVOP3PInsts())
707     FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709   FPOpActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   TrigActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   FDIVActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   getActionDefinitionsBuilder({G_FNEG, G_FABS})
722     .legalFor(FPTypesPK16)
723     .clampMaxNumElements(0, S16, 2)
724     .scalarize(0)
725     .clampScalar(0, S16, S64);
726 
727   if (ST.has16BitInsts()) {
728     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729       .legalFor({S32, S64, S16})
730       .scalarize(0)
731       .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735       .scalarize(0)
736       .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741         .legalFor({S32, S64})
742         .scalarize(0)
743         .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     }
750   }
751 
752   getActionDefinitionsBuilder(G_FPTRUNC)
753     .legalFor({{S32, S64}, {S16, S32}})
754     .scalarize(0)
755     .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760     .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763       // Use actual fsub instruction
764       .legalFor({S32})
765       // Must use fadd + fneg
766       .lowerFor({S64, S16, V2S16})
767       .scalarize(0)
768       .clampScalar(0, S32, S64);
769 
770   // Whether this is legal depends on the floating point mode for the function.
771   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773     FMad.customFor({S32, S16});
774   else if (ST.hasMadMacF32Insts())
775     FMad.customFor({S32});
776   else if (ST.hasMadF16())
777     FMad.customFor({S16});
778   FMad.scalarize(0)
779       .lower();
780 
781   auto &FRem = getActionDefinitionsBuilder(G_FREM);
782   if (ST.has16BitInsts()) {
783     FRem.customFor({S16, S32, S64});
784   } else {
785     FRem.minScalar(0, S32)
786         .customFor({S32, S64});
787   }
788   FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793     .legalFor({{V2S16, V2S32}})
794     .clampMaxNumElements(0, S16, 2)
795     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796     // situations (like an invalid implicit use), we don't want to infinite loop
797     // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803                {S32, S1}, {S64, S1}, {S16, S1}})
804     .scalarize(0)
805     .clampScalar(0, S32, S64)
806     .widenScalarToNextPow2(1, 32);
807 
808   // TODO: Split s1->s64 during regbankselect for VALU.
809   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811     .lowerFor({{S32, S64}})
812     .lowerIf(typeIs(1, S1))
813     .customFor({{S64, S64}});
814   if (ST.has16BitInsts())
815     IToFP.legalFor({{S16, S16}});
816   IToFP.clampScalar(1, S32, S64)
817        .minScalar(0, S32)
818        .scalarize(0)
819        .widenScalarToNextPow2(1);
820 
821   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823     .customFor({{S64, S32}, {S64, S64}})
824     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825   if (ST.has16BitInsts())
826     FPToI.legalFor({{S16, S16}});
827   else
828     FPToI.minScalar(1, S32);
829 
830   FPToI.minScalar(0, S32)
831        .widenScalarToNextPow2(0, 32)
832        .scalarize(0)
833        .lower();
834 
835   // Lower roundeven into G_FRINT
836   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837     .scalarize(0)
838     .lower();
839 
840   if (ST.has16BitInsts()) {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S16, S32, S64})
843       .clampScalar(0, S16, S64)
844       .scalarize(0);
845   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S32, S64})
848       .clampScalar(0, S32, S64)
849       .scalarize(0);
850   } else {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32})
853       .customFor({S64})
854       .clampScalar(0, S32, S64)
855       .scalarize(0);
856   }
857 
858   getActionDefinitionsBuilder(G_PTR_ADD)
859     .legalIf(all(isPointer(0), sameSize(0, 1)))
860     .scalarize(0)
861     .scalarSameSizeAs(1, 0);
862 
863   getActionDefinitionsBuilder(G_PTRMASK)
864     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865     .scalarSameSizeAs(1, 0)
866     .scalarize(0);
867 
868   auto &CmpBuilder =
869     getActionDefinitionsBuilder(G_ICMP)
870     // The compare output type differs based on the register bank of the output,
871     // so make both s1 and s32 legal.
872     //
873     // Scalar compares producing output in scc will be promoted to s32, as that
874     // is the allocatable register type that will be needed for the copy from
875     // scc. This will be promoted during RegBankSelect, and we assume something
876     // before that won't try to use s32 result types.
877     //
878     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879     // bank.
880     .legalForCartesianProduct(
881       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882     .legalForCartesianProduct(
883       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884   if (ST.has16BitInsts()) {
885     CmpBuilder.legalFor({{S1, S16}});
886   }
887 
888   CmpBuilder
889     .widenScalarToNextPow2(1)
890     .clampScalar(1, S32, S64)
891     .scalarize(0)
892     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894   getActionDefinitionsBuilder(G_FCMP)
895     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0);
899 
900   // FIXME: fpow has a selection pattern that should move to custom lowering.
901   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902   if (ST.has16BitInsts())
903     Exp2Ops.legalFor({S32, S16});
904   else
905     Exp2Ops.legalFor({S32});
906   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907   Exp2Ops.scalarize(0);
908 
909   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910   if (ST.has16BitInsts())
911     ExpOps.customFor({{S32}, {S16}});
912   else
913     ExpOps.customFor({S32});
914   ExpOps.clampScalar(0, MinScalarFPTy, S32)
915         .scalarize(0);
916 
917   getActionDefinitionsBuilder(G_FPOWI)
918     .clampScalar(0, MinScalarFPTy, S32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder(G_CTPOP)
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   // The hardware instructions return a different result on 0 than the generic
931   // instructions expect. The hardware produces -1, but these produce the
932   // bitwidth.
933   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934     .scalarize(0)
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32)
939     .lower();
940 
941   // The 64-bit versions produce 32-bit results, but only on the SALU.
942   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943     .legalFor({{S32, S32}, {S32, S64}})
944     .clampScalar(0, S32, S32)
945     .clampScalar(1, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0, 32)
948     .widenScalarToNextPow2(1, 32);
949 
950   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951   // RegBankSelect.
952   getActionDefinitionsBuilder(G_BITREVERSE)
953     .legalFor({S32, S64})
954     .clampScalar(0, S32, S64)
955     .scalarize(0)
956     .widenScalarToNextPow2(0);
957 
958   if (ST.has16BitInsts()) {
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S16, S32, V2S16})
961       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .clampScalar(0, S16, S32)
966       .scalarize(0);
967 
968     if (ST.hasVOP3PInsts()) {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970         .legalFor({S32, S16, V2S16})
971         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972         .clampMaxNumElements(0, S16, 2)
973         .minScalar(0, S16)
974         .widenScalarToNextPow2(0)
975         .scalarize(0)
976         .lower();
977     } else {
978       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979         .legalFor({S32, S16})
980         .widenScalarToNextPow2(0)
981         .minScalar(0, S16)
982         .scalarize(0)
983         .lower();
984     }
985   } else {
986     // TODO: Should have same legality without v_perm_b32
987     getActionDefinitionsBuilder(G_BSWAP)
988       .legalFor({S32})
989       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
992       .widenScalarToNextPow2(0)
993       .maxScalar(0, S32)
994       .scalarize(0)
995       .lower();
996 
997     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998       .legalFor({S32})
999       .minScalar(0, S32)
1000       .widenScalarToNextPow2(0)
1001       .scalarize(0)
1002       .lower();
1003   }
1004 
1005   getActionDefinitionsBuilder(G_INTTOPTR)
1006     // List the common cases
1007     .legalForCartesianProduct(AddrSpaces64, {S64})
1008     .legalForCartesianProduct(AddrSpaces32, {S32})
1009     .scalarize(0)
1010     // Accept any address space as long as the size matches
1011     .legalIf(sameSize(0, 1))
1012     .widenScalarIf(smallerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       })
1016     .narrowScalarIf(largerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       });
1020 
1021   getActionDefinitionsBuilder(G_PTRTOINT)
1022     // List the common cases
1023     .legalForCartesianProduct(AddrSpaces64, {S64})
1024     .legalForCartesianProduct(AddrSpaces32, {S32})
1025     .scalarize(0)
1026     // Accept any address space as long as the size matches
1027     .legalIf(sameSize(0, 1))
1028     .widenScalarIf(smallerThan(0, 1),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031       })
1032     .narrowScalarIf(
1033       largerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       });
1037 
1038   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039     .scalarize(0)
1040     .custom();
1041 
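  // Check whether a load/store must be split into multiple memory operations:
  // either the access is larger than the address space supports in a single
  // operation, an awkward size (e.g. dwordx3 without hardware support), or
  // under-aligned for the subtarget.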
1042   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043                                     bool IsLoad) -> bool {
1044     const LLT DstTy = Query.Types[0];
1045 
1046     // Split vector extloads.
1047     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050     if (MemSize < DstTy.getSizeInBits())
1051       MemSize = std::max(MemSize, AlignBits);
1052 
1053     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054       return true;
1055 
1056     const LLT PtrTy = Query.Types[1];
1057     unsigned AS = PtrTy.getAddressSpace();
1058     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059       return true;
1060 
1061     // Catch weird sized loads that don't evenly divide into the access sizes
1062     // TODO: May be able to widen depending on alignment etc.
1063     unsigned NumRegs = (MemSize + 31) / 32;
1064     if (NumRegs == 3) {
1065       if (!ST.hasDwordx3LoadStores())
1066         return true;
1067     } else {
1068       // If the alignment allows, these should have been widened.
1069       if (!isPowerOf2_32(NumRegs))
1070         return true;
1071     }
1072 
1073     if (AlignBits < MemSize) {
1074       const SITargetLowering *TLI = ST.getTargetLowering();
1075       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076                                                       Align(AlignBits / 8));
1077     }
1078 
1079     return false;
1080   };
1081 
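  // Minimum alignment in bits required for global/buffer accesses in the
  // rules below; 0 places no alignment requirement on the rule.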
1082   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085 
1086   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087   // LDS
1088   // TODO: Unsupported flat for SI.
1089 
1090   for (unsigned Op : {G_LOAD, G_STORE}) {
1091     const bool IsStore = Op == G_STORE;
1092 
1093     auto &Actions = getActionDefinitionsBuilder(Op);
1094     // Explicitly list some common cases.
1095     // TODO: Does this help compile time at all?
1096     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1098                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1099                                       {S64, GlobalPtr, 64, GlobalAlign32},
1100                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1101                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1102                                       {S32, GlobalPtr, 8, GlobalAlign8},
1103                                       {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105                                       {S32, LocalPtr, 32, 32},
1106                                       {S64, LocalPtr, 64, 32},
1107                                       {V2S32, LocalPtr, 64, 32},
1108                                       {S32, LocalPtr, 8, 8},
1109                                       {S32, LocalPtr, 16, 16},
1110                                       {V2S16, LocalPtr, 32, 32},
1111 
1112                                       {S32, PrivatePtr, 32, 32},
1113                                       {S32, PrivatePtr, 8, 8},
1114                                       {S32, PrivatePtr, 16, 16},
1115                                       {V2S16, PrivatePtr, 32, 32},
1116 
1117                                       {S32, ConstantPtr, 32, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1119                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1120                                       {S64, ConstantPtr, 64, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122     Actions.legalIf(
1123       [=](const LegalityQuery &Query) -> bool {
1124         return isLoadStoreLegal(ST, Query);
1125       });
1126 
1127     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128     // 64-bits.
1129     //
1130     // TODO: Should generalize bitcast action into coerce, which will also cover
1131     // inserting addrspacecasts.
1132     Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134     // Turn any illegal element vectors into something easier to deal
1135     // with. These will ultimately produce 32-bit scalar shifts to extract the
1136     // parts anyway.
1137     //
1138     // For odd 16-bit element vectors, prefer to split those into pieces with
1139     // 16-bit vector parts.
1140     Actions.bitcastIf(
1141       [=](const LegalityQuery &Query) -> bool {
1142         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143                                           Query.MMODescrs[0].SizeInBits);
1144       }, bitcastToRegisterType(0));
1145 
1146     if (!IsStore) {
1147       // Widen suitably aligned loads by loading extra bytes. The standard
1148       // legalization actions can't properly express widening memory operands.
1149       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150         return shouldWidenLoad(ST, Query, G_LOAD);
1151       });
1152     }
1153 
1154     // FIXME: load/store narrowing should be moved to lower action
1155     Actions
1156         .narrowScalarIf(
1157             [=](const LegalityQuery &Query) -> bool {
1158               return !Query.Types[0].isVector() &&
1159                      needToSplitMemOp(Query, Op == G_LOAD);
1160             },
1161             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162               const LLT DstTy = Query.Types[0];
1163               const LLT PtrTy = Query.Types[1];
1164 
1165               const unsigned DstSize = DstTy.getSizeInBits();
1166               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168               // Split extloads.
1169               if (DstSize > MemSize)
1170                 return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172               if (!isPowerOf2_32(DstSize)) {
1173                 // We're probably decomposing an odd sized store. Try to split
1174                 // to the widest type. TODO: Account for alignment. As-is it
1175                 // should be OK, since the new parts will be further legalized.
1176                 unsigned FloorSize = PowerOf2Floor(DstSize);
1177                 return std::make_pair(0, LLT::scalar(FloorSize));
1178               }
1179 
1180               if (DstSize > 32 && (DstSize % 32 != 0)) {
1181                 // FIXME: Need a way to specify non-extload of larger size if
1182                 // suitably aligned.
1183                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184               }
1185 
1186               unsigned MaxSize = maxSizeForAddrSpace(ST,
1187                                                      PtrTy.getAddressSpace(),
1188                                                      Op == G_LOAD);
1189               if (MemSize > MaxSize)
1190                 return std::make_pair(0, LLT::scalar(MaxSize));
1191 
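              // Only the under-aligned case remains; split into pieces no
              // wider than the known alignment.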
1192               unsigned Align = Query.MMODescrs[0].AlignInBits;
1193               return std::make_pair(0, LLT::scalar(Align));
1194             })
1195         .fewerElementsIf(
1196             [=](const LegalityQuery &Query) -> bool {
1197               return Query.Types[0].isVector() &&
1198                      needToSplitMemOp(Query, Op == G_LOAD);
1199             },
1200             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201               const LLT DstTy = Query.Types[0];
1202               const LLT PtrTy = Query.Types[1];
1203 
1204               LLT EltTy = DstTy.getElementType();
1205               unsigned MaxSize = maxSizeForAddrSpace(ST,
1206                                                      PtrTy.getAddressSpace(),
1207                                                      Op == G_LOAD);
1208 
1209               // FIXME: Handle widened to power of 2 results better. This ends
1210               // up scalarizing.
1211               // FIXME: 3 element stores scalarized on SI
1212 
1213               // Split if it's too large for the address space.
1214               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215                 unsigned NumElts = DstTy.getNumElements();
1216                 unsigned EltSize = EltTy.getSizeInBits();
1217 
1218                 if (MaxSize % EltSize == 0) {
1219                   return std::make_pair(
1220                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221                 }
1222 
1223                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224 
1225                 // FIXME: Refine when odd breakdowns handled
1226                 // The scalars will need to be re-legalized.
1227                 if (NumPieces == 1 || NumPieces >= NumElts ||
1228                     NumElts % NumPieces != 0)
1229                   return std::make_pair(0, EltTy);
1230 
1231                 return std::make_pair(0,
1232                                       LLT::vector(NumElts / NumPieces, EltTy));
1233               }
1234 
1235               // FIXME: We could probably handle weird extending loads better.
1236               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237               if (DstTy.getSizeInBits() > MemSize)
1238                 return std::make_pair(0, EltTy);
1239 
1240               unsigned EltSize = EltTy.getSizeInBits();
1241               unsigned DstSize = DstTy.getSizeInBits();
1242               if (!isPowerOf2_32(DstSize)) {
1243                 // We're probably decomposing an odd sized store. Try to split
1244                 // to the widest type. TODO: Account for alignment. As-is it
1245                 // should be OK, since the new parts will be further legalized.
1246                 unsigned FloorSize = PowerOf2Floor(DstSize);
1247                 return std::make_pair(
1248                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249               }
1250 
1251               // Need to split because of alignment.
1252               unsigned Align = Query.MMODescrs[0].AlignInBits;
1253               if (EltSize > Align &&
1254                   (EltSize / Align < DstTy.getNumElements())) {
1255                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256               }
1257 
1258               // May need relegalization for the scalars.
1259               return std::make_pair(0, EltTy);
1260             })
1261     .lowerIfMemSizeNotPow2()
1262     .minScalar(0, S32)
1263     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264     .widenScalarToNextPow2(0)
1265     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266     .lower();
1267   }
1268 
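  // Extending loads are only legal with a 32-bit result; everything else is
  // clamped to s32 or lowered to a plain load plus extend.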
1269   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271                                                   {S32, GlobalPtr, 16, 2 * 8},
1272                                                   {S32, LocalPtr, 8, 8},
1273                                                   {S32, LocalPtr, 16, 16},
1274                                                   {S32, PrivatePtr, 8, 8},
1275                                                   {S32, PrivatePtr, 16, 16},
1276                                                   {S32, ConstantPtr, 8, 8},
1277                                                   {S32, ConstantPtr, 16, 2 * 8}})
1278                        .legalIf(
1279                          [=](const LegalityQuery &Query) -> bool {
1280                            return isLoadStoreLegal(ST, Query);
1281                          });
1282 
1283   if (ST.hasFlatAddressSpace()) {
1284     ExtLoads.legalForTypesWithMemDesc(
1285         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286   }
1287 
1288   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1289   // 64-bits.
1290   //
1291   // TODO: Should generalize bitcast action into coerce, which will also cover
1292   // inserting addrspacecasts.
1293   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294 
1295   ExtLoads.clampScalar(0, S32, S32)
1296           .widenScalarToNextPow2(0)
1297           .unsupportedIfMemSizeNotPow2()
1298           .lower();
1299 
1300   auto &Atomics = getActionDefinitionsBuilder(
1301     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304      G_ATOMICRMW_UMIN})
1305     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306                {S64, GlobalPtr}, {S64, LocalPtr},
1307                {S32, RegionPtr}, {S64, RegionPtr}});
1308   if (ST.hasFlatAddressSpace()) {
1309     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310   }
1311 
1312   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313   if (ST.hasLDSFPAtomics()) {
1314     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315     if (ST.hasGFX90AInsts())
1316       Atomic.legalFor({{S64, LocalPtr}});
1317   }
1318   if (ST.hasAtomicFaddInsts())
1319     Atomic.legalFor({{S32, GlobalPtr}});
1320 
1321   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1322   // demarshalling
1323   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325                 {S32, FlatPtr}, {S64, FlatPtr}})
1326     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327                {S32, RegionPtr}, {S64, RegionPtr}});
1328   // TODO: Pointer types, any 32-bit or 64-bit vector
1329 
1330   // Condition should be s32 for scalar, s1 for vector.
1331   getActionDefinitionsBuilder(G_SELECT)
1332     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335     .clampScalar(0, S16, S64)
1336     .scalarize(1)
1337     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339     .clampMaxNumElements(0, S32, 2)
1340     .clampMaxNumElements(0, LocalPtr, 2)
1341     .clampMaxNumElements(0, PrivatePtr, 2)
1342     .scalarize(0)
1343     .widenScalarToNextPow2(0)
1344     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 
1346   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347   // be more flexible with the shift amount type.
1348   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349     .legalFor({{S32, S32}, {S64, S32}});
1350   if (ST.has16BitInsts()) {
1351     if (ST.hasVOP3PInsts()) {
1352       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353             .clampMaxNumElements(0, S16, 2);
1354     } else
1355       Shifts.legalFor({{S16, S16}});
1356 
1357     // TODO: Support 16-bit shift amounts for all types
1358     Shifts.widenScalarIf(
1359       [=](const LegalityQuery &Query) {
1360         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361         // 32-bit amount.
1362         const LLT ValTy = Query.Types[0];
1363         const LLT AmountTy = Query.Types[1];
1364         return ValTy.getSizeInBits() <= 16 &&
1365                AmountTy.getSizeInBits() < 16;
1366       }, changeTo(1, S16));
1367     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368     Shifts.clampScalar(1, S32, S32);
1369     Shifts.clampScalar(0, S16, S64);
1370     Shifts.widenScalarToNextPow2(0, 16);
1371 
1372     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373       .minScalar(0, S16)
1374       .scalarize(0)
1375       .lower();
1376   } else {
1377     // Make sure we legalize the shift amount type first, as the general
1378     // expansion for the shifted type will produce much worse code if it hasn't
1379     // been truncated already.
1380     Shifts.clampScalar(1, S32, S32);
1381     Shifts.clampScalar(0, S32, S64);
1382     Shifts.widenScalarToNextPow2(0, 32);
1383 
1384     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385       .minScalar(0, S32)
1386       .scalarize(0)
1387       .lower();
1388   }
1389   Shifts.scalarize(0);
1390 
1391   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394     unsigned IdxTypeIdx = 2;
1395 
1396     getActionDefinitionsBuilder(Op)
1397       .customIf([=](const LegalityQuery &Query) {
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const LLT IdxTy = Query.Types[IdxTypeIdx];
1401           const unsigned EltSize = EltTy.getSizeInBits();
1402           return (EltSize == 32 || EltSize == 64) &&
1403                   VecTy.getSizeInBits() % 32 == 0 &&
1404                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1405                   IdxTy.getSizeInBits() == 32;
1406         })
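      // Bitcast vectors with sub-32-bit elements to 32-bit element vectors
      // (e.g. <8 x s16> -> <4 x s32>) so the access works on 32-bit pieces.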
1407       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408                  bitcastToVectorElement32(VecTypeIdx))
1409       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410       .bitcastIf(
1411         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412         [=](const LegalityQuery &Query) {
1413           // For > 64-bit element types, try to turn this into a 64-bit
1414           // element vector since we may be able to do better indexing
1415           // if this is scalar. If not, fall back to 32.
1416           const LLT EltTy = Query.Types[EltTypeIdx];
1417           const LLT VecTy = Query.Types[VecTypeIdx];
1418           const unsigned DstEltSize = EltTy.getSizeInBits();
1419           const unsigned VecSize = VecTy.getSizeInBits();
1420 
1421           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422           return std::make_pair(
1423             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424         })
1425       .clampScalar(EltTypeIdx, S32, S64)
1426       .clampScalar(VecTypeIdx, S32, S64)
1427       .clampScalar(IdxTypeIdx, S32, S32)
1428       .clampMaxNumElements(VecTypeIdx, S32, 32)
1429       // TODO: Clamp elements for 64-bit vectors?
1430       // It should only be necessary with variable indexes.
1431       // As a last resort, lower to the stack
1432       .lower();
1433   }
1434 
1435   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436     .unsupportedIf([=](const LegalityQuery &Query) {
1437         const LLT &EltTy = Query.Types[1].getElementType();
1438         return Query.Types[0] != EltTy;
1439       });
1440 
1441   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445     // FIXME: Doesn't handle extract of illegal sizes.
1446     getActionDefinitionsBuilder(Op)
1447       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448       // FIXME: Multiples of 16 should not be legal.
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (BigTy.getSizeInBits() % 32 == 0) &&
1453                  (LitTy.getSizeInBits() % 16 == 0);
1454         })
1455       .widenScalarIf(
1456         [=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           return (BigTy.getScalarSizeInBits() < 16);
1459         },
1460         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (LitTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468       .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470   }
1471 
1472   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473     .legalForCartesianProduct(AllS32Vectors, {S32})
1474     .legalForCartesianProduct(AllS64Vectors, {S64})
1475     .clampNumElements(0, V16S32, V32S32)
1476     .clampNumElements(0, V2S64, V16S64)
1477     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479   if (ST.hasScalarPackInsts()) {
1480     BuildVector
1481       // FIXME: Should probably widen s1 vectors straight to s32
1482       .minScalarOrElt(0, S16)
1483       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484       .minScalar(1, S32);
1485 
1486     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487       .legalFor({V2S16, S32})
1488       .lower();
1489     BuildVector.minScalarOrElt(0, S32);
1490   } else {
1491     BuildVector.customFor({V2S16, S16});
1492     BuildVector.minScalarOrElt(0, S32);
1493 
1494     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495       .customFor({V2S16, S32})
1496       .lower();
1497   }
1498 
1499   BuildVector.legalIf(isRegisterType(0));
1500 
1501   // FIXME: Clamp maximum size
1502   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504     .clampMaxNumElements(0, S32, 32)
1505     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506     .clampMaxNumElements(0, S16, 64);
1507 
1508   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509   // pre-legalize.
1510   if (ST.hasVOP3PInsts()) {
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512       .customFor({V2S16, V2S16})
1513       .lower();
1514   } else
1515     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517   // Merge/Unmerge
1518   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523       const LLT Ty = Query.Types[TypeIdx];
1524       if (Ty.isVector()) {
1525         const LLT &EltTy = Ty.getElementType();
1526         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527           return true;
1528         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529           return true;
1530       }
1531       return false;
1532     };
1533 
1534     auto &Builder = getActionDefinitionsBuilder(Op)
1535       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536       .lowerFor({{S16, V2S16}})
1537       .lowerIf([=](const LegalityQuery &Query) {
1538           const LLT BigTy = Query.Types[BigTyIdx];
1539           return BigTy.getSizeInBits() == 32;
1540         })
1541       // Try to widen to s16 first for small types.
1542       // TODO: Only do this on targets with legal s16 shifts
1543       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547                            elementTypeIs(1, S16)),
1548                        changeTo(1, V2S16))
1549       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1550       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551       // valid.
1552       .clampScalar(LitTyIdx, S32, S512)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554       // Break up vectors with weird elements into scalars
1555       .fewerElementsIf(
1556         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557         scalarize(0))
1558       .fewerElementsIf(
1559         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560         scalarize(1))
1561       .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563     if (Op == G_MERGE_VALUES) {
1564       Builder.widenScalarIf(
1565         // TODO: Use 16-bit shifts if legal for 8-bit values?
1566         [=](const LegalityQuery &Query) {
1567           const LLT Ty = Query.Types[LitTyIdx];
1568           return Ty.getSizeInBits() < 32;
1569         },
1570         changeTo(LitTyIdx, S32));
1571     }
1572 
1573     Builder.widenScalarIf(
1574       [=](const LegalityQuery &Query) {
1575         const LLT Ty = Query.Types[BigTyIdx];
1576         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577           Ty.getSizeInBits() % 16 != 0;
1578       },
1579       [=](const LegalityQuery &Query) {
1580         // Pick the next power of 2, or a multiple of 64 over 128.
1581         // Whichever is smaller.
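        // e.g. an s130 wide type is widened to s192 (the next multiple of 64)
        // rather than to s256 (the next power of 2).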
1582         const LLT &Ty = Query.Types[BigTyIdx];
1583         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584         if (NewSizeInBits >= 256) {
1585           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586           if (RoundedTo < NewSizeInBits)
1587             NewSizeInBits = RoundedTo;
1588         }
1589         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590       })
1591       // Any vectors left are the wrong size. Scalarize them.
1592       .scalarize(0)
1593       .scalarize(1);
1594   }
1595 
1596   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597   // RegBankSelect.
1598   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599     .legalFor({{S32}, {S64}});
1600 
1601   if (ST.hasVOP3PInsts()) {
1602     SextInReg.lowerFor({{V2S16}})
1603       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604       // get more vector shift opportunities, since we'll get those when
1605       // expanded.
1606       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607   } else if (ST.has16BitInsts()) {
1608     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609   } else {
1610     // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
1611     // This avoids a lot of intermediate truncate and extend operations.
1612     SextInReg.lowerFor({{S32}, {S64}});
1613   }
1614 
1615   SextInReg
1616     .scalarize(0)
1617     .clampScalar(0, S32, S64)
1618     .lower();
1619 
1620   // TODO: Only try to form v2s16 with legal packed instructions.
1621   getActionDefinitionsBuilder(G_FSHR)
1622     .legalFor({{S32, S32}})
1623     .lowerFor({{V2S16, V2S16}})
1624     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625     .scalarize(0)
1626     .lower();
1627 
1628   if (ST.hasVOP3PInsts()) {
1629     getActionDefinitionsBuilder(G_FSHL)
1630       .lowerFor({{V2S16, V2S16}})
1631       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632       .scalarize(0)
1633       .lower();
1634   } else {
1635     getActionDefinitionsBuilder(G_FSHL)
1636       .scalarize(0)
1637       .lower();
1638   }
1639 
1640   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641     .legalFor({S64});
1642 
1643   getActionDefinitionsBuilder(G_FENCE)
1644     .alwaysLegal();
1645 
1646   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647       .scalarize(0)
1648       .minScalar(0, S32)
1649       .lower();
1650 
1651   getActionDefinitionsBuilder({
1652       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653       G_FCOPYSIGN,
1654 
1655       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656       G_ATOMICRMW_NAND,
1657       G_ATOMICRMW_FSUB,
1658       G_READ_REGISTER,
1659       G_WRITE_REGISTER,
1660 
1661       G_SADDO, G_SSUBO,
1662 
1663        // TODO: Implement
1664       G_FMINIMUM, G_FMAXIMUM}).lower();
1665 
1666   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669     .unsupported();
1670 
1671   getLegacyLegalizerInfo().computeTables();
1672   verify(*ST.getInstrInfo());
1673 }
1674 
1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676                                          MachineInstr &MI) const {
1677   MachineIRBuilder &B = Helper.MIRBuilder;
1678   MachineRegisterInfo &MRI = *B.getMRI();
1679 
1680   switch (MI.getOpcode()) {
1681   case TargetOpcode::G_ADDRSPACE_CAST:
1682     return legalizeAddrSpaceCast(MI, MRI, B);
1683   case TargetOpcode::G_FRINT:
1684     return legalizeFrint(MI, MRI, B);
1685   case TargetOpcode::G_FCEIL:
1686     return legalizeFceil(MI, MRI, B);
1687   case TargetOpcode::G_FREM:
1688     return legalizeFrem(MI, MRI, B);
1689   case TargetOpcode::G_INTRINSIC_TRUNC:
1690     return legalizeIntrinsicTrunc(MI, MRI, B);
1691   case TargetOpcode::G_SITOFP:
1692     return legalizeITOFP(MI, MRI, B, true);
1693   case TargetOpcode::G_UITOFP:
1694     return legalizeITOFP(MI, MRI, B, false);
1695   case TargetOpcode::G_FPTOSI:
1696     return legalizeFPTOI(MI, MRI, B, true);
1697   case TargetOpcode::G_FPTOUI:
1698     return legalizeFPTOI(MI, MRI, B, false);
1699   case TargetOpcode::G_FMINNUM:
1700   case TargetOpcode::G_FMAXNUM:
1701   case TargetOpcode::G_FMINNUM_IEEE:
1702   case TargetOpcode::G_FMAXNUM_IEEE:
1703     return legalizeMinNumMaxNum(Helper, MI);
1704   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705     return legalizeExtractVectorElt(MI, MRI, B);
1706   case TargetOpcode::G_INSERT_VECTOR_ELT:
1707     return legalizeInsertVectorElt(MI, MRI, B);
1708   case TargetOpcode::G_SHUFFLE_VECTOR:
1709     return legalizeShuffleVector(MI, MRI, B);
1710   case TargetOpcode::G_FSIN:
1711   case TargetOpcode::G_FCOS:
1712     return legalizeSinCos(MI, MRI, B);
1713   case TargetOpcode::G_GLOBAL_VALUE:
1714     return legalizeGlobalValue(MI, MRI, B);
1715   case TargetOpcode::G_LOAD:
1716   case TargetOpcode::G_SEXTLOAD:
1717   case TargetOpcode::G_ZEXTLOAD:
1718     return legalizeLoad(Helper, MI);
1719   case TargetOpcode::G_FMAD:
1720     return legalizeFMad(MI, MRI, B);
1721   case TargetOpcode::G_FDIV:
1722     return legalizeFDIV(MI, MRI, B);
1723   case TargetOpcode::G_UDIV:
1724   case TargetOpcode::G_UREM:
1725   case TargetOpcode::G_UDIVREM:
1726     return legalizeUnsignedDIV_REM(MI, MRI, B);
1727   case TargetOpcode::G_SDIV:
1728   case TargetOpcode::G_SREM:
1729   case TargetOpcode::G_SDIVREM:
1730     return legalizeSignedDIV_REM(MI, MRI, B);
1731   case TargetOpcode::G_ATOMIC_CMPXCHG:
1732     return legalizeAtomicCmpXChg(MI, MRI, B);
1733   case TargetOpcode::G_FLOG:
1734     return legalizeFlog(MI, B, numbers::ln2f);
1735   case TargetOpcode::G_FLOG10:
1736     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737   case TargetOpcode::G_FEXP:
1738     return legalizeFExp(MI, B);
1739   case TargetOpcode::G_FPOW:
1740     return legalizeFPow(MI, B);
1741   case TargetOpcode::G_FFLOOR:
1742     return legalizeFFloor(MI, MRI, B);
1743   case TargetOpcode::G_BUILD_VECTOR:
1744     return legalizeBuildVector(MI, MRI, B);
1745   default:
1746     return false;
1747   }
1748 
1749   llvm_unreachable("expected switch to return");
1750 }
1751 
1752 Register AMDGPULegalizerInfo::getSegmentAperture(
1753   unsigned AS,
1754   MachineRegisterInfo &MRI,
1755   MachineIRBuilder &B) const {
1756   MachineFunction &MF = B.getMF();
1757   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758   const LLT S32 = LLT::scalar(32);
1759 
1760   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761 
1762   if (ST.hasApertureRegs()) {
1763     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764     // getreg.
1765     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1767         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1768     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1769         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1770         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1771     unsigned Encoding =
1772         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1773         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1774         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775 
1776     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777 
1778     B.buildInstr(AMDGPU::S_GETREG_B32)
1779       .addDef(GetReg)
1780       .addImm(Encoding);
1781     MRI.setType(GetReg, S32);
1782 
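    // The 16-bit field read by getreg holds bits [31:16] of the aperture
    // (the high half of the 64-bit segment base address), so shift it left by
    // the field width (WidthM1 + 1 == 16) to form the full 32-bit aperture
    // value.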
1783     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785   }
1786 
1787   Register QueuePtr = MRI.createGenericVirtualRegister(
1788     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789 
1790   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791     return Register();
1792 
1793   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794   // private_segment_aperture_base_hi.
1795   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796 
1797   // TODO: can we be smarter about machine pointer info?
1798   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799   MachineMemOperand *MMO = MF.getMachineMemOperand(
1800       PtrInfo,
1801       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802           MachineMemOperand::MOInvariant,
1803       4, commonAlignment(Align(64), StructOffset));
1804 
1805   Register LoadAddr;
1806 
1807   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809 }
1810 
1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1812   MachineInstr &MI, MachineRegisterInfo &MRI,
1813   MachineIRBuilder &B) const {
1814   MachineFunction &MF = B.getMF();
1815 
1816   const LLT S32 = LLT::scalar(32);
1817   Register Dst = MI.getOperand(0).getReg();
1818   Register Src = MI.getOperand(1).getReg();
1819 
1820   LLT DstTy = MRI.getType(Dst);
1821   LLT SrcTy = MRI.getType(Src);
1822   unsigned DestAS = DstTy.getAddressSpace();
1823   unsigned SrcAS = SrcTy.getAddressSpace();
1824 
1825   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826   // vector element.
1827   assert(!DstTy.isVector());
1828 
1829   const AMDGPUTargetMachine &TM
1830     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831 
1832   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834     return true;
1835   }
1836 
1837   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838     // Truncate.
1839     B.buildExtract(Dst, Src, 0);
1840     MI.eraseFromParent();
1841     return true;
1842   }
1843 
1844   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847 
1848     // FIXME: This is a bit ugly since it merges two pointer values into
1849     // another pointer. Merge operands are required to be the same type, but
1850     // creating an extra ptrtoint would be kind of pointless.
1851     auto HighAddr = B.buildConstant(
1852       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853     B.buildMerge(Dst, {Src, HighAddr});
1854     MI.eraseFromParent();
1855     return true;
1856   }
1857 
1858   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1861     unsigned NullVal = TM.getNullPointerValue(DestAS);
1862 
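    // A flat null pointer must cast to the segment's null value rather than
    // to whatever its low 32 bits happen to be, so compare against null and
    // select instead of blindly truncating.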
1863     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864     auto FlatNull = B.buildConstant(SrcTy, 0);
1865 
1866     // Extract low 32-bits of the pointer.
1867     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868 
1869     auto CmpRes =
1870         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872 
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878     return false;
1879 
1880   if (!ST.hasFlatAddressSpace())
1881     return false;
1882 
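  // Segment -> flat: build the 64-bit flat pointer as {32-bit segment offset,
  // 32-bit aperture base}, except that the segment null value must map to the
  // flat null value.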
1883   auto SegmentNull =
1884       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885   auto FlatNull =
1886       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887 
1888   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889   if (!ApertureReg.isValid())
1890     return false;
1891 
1892   auto CmpRes =
1893       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894 
1895   // Coerce the type of the low half of the result so we can use merge_values.
1896   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897 
1898   // TODO: Should we allow mismatched types but matching sizes in merges to
1899   // avoid the ptrtoint?
1900   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902 
1903   MI.eraseFromParent();
1904   return true;
1905 }
1906 
1907 bool AMDGPULegalizerInfo::legalizeFrint(
1908   MachineInstr &MI, MachineRegisterInfo &MRI,
1909   MachineIRBuilder &B) const {
1910   Register Src = MI.getOperand(1).getReg();
1911   LLT Ty = MRI.getType(Src);
1912   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913 
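  // C1 is 2^52: adding and then subtracting copysign(2^52, src) forces the
  // value to be rounded to an integer in the default round-to-nearest-even
  // mode. C2 is the largest magnitude for which that trick is needed; inputs
  // with |src| > C2 are already integral and are passed through by the final
  // select.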
1914   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1916 
1917   auto C1 = B.buildFConstant(Ty, C1Val);
1918   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919 
1920   // TODO: Should this propagate fast-math-flags?
1921   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923 
1924   auto C2 = B.buildFConstant(Ty, C2Val);
1925   auto Fabs = B.buildFAbs(Ty, Src);
1926 
1927   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929   MI.eraseFromParent();
1930   return true;
1931 }
1932 
1933 bool AMDGPULegalizerInfo::legalizeFceil(
1934   MachineInstr &MI, MachineRegisterInfo &MRI,
1935   MachineIRBuilder &B) const {
1936 
1937   const LLT S1 = LLT::scalar(1);
1938   const LLT S64 = LLT::scalar(64);
1939 
1940   Register Src = MI.getOperand(1).getReg();
1941   assert(MRI.getType(Src) == S64);
1942 
1943   // result = trunc(src)
1944   // if (src > 0.0 && src != result)
1945   //   result += 1.0
1946 
1947   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948 
1949   const auto Zero = B.buildFConstant(S64, 0.0);
1950   const auto One = B.buildFConstant(S64, 1.0);
1951   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1954   auto Add = B.buildSelect(S64, And, One, Zero);
1955 
1956   // TODO: Should this propagate fast-math-flags?
1957   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1958   return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeFrem(
1962   MachineInstr &MI, MachineRegisterInfo &MRI,
1963   MachineIRBuilder &B) const {
1964   Register DstReg = MI.getOperand(0).getReg();
1965   Register Src0Reg = MI.getOperand(1).getReg();
1966   Register Src1Reg = MI.getOperand(2).getReg();
1967   auto Flags = MI.getFlags();
1968   LLT Ty = MRI.getType(DstReg);
1969 
1970   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974   MI.eraseFromParent();
1975   return true;
1976 }
1977 
1978 static MachineInstrBuilder extractF64Exponent(Register Hi,
1979                                               MachineIRBuilder &B) {
1980   const unsigned FractBits = 52;
1981   const unsigned ExpBits = 11;
1982   LLT S32 = LLT::scalar(32);
1983 
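  // The exponent occupies bits [62:52] of the double, i.e. bits [30:20] of
  // the high word, so extract an 11-bit field at offset 52 - 32 with ubfe and
  // subtract the IEEE-754 bias of 1023 to get the unbiased exponent.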
1984   auto Const0 = B.buildConstant(S32, FractBits - 32);
1985   auto Const1 = B.buildConstant(S32, ExpBits);
1986 
1987   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988     .addUse(Hi)
1989     .addUse(Const0.getReg(0))
1990     .addUse(Const1.getReg(0));
1991 
1992   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B) const {
1998   const LLT S1 = LLT::scalar(1);
1999   const LLT S32 = LLT::scalar(32);
2000   const LLT S64 = LLT::scalar(64);
2001 
2002   Register Src = MI.getOperand(1).getReg();
2003   assert(MRI.getType(Src) == S64);
2004 
2005   // TODO: Should this use extract since the low half is unused?
2006   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007   Register Hi = Unmerge.getReg(1);
2008 
2009   // Extract the upper half, since this is where we will find the sign and
2010   // exponent.
2011   auto Exp = extractF64Exponent(Hi, B);
2012 
2013   const unsigned FractBits = 52;
2014 
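  // Truncation clears the fraction bits below the binary point: shift the
  // 52-bit fraction mask right by the unbiased exponent and AND the source
  // with the complement. Exponents < 0 mean |src| < 1, so only the sign
  // survives; exponents > 51 mean the value is already an integer and is
  // returned unchanged.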
2015   // Extract the sign bit.
2016   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018 
2019   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020 
2021   const auto Zero32 = B.buildConstant(S32, 0);
2022 
2023   // Extend back to 64-bits.
2024   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025 
2026   auto Shr = B.buildAShr(S64, FractMask, Exp);
2027   auto Not = B.buildNot(S64, Shr);
2028   auto Tmp0 = B.buildAnd(S64, Src, Not);
2029   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030 
2031   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033 
2034   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeITOFP(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B, bool Signed) const {
2043 
2044   Register Dst = MI.getOperand(0).getReg();
2045   Register Src = MI.getOperand(1).getReg();
2046 
2047   const LLT S64 = LLT::scalar(64);
2048   const LLT S32 = LLT::scalar(32);
2049 
2050   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051 
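  // Convert the two 32-bit halves separately and recombine as
  // fp(hi) * 2^32 + fp(lo). Only the high half carries the sign in the
  // signed case; the low half is always converted as unsigned.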
2052   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053 
2054   auto CvtHi = Signed ?
2055     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056     B.buildUITOFP(S64, Unmerge.getReg(1));
2057 
2058   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059 
2060   auto ThirtyTwo = B.buildConstant(S32, 32);
2061   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062     .addUse(CvtHi.getReg(0))
2063     .addUse(ThirtyTwo.getReg(0));
2064 
2065   // TODO: Should this propagate fast-math-flags?
2066   B.buildFAdd(Dst, LdExp, CvtLo);
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
2071 // TODO: Copied from DAG implementation. Verify logic and document how this
2072 // actually works.
2073 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2074                                         MachineRegisterInfo &MRI,
2075                                         MachineIRBuilder &B,
2076                                         bool Signed) const {
2077 
2078   Register Dst = MI.getOperand(0).getReg();
2079   Register Src = MI.getOperand(1).getReg();
2080 
2081   const LLT S64 = LLT::scalar(64);
2082   const LLT S32 = LLT::scalar(32);
2083 
2084   const LLT SrcLT = MRI.getType(Src);
2085   const LLT DstLT = MRI.getType(Dst);
2086 
2087   assert((SrcLT == S32 || SrcLT == S64) && DstLT == S64);
2088 
2089   unsigned Flags = MI.getFlags();
2090 
2091   // The basic idea of converting a floating point number into a pair of 32-bit
2092   // integers is illustrated as follows:
2093   //
2094   //     tf := trunc(val);
2095   //    hif := floor(tf * 2^-32);
2096   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2097   //     hi := fptoi(hif);
2098   //     lo := fptoi(lof);
2099   //
2100   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2101   MachineInstrBuilder Sign;
2102   if (Signed && SrcLT == S32) {
2103     // However, a 32-bit floating point number has only a 23-bit mantissa,
2104     // which is not enough to hold all the significant bits of `lof` if val
2105     // is negative. To avoid the loss of precision, we take the absolute
2106     // value after truncating and flip the result back based on the
2107     // original signedness.
2108     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2109     Trunc = B.buildFAbs(S32, Trunc, Flags);
2110   }
2111   MachineInstrBuilder K0, K1;
2112   if (SrcLT == S64) {
2113     K0 = B.buildFConstant(S64,
2114                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2115     K1 = B.buildFConstant(S64,
2116                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2117   } else {
2118     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2119     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2120   }
2121 
2122   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2123   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2124   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2125 
2126   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2127                                      : B.buildFPTOUI(S32, FloorMul);
2128   auto Lo = B.buildFPTOUI(S32, Fma);
2129 
2130   if (Signed && SrcLT == S32) {
2131     // Flip the result based on the signedness, which is either all 0s or 1s.
2132     Sign = B.buildMerge(S64, {Sign, Sign});
2133     // r := xor({lo, hi}, sign) - sign;
2134     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2135   } else
2136     B.buildMerge(Dst, {Lo, Hi});
2137   MI.eraseFromParent();
2138 
2139   return true;
2140 }
2141 
2142 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2143                                                MachineInstr &MI) const {
2144   MachineFunction &MF = Helper.MIRBuilder.getMF();
2145   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2146 
2147   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2148                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2149 
2150   // With ieee_mode disabled, the instructions have the correct behavior
2151   // already for G_FMINNUM/G_FMAXNUM
2152   if (!MFI->getMode().IEEE)
2153     return !IsIEEEOp;
2154 
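  // In IEEE mode the hardware min/max instructions match the semantics of the
  // *_IEEE opcodes, so those are fine as-is, while plain G_FMINNUM/G_FMAXNUM
  // need the generic expansion below.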
2155   if (IsIEEEOp)
2156     return true;
2157 
2158   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2162   MachineInstr &MI, MachineRegisterInfo &MRI,
2163   MachineIRBuilder &B) const {
2164   // TODO: Should move some of this into LegalizerHelper.
2165 
2166   // TODO: Promote dynamic indexing of s16 to s32
2167 
2168   // FIXME: Artifact combiner probably should have replaced the truncated
2169   // constant before this, so we shouldn't need
2170   // getConstantVRegValWithLookThrough.
2171   Optional<ValueAndVReg> MaybeIdxVal =
2172       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2173   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2174     return true;
2175   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2176 
2177   Register Dst = MI.getOperand(0).getReg();
2178   Register Vec = MI.getOperand(1).getReg();
2179 
2180   LLT VecTy = MRI.getType(Vec);
2181   LLT EltTy = VecTy.getElementType();
2182   assert(EltTy == MRI.getType(Dst));
2183 
2184   if (IdxVal < VecTy.getNumElements())
2185     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2186   else
2187     B.buildUndef(Dst);
2188 
2189   MI.eraseFromParent();
2190   return true;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2194   MachineInstr &MI, MachineRegisterInfo &MRI,
2195   MachineIRBuilder &B) const {
2196   // TODO: Should move some of this into LegalizerHelper.
2197 
2198   // TODO: Promote dynamic indexing of s16 to s32
2199 
2200   // FIXME: Artifact combiner probably should have replaced the truncated
2201   // constant before this, so we shouldn't need
2202   // getConstantVRegValWithLookThrough.
2203   Optional<ValueAndVReg> MaybeIdxVal =
2204       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2205   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2206     return true;
2207 
2208   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Vec = MI.getOperand(1).getReg();
2211   Register Ins = MI.getOperand(2).getReg();
2212 
2213   LLT VecTy = MRI.getType(Vec);
2214   LLT EltTy = VecTy.getElementType();
2215   assert(EltTy == MRI.getType(Ins));
2216 
2217   if (IdxVal < VecTy.getNumElements())
2218     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2219   else
2220     B.buildUndef(Dst);
2221 
2222   MI.eraseFromParent();
2223   return true;
2224 }
2225 
2226 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2227   MachineInstr &MI, MachineRegisterInfo &MRI,
2228   MachineIRBuilder &B) const {
2229   const LLT V2S16 = LLT::vector(2, 16);
2230 
2231   Register Dst = MI.getOperand(0).getReg();
2232   Register Src0 = MI.getOperand(1).getReg();
2233   LLT DstTy = MRI.getType(Dst);
2234   LLT SrcTy = MRI.getType(Src0);
2235 
2236   if (SrcTy == V2S16 && DstTy == V2S16 &&
2237       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2238     return true;
2239 
2240   MachineIRBuilder HelperBuilder(MI);
2241   GISelObserverWrapper DummyObserver;
2242   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2243   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeSinCos(
2247   MachineInstr &MI, MachineRegisterInfo &MRI,
2248   MachineIRBuilder &B) const {
2249 
2250   Register DstReg = MI.getOperand(0).getReg();
2251   Register SrcReg = MI.getOperand(1).getReg();
2252   LLT Ty = MRI.getType(DstReg);
2253   unsigned Flags = MI.getFlags();
2254 
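  // The hardware sin/cos take their input scaled by 1/(2*pi). Subtargets with
  // a reduced valid input range additionally need the scaled value wrapped
  // into [0, 1) with fract first.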
2255   Register TrigVal;
2256   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2257   if (ST.hasTrigReducedRange()) {
2258     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2259     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2260       .addUse(MulVal.getReg(0))
2261       .setMIFlags(Flags).getReg(0);
2262   } else
2263     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2264 
2265   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2266     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2267   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2268     .addUse(TrigVal)
2269     .setMIFlags(Flags);
2270   MI.eraseFromParent();
2271   return true;
2272 }
2273 
2274 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2275                                                   MachineIRBuilder &B,
2276                                                   const GlobalValue *GV,
2277                                                   int64_t Offset,
2278                                                   unsigned GAFlags) const {
2279   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2280   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2281   // to the following code sequence:
2282   //
2283   // For constant address space:
2284   //   s_getpc_b64 s[0:1]
2285   //   s_add_u32 s0, s0, $symbol
2286   //   s_addc_u32 s1, s1, 0
2287   //
2288   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2289   //   a fixup or relocation is emitted to replace $symbol with a literal
2290   //   constant, which is a pc-relative offset from the encoding of the $symbol
2291   //   operand to the global variable.
2292   //
2293   // For global address space:
2294   //   s_getpc_b64 s[0:1]
2295   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2296   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2297   //
2298   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2299   //   fixups or relocations are emitted to replace $symbol@*@lo and
2300   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2301   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2302   //   operand to the global variable.
2303   //
2304   // What we want here is an offset from the value returned by s_getpc
2305   // (which is the address of the s_add_u32 instruction) to the global
2306   // variable, but since the encoding of $symbol starts 4 bytes after the start
2307   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2308   // small. This requires us to add 4 to the global variable offset in order to
2309   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2310   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2311   // instruction.
2312 
2313   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2314 
2315   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2316     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2317 
2318   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2319     .addDef(PCReg);
2320 
2321   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2322   if (GAFlags == SIInstrInfo::MO_NONE)
2323     MIB.addImm(0);
2324   else
2325     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2326 
2327   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2328 
2329   if (PtrTy.getSizeInBits() == 32)
2330     B.buildExtract(DstReg, PCReg, 0);
2331   return true;
2332 }
2333 
2334 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2335   MachineInstr &MI, MachineRegisterInfo &MRI,
2336   MachineIRBuilder &B) const {
2337   Register DstReg = MI.getOperand(0).getReg();
2338   LLT Ty = MRI.getType(DstReg);
2339   unsigned AS = Ty.getAddressSpace();
2340 
2341   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2342   MachineFunction &MF = B.getMF();
2343   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2344 
2345   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2346     if (!MFI->isModuleEntryFunction() &&
2347         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2348       const Function &Fn = MF.getFunction();
2349       DiagnosticInfoUnsupported BadLDSDecl(
2350         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2351         DS_Warning);
2352       Fn.getContext().diagnose(BadLDSDecl);
2353 
2354       // We currently don't have a way to correctly allocate LDS objects that
2355       // aren't directly associated with a kernel. We do force inlining of
2356       // functions that use local objects. However, if these dead functions are
2357       // not eliminated, we don't want a compile time error. Just emit a warning
2358       // and a trap, since there should be no callable path here.
2359       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2360       B.buildUndef(DstReg);
2361       MI.eraseFromParent();
2362       return true;
2363     }
2364 
2365     // TODO: We could emit code to handle the initialization somewhere.
2366     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2367       const SITargetLowering *TLI = ST.getTargetLowering();
2368       if (!TLI->shouldUseLDSConstAddress(GV)) {
2369         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2370         return true; // Leave in place;
2371       }
2372 
2373       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2374         Type *Ty = GV->getValueType();
2375         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2376         // zero-sized type in other languages to declare dynamic shared memory
2377         // whose size is not known at compile time. Such arrays are allocated
2378         // by the runtime and placed directly after the statically allocated
2379         // ones, so they all share the same offset.
2380         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2381           // Adjust alignment for that dynamic shared memory array.
2382           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2383           LLT S32 = LLT::scalar(32);
2384           auto Sz =
2385               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2386           B.buildIntToPtr(DstReg, Sz);
2387           MI.eraseFromParent();
2388           return true;
2389         }
2390       }
2391 
2392       B.buildConstant(
2393           DstReg,
2394           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2395       MI.eraseFromParent();
2396       return true;
2397     }
2398 
2399     const Function &Fn = MF.getFunction();
2400     DiagnosticInfoUnsupported BadInit(
2401       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2402     Fn.getContext().diagnose(BadInit);
2403     return true;
2404   }
2405 
2406   const SITargetLowering *TLI = ST.getTargetLowering();
2407 
2408   if (TLI->shouldEmitFixup(GV)) {
2409     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2410     MI.eraseFromParent();
2411     return true;
2412   }
2413 
2414   if (TLI->shouldEmitPCReloc(GV)) {
2415     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2416     MI.eraseFromParent();
2417     return true;
2418   }
2419 
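  // Otherwise the address has to come from the GOT: materialize the GOT slot
  // address pc-relatively, then load the 64-bit pointer from it.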
2420   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2421   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2422 
2423   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2424       MachinePointerInfo::getGOT(MF),
2425       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2426           MachineMemOperand::MOInvariant,
2427       8 /*Size*/, Align(8));
2428 
2429   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2430 
2431   if (Ty.getSizeInBits() == 32) {
2432     // Truncate if this is a 32-bit constant address.
2433     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2434     B.buildExtract(DstReg, Load, 0);
2435   } else
2436     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2437 
2438   MI.eraseFromParent();
2439   return true;
2440 }
2441 
2442 static LLT widenToNextPowerOf2(LLT Ty) {
2443   if (Ty.isVector())
2444     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2445   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2446 }
2447 
2448 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2449                                        MachineInstr &MI) const {
2450   MachineIRBuilder &B = Helper.MIRBuilder;
2451   MachineRegisterInfo &MRI = *B.getMRI();
2452   GISelChangeObserver &Observer = Helper.Observer;
2453 
2454   Register PtrReg = MI.getOperand(1).getReg();
2455   LLT PtrTy = MRI.getType(PtrReg);
2456   unsigned AddrSpace = PtrTy.getAddressSpace();
2457 
2458   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2459     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2460     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2461     Observer.changingInstr(MI);
2462     MI.getOperand(1).setReg(Cast.getReg(0));
2463     Observer.changedInstr(MI);
2464     return true;
2465   }
2466 
2467   if (MI.getOpcode() != AMDGPU::G_LOAD)
2468     return false;
2469 
2470   Register ValReg = MI.getOperand(0).getReg();
2471   LLT ValTy = MRI.getType(ValReg);
2472 
2473   MachineMemOperand *MMO = *MI.memoperands_begin();
2474   const unsigned ValSize = ValTy.getSizeInBits();
2475   const unsigned MemSize = 8 * MMO->getSize();
2476   const Align MemAlign = MMO->getAlign();
2477   const unsigned AlignInBits = 8 * MemAlign.value();
2478 
2479   // Widen non-power-of-2 loads to the alignment if needed
2480   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2481     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2482 
2483     // This was already the correct extending load result type, so just adjust
2484     // the memory type.
2485     if (WideMemSize == ValSize) {
2486       MachineFunction &MF = B.getMF();
2487 
2488       MachineMemOperand *WideMMO =
2489           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2490       Observer.changingInstr(MI);
2491       MI.setMemRefs(MF, {WideMMO});
2492       Observer.changedInstr(MI);
2493       return true;
2494     }
2495 
2496     // Don't bother handling edge case that should probably never be produced.
2497     if (ValSize > WideMemSize)
2498       return false;
2499 
2500     LLT WideTy = widenToNextPowerOf2(ValTy);
2501 
2502     Register WideLoad;
2503     if (!WideTy.isVector()) {
2504       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2505       B.buildTrunc(ValReg, WideLoad).getReg(0);
2506     } else {
2507       // Extract the subvector.
2508 
2509       if (isRegisterType(ValTy)) {
2510         // If this is a case where G_EXTRACT is legal, use it.
2511         // (e.g. <3 x s32> -> <4 x s32>)
2512         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2513         B.buildExtract(ValReg, WideLoad, 0);
2514       } else {
2515         // For cases where the widened type isn't a nice register value, unmerge
2516         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2517         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2518         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2519         B.setInsertPt(B.getMBB(), MI.getIterator());
2520         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2521       }
2522     }
2523 
2524     MI.eraseFromParent();
2525     return true;
2526   }
2527 
2528   return false;
2529 }
2530 
2531 bool AMDGPULegalizerInfo::legalizeFMad(
2532   MachineInstr &MI, MachineRegisterInfo &MRI,
2533   MachineIRBuilder &B) const {
2534   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2535   assert(Ty.isScalar());
2536 
2537   MachineFunction &MF = B.getMF();
2538   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2539 
2540   // TODO: Always legal with future ftz flag.
2541   // FIXME: Do we need just output?
2542   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2543     return true;
2544   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2545     return true;
2546 
2547   MachineIRBuilder HelperBuilder(MI);
2548   GISelObserverWrapper DummyObserver;
2549   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2550   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2551 }
2552 
2553 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2554   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2555   Register DstReg = MI.getOperand(0).getReg();
2556   Register PtrReg = MI.getOperand(1).getReg();
2557   Register CmpVal = MI.getOperand(2).getReg();
2558   Register NewVal = MI.getOperand(3).getReg();
2559 
2560   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2561          "this should not have been custom lowered");
2562 
2563   LLT ValTy = MRI.getType(CmpVal);
2564   LLT VecTy = LLT::vector(2, ValTy);
2565 
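  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a two-element vector, mirroring the hardware's
  // combined data/compare source operand.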
2566   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2567 
2568   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2569     .addDef(DstReg)
2570     .addUse(PtrReg)
2571     .addUse(PackedVal)
2572     .setMemRefs(MI.memoperands());
2573 
2574   MI.eraseFromParent();
2575   return true;
2576 }
2577 
2578 bool AMDGPULegalizerInfo::legalizeFlog(
2579   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2580   Register Dst = MI.getOperand(0).getReg();
2581   Register Src = MI.getOperand(1).getReg();
2582   LLT Ty = B.getMRI()->getType(Dst);
2583   unsigned Flags = MI.getFlags();
2584 
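  // log_b(x) = log2(x) / log2(b); the caller passes 1/log2(b) as
  // Log2BaseInverted (ln(2) for the natural log, ln(2)/ln(10) for log10).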
2585   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2586   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2587 
2588   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2589   MI.eraseFromParent();
2590   return true;
2591 }
2592 
2593 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2594                                        MachineIRBuilder &B) const {
2595   Register Dst = MI.getOperand(0).getReg();
2596   Register Src = MI.getOperand(1).getReg();
2597   unsigned Flags = MI.getFlags();
2598   LLT Ty = B.getMRI()->getType(Dst);
2599 
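  // exp(x) = exp2(x * log2(e)); the hardware only provides a base-2
  // exponential.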
2600   auto K = B.buildFConstant(Ty, numbers::log2e);
2601   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2602   B.buildFExp2(Dst, Mul, Flags);
2603   MI.eraseFromParent();
2604   return true;
2605 }
2606 
2607 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2608                                        MachineIRBuilder &B) const {
2609   Register Dst = MI.getOperand(0).getReg();
2610   Register Src0 = MI.getOperand(1).getReg();
2611   Register Src1 = MI.getOperand(2).getReg();
2612   unsigned Flags = MI.getFlags();
2613   LLT Ty = B.getMRI()->getType(Dst);
2614   const LLT S16 = LLT::scalar(16);
2615   const LLT S32 = LLT::scalar(32);
2616 
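  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses fmul_legacy
  // (where 0 * anything == 0), which keeps pow(1, y) == 1 even for infinite
  // or NaN exponents since log2(1) == 0.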
2617   if (Ty == S32) {
2618     auto Log = B.buildFLog2(S32, Src0, Flags);
2619     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2620       .addUse(Log.getReg(0))
2621       .addUse(Src1)
2622       .setMIFlags(Flags);
2623     B.buildFExp2(Dst, Mul, Flags);
2624   } else if (Ty == S16) {
2625     // There's no f16 fmul_legacy, so we need to convert for it.
2626     auto Log = B.buildFLog2(S16, Src0, Flags);
2627     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2628     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2629     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2630       .addUse(Ext0.getReg(0))
2631       .addUse(Ext1.getReg(0))
2632       .setMIFlags(Flags);
2633 
2634     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2635   } else
2636     return false;
2637 
2638   MI.eraseFromParent();
2639   return true;
2640 }
2641 
2642 // Find a source register, ignoring any possible source modifiers.
2643 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2644   Register ModSrc = OrigSrc;
2645   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2646     ModSrc = SrcFNeg->getOperand(1).getReg();
2647     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2648       ModSrc = SrcFAbs->getOperand(1).getReg();
2649   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2650     ModSrc = SrcFAbs->getOperand(1).getReg();
2651   return ModSrc;
2652 }
2653 
2654 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2655                                          MachineRegisterInfo &MRI,
2656                                          MachineIRBuilder &B) const {
2657 
2658   const LLT S1 = LLT::scalar(1);
2659   const LLT S64 = LLT::scalar(64);
2660   Register Dst = MI.getOperand(0).getReg();
2661   Register OrigSrc = MI.getOperand(1).getReg();
2662   unsigned Flags = MI.getFlags();
2663   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2664          "this should not have been custom lowered");
2665 
2666   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2667   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2668   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2669   // V_FRACT bug is:
2670   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2671   //
2672   // Convert floor(x) to (x - fract(x))
2673 
2674   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2675     .addUse(OrigSrc)
2676     .setMIFlags(Flags);
2677 
2678   // Give source modifier matching some assistance before obscuring a foldable
2679   // pattern.
2680 
2681   // TODO: We can avoid the neg on the fract? The input sign to fract
2682   // shouldn't matter?
2683   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2684 
2685   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2686 
2687   Register Min = MRI.createGenericVirtualRegister(S64);
2688 
2689   // We don't need to concern ourselves with the snan handling difference, so
2690   // use the one which will directly select.
2691   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2692   if (MFI->getMode().IEEE)
2693     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2694   else
2695     B.buildFMinNum(Min, Fract, Const, Flags);
2696 
2697   Register CorrectedFract = Min;
2698   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2699     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2700     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2701   }
2702 
2703   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2704   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2705 
2706   MI.eraseFromParent();
2707   return true;
2708 }
2709 
2710 // Turn an illegal packed v2s16 build vector into bit operations.
2711 // TODO: This should probably be a bitcast action in LegalizerHelper.
2712 bool AMDGPULegalizerInfo::legalizeBuildVector(
2713   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2714   Register Dst = MI.getOperand(0).getReg();
2715   const LLT S32 = LLT::scalar(32);
2716   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2717 
2718   Register Src0 = MI.getOperand(1).getReg();
2719   Register Src1 = MI.getOperand(2).getReg();
2720   assert(MRI.getType(Src0) == LLT::scalar(16));
2721 
2722   auto Merge = B.buildMerge(S32, {Src0, Src1});
2723   B.buildBitcast(Dst, Merge);
2724 
2725   MI.eraseFromParent();
2726   return true;
2727 }
2728 
2729 // Check that this is a G_XOR x, -1
2730 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2731   if (MI.getOpcode() != TargetOpcode::G_XOR)
2732     return false;
2733   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2734   return ConstVal && *ConstVal == -1;
2735 }
2736 
2737 // Return the use branch instruction, otherwise null if the usage is invalid.
2738 static MachineInstr *
2739 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2740                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2741   Register CondDef = MI.getOperand(0).getReg();
2742   if (!MRI.hasOneNonDBGUse(CondDef))
2743     return nullptr;
2744 
2745   MachineBasicBlock *Parent = MI.getParent();
2746   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2747 
2748   if (isNot(MRI, *UseMI)) {
2749     Register NegatedCond = UseMI->getOperand(0).getReg();
2750     if (!MRI.hasOneNonDBGUse(NegatedCond))
2751       return nullptr;
2752 
2753     // We're deleting the def of this value, so we need to remove it.
2754     UseMI->eraseFromParent();
2755 
2756     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2757     Negated = true;
2758   }
2759 
2760   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2761     return nullptr;
2762 
2763   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2764   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2765   if (Next == Parent->end()) {
2766     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2767     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2768       return nullptr;
2769     UncondBrTarget = &*NextMBB;
2770   } else {
2771     if (Next->getOpcode() != AMDGPU::G_BR)
2772       return nullptr;
2773     Br = &*Next;
2774     UncondBrTarget = Br->getOperand(0).getMBB();
2775   }
2776 
2777   return UseMI;
2778 }
2779 
2780 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2781                                          const ArgDescriptor *Arg,
2782                                          const TargetRegisterClass *ArgRC,
2783                                          LLT ArgTy) const {
2784   MCRegister SrcReg = Arg->getRegister();
2785   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2786   assert(DstReg.isVirtual() && "Virtual register expected");
2787 
2788   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2789                                              ArgTy);
2790   if (Arg->isMasked()) {
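    // A masked argument shares a physical register with other packed values
    // (e.g. packed work-item IDs), so shift the field down and mask off the
    // unrelated bits.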
2791     // TODO: Should we try to emit this once in the entry block?
2792     const LLT S32 = LLT::scalar(32);
2793     const unsigned Mask = Arg->getMask();
2794     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2795 
2796     Register AndMaskSrc = LiveIn;
2797 
2798     if (Shift != 0) {
2799       auto ShiftAmt = B.buildConstant(S32, Shift);
2800       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2801     }
2802 
2803     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2804   } else {
2805     B.buildCopy(DstReg, LiveIn);
2806   }
2807 
2808   return true;
2809 }
2810 
2811 bool AMDGPULegalizerInfo::loadInputValue(
2812     Register DstReg, MachineIRBuilder &B,
2813     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2814   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2815   const ArgDescriptor *Arg;
2816   const TargetRegisterClass *ArgRC;
2817   LLT ArgTy;
2818   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2819 
2820   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2821     return false; // TODO: Handle these
2822   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2823 }
2824 
2825 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2826     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2827     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2828   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2829     return false;
2830 
2831   MI.eraseFromParent();
2832   return true;
2833 }
2834 
2835 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2836                                        MachineRegisterInfo &MRI,
2837                                        MachineIRBuilder &B) const {
2838   Register Dst = MI.getOperand(0).getReg();
2839   LLT DstTy = MRI.getType(Dst);
2840   LLT S16 = LLT::scalar(16);
2841   LLT S32 = LLT::scalar(32);
2842   LLT S64 = LLT::scalar(64);
2843 
2844   if (DstTy == S16)
2845     return legalizeFDIV16(MI, MRI, B);
2846   if (DstTy == S32)
2847     return legalizeFDIV32(MI, MRI, B);
2848   if (DstTy == S64)
2849     return legalizeFDIV64(MI, MRI, B);
2850 
2851   return false;
2852 }
2853 
2854 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2855                                                         Register DstDivReg,
2856                                                         Register DstRemReg,
2857                                                         Register X,
2858                                                         Register Y) const {
2859   const LLT S1 = LLT::scalar(1);
2860   const LLT S32 = LLT::scalar(32);
2861 
2862   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2863   // algorithm used here.
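  //
  // In outline: form a float approximation of 1/Y with V_RCP_IFLAG_F32, scale
  // it into a 32-bit fixed-point reciprocal Z, refine Z with one Newton-Raphson
  // step, compute Q = umulh(X, Z) and R = X - Q * Y, then correct the quotient
  // and remainder with at most two conditional adjustments.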
2864 
2865   // Initial estimate of inv(y).
2866   auto FloatY = B.buildUITOFP(S32, Y);
2867   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2868   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2869   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2870   auto Z = B.buildFPTOUI(S32, ScaledY);
2871 
2872   // One round of UNR.
2873   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2874   auto NegYZ = B.buildMul(S32, NegY, Z);
2875   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2876 
2877   // Quotient/remainder estimate.
2878   auto Q = B.buildUMulH(S32, X, Z);
2879   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2880 
2881   // First quotient/remainder refinement.
2882   auto One = B.buildConstant(S32, 1);
2883   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2884   if (DstDivReg)
2885     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2886   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2887 
2888   // Second quotient/remainder refinement.
2889   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2890   if (DstDivReg)
2891     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2892 
2893   if (DstRemReg)
2894     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2895 }
2896 
2897 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2898 //
2899 // Return lo, hi of result
2900 //
2901 // %cvt.lo = G_UITOFP Val.lo
2902 // %cvt.hi = G_UITOFP Val.hi
2903 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2904 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2905 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2906 // %mul2 = G_FMUL %mul1, 2**(-32)
2907 // %trunc = G_INTRINSIC_TRUNC %mul2
2908 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2909 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
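//
// This yields an initial fixed-point estimate of 2^64 / Val, returned as
// {lo32, hi32}; the 64-bit division expansion below refines it further.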
2910 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2911                                                        Register Val) {
2912   const LLT S32 = LLT::scalar(32);
2913   auto Unmerge = B.buildUnmerge(S32, Val);
2914 
2915   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2916   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2917 
2918   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2919                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2920 
2921   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2922   auto Mul1 =
2923       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2924 
2925   // 2**(-32)
2926   auto Mul2 =
2927       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2928   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2929 
2930   // -(2**32)
2931   auto Mad2 = B.buildFMAD(S32, Trunc,
2932                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2933 
2934   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2935   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2936 
2937   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2938 }
2939 
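// 64-bit unsigned division/remainder expansion, following the same scheme as
// the 32-bit path: start from the ~2^64/Denom estimate from emitReciprocalU64,
// refine it with two Newton-Raphson rounds (the wide adds are built from 32-bit
// add-with-carry), take umulh(Numer, Rcp) as the quotient estimate, and correct
// the quotient and remainder with at most two conditional adjustments.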
2940 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2941                                                         Register DstDivReg,
2942                                                         Register DstRemReg,
2943                                                         Register Numer,
2944                                                         Register Denom) const {
2945   const LLT S32 = LLT::scalar(32);
2946   const LLT S64 = LLT::scalar(64);
2947   const LLT S1 = LLT::scalar(1);
2948   Register RcpLo, RcpHi;
2949 
2950   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2951 
2952   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2953 
2954   auto Zero64 = B.buildConstant(S64, 0);
2955   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2956 
2957   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2958   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2959 
2960   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2961   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2962   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2963 
2964   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2965   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2966   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2967   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2968 
2969   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2970   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2971   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2972   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2973   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2974 
2975   auto Zero32 = B.buildConstant(S32, 0);
2976   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2977   auto Add2_HiC =
2978       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2979   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2980   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2981 
2982   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2983   Register NumerLo = UnmergeNumer.getReg(0);
2984   Register NumerHi = UnmergeNumer.getReg(1);
2985 
2986   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2987   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2988   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2989   Register Mul3_Lo = UnmergeMul3.getReg(0);
2990   Register Mul3_Hi = UnmergeMul3.getReg(1);
2991   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2992   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2993   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2994   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2995 
2996   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2997   Register DenomLo = UnmergeDenom.getReg(0);
2998   Register DenomHi = UnmergeDenom.getReg(1);
2999 
3000   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3001   auto C1 = B.buildSExt(S32, CmpHi);
3002 
3003   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3004   auto C2 = B.buildSExt(S32, CmpLo);
3005 
3006   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3007   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3008 
3009   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3010   // Currently the control flow is unconditional and we have 4 selects after
3011   // the potential endif to substitute for PHIs.
3012 
3013   // if C3 != 0 ...
3014   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3015   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3016   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3017   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3018 
3019   auto One64 = B.buildConstant(S64, 1);
3020   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3021 
3022   auto C4 =
3023       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3024   auto C5 =
3025       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3026   auto C6 = B.buildSelect(
3027       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3028 
3029   // if (C6 != 0)
3030   auto Add4 = B.buildAdd(S64, Add3, One64);
3031   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3032 
3033   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3034   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3035   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3036 
3037   // endif C6
3038   // endif C3
3039 
3040   if (DstDivReg) {
3041     auto Sel1 = B.buildSelect(
3042         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3043     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3044                   Sel1, MulHi3);
3045   }
3046 
3047   if (DstRemReg) {
3048     auto Sel2 = B.buildSelect(
3049         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3050     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3051                   Sel2, Sub1);
3052   }
3053 }
3054 
3055 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3056                                                   MachineRegisterInfo &MRI,
3057                                                   MachineIRBuilder &B) const {
3058   Register DstDivReg, DstRemReg;
3059   switch (MI.getOpcode()) {
3060   default:
3061     llvm_unreachable("Unexpected opcode!");
3062   case AMDGPU::G_UDIV: {
3063     DstDivReg = MI.getOperand(0).getReg();
3064     break;
3065   }
3066   case AMDGPU::G_UREM: {
3067     DstRemReg = MI.getOperand(0).getReg();
3068     break;
3069   }
3070   case AMDGPU::G_UDIVREM: {
3071     DstDivReg = MI.getOperand(0).getReg();
3072     DstRemReg = MI.getOperand(1).getReg();
3073     break;
3074   }
3075   }
3076 
3077   const LLT S64 = LLT::scalar(64);
3078   const LLT S32 = LLT::scalar(32);
3079   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3080   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3081   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3082   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3083 
3084   if (Ty == S32)
3085     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3086   else if (Ty == S64)
3087     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3088   else
3089     return false;
3090 
3091   MI.eraseFromParent();
3092   return true;
3093 }
3094 
3095 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3096                                                 MachineRegisterInfo &MRI,
3097                                                 MachineIRBuilder &B) const {
3098   const LLT S64 = LLT::scalar(64);
3099   const LLT S32 = LLT::scalar(32);
3100 
3101   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3102   if (Ty != S32 && Ty != S64)
3103     return false;
3104 
3105   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3106   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3107   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3108 
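  // Compute |LHS| and |RHS| with the sign-mask trick: sign = x >> (bits - 1),
  // |x| = (x + sign) ^ sign. The unsigned expansion is then run on the absolute
  // values and the results are sign-corrected below.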
3109   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3110   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3111   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3112 
3113   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3114   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3115 
3116   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3117   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3118 
3119   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3120   switch (MI.getOpcode()) {
3121   default:
3122     llvm_unreachable("Unexpected opcode!");
3123   case AMDGPU::G_SDIV: {
3124     DstDivReg = MI.getOperand(0).getReg();
3125     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3126     break;
3127   }
3128   case AMDGPU::G_SREM: {
3129     DstRemReg = MI.getOperand(0).getReg();
3130     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3131     break;
3132   }
3133   case AMDGPU::G_SDIVREM: {
3134     DstDivReg = MI.getOperand(0).getReg();
3135     DstRemReg = MI.getOperand(1).getReg();
3136     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3137     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3138     break;
3139   }
3140   }
3141 
3142   if (Ty == S32)
3143     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3144   else
3145     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3146 
3147   if (DstDivReg) {
3148     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3149     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3150     B.buildSub(DstDivReg, SignXor, Sign);
3151   }
3152 
3153   if (DstRemReg) {
3154     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3155     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3156     B.buildSub(DstRemReg, SignXor, Sign);
3157   }
3158 
3159   MI.eraseFromParent();
3160   return true;
3161 }
3162 
3163 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3164                                                  MachineRegisterInfo &MRI,
3165                                                  MachineIRBuilder &B) const {
3166   Register Res = MI.getOperand(0).getReg();
3167   Register LHS = MI.getOperand(1).getReg();
3168   Register RHS = MI.getOperand(2).getReg();
3169   uint16_t Flags = MI.getFlags();
3170   LLT ResTy = MRI.getType(Res);
3171 
3172   const MachineFunction &MF = B.getMF();
3173   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3174                             MI.getFlag(MachineInstr::FmAfn);
3175 
3176   if (!AllowInaccurateRcp)
3177     return false;
3178 
3179   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3180     // 1 / x -> RCP(x)
3181     if (CLHS->isExactlyValue(1.0)) {
3182       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3183         .addUse(RHS)
3184         .setMIFlags(Flags);
3185 
3186       MI.eraseFromParent();
3187       return true;
3188     }
3189 
3190     // -1 / x -> RCP( FNEG(x) )
3191     if (CLHS->isExactlyValue(-1.0)) {
3192       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3193       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3194         .addUse(FNeg.getReg(0))
3195         .setMIFlags(Flags);
3196 
3197       MI.eraseFromParent();
3198       return true;
3199     }
3200   }
3201 
3202   // x / y -> x * (1.0 / y)
3203   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3204     .addUse(RHS)
3205     .setMIFlags(Flags);
3206   B.buildFMul(Res, LHS, RCP, Flags);
3207 
3208   MI.eraseFromParent();
3209   return true;
3210 }
3211 
3212 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3213                                                    MachineRegisterInfo &MRI,
3214                                                    MachineIRBuilder &B) const {
3215   Register Res = MI.getOperand(0).getReg();
3216   Register X = MI.getOperand(1).getReg();
3217   Register Y = MI.getOperand(2).getReg();
3218   uint16_t Flags = MI.getFlags();
3219   LLT ResTy = MRI.getType(Res);
3220 
3221   const MachineFunction &MF = B.getMF();
3222   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3223                             MI.getFlag(MachineInstr::FmAfn);
3224 
3225   if (!AllowInaccurateRcp)
3226     return false;
3227 
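  // x / y ~= x * rcp(y): refine the reciprocal with two Newton-Raphson
  // iterations (r = r + r * (1 - y * r), expressed with FMAs), then apply one
  // final FMA-based correction to the product x * r.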
3228   auto NegY = B.buildFNeg(ResTy, Y);
3229   auto One = B.buildFConstant(ResTy, 1.0);
3230 
3231   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3232     .addUse(Y)
3233     .setMIFlags(Flags);
3234 
3235   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3236   R = B.buildFMA(ResTy, Tmp0, R, R);
3237 
3238   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3239   R = B.buildFMA(ResTy, Tmp1, R, R);
3240 
3241   auto Ret = B.buildFMul(ResTy, X, R);
3242   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3243 
3244   B.buildFMA(Res, Tmp2, R, Ret);
3245   MI.eraseFromParent();
3246   return true;
3247 }
3248 
3249 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3250                                          MachineRegisterInfo &MRI,
3251                                          MachineIRBuilder &B) const {
3252   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3253     return true;
3254 
3255   Register Res = MI.getOperand(0).getReg();
3256   Register LHS = MI.getOperand(1).getReg();
3257   Register RHS = MI.getOperand(2).getReg();
3258 
3259   uint16_t Flags = MI.getFlags();
3260 
3261   LLT S16 = LLT::scalar(16);
3262   LLT S32 = LLT::scalar(32);
3263 
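  // f16 fdiv is expanded by extending both operands to f32, computing
  // lhs * rcp(rhs) at f32 precision, truncating back to f16, and letting
  // amdgcn.div.fixup handle the special cases.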
3264   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3265   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3266 
3267   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3268     .addUse(RHSExt.getReg(0))
3269     .setMIFlags(Flags);
3270 
3271   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3272   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3273 
3274   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3275     .addUse(RDst.getReg(0))
3276     .addUse(RHS)
3277     .addUse(LHS)
3278     .setMIFlags(Flags);
3279 
3280   MI.eraseFromParent();
3281   return true;
3282 }
3283 
3284 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3285 // to enable denorm mode; when it is false, emit instructions to disable it.
3286 static void toggleSPDenormMode(bool Enable,
3287                                MachineIRBuilder &B,
3288                                const GCNSubtarget &ST,
3289                                AMDGPU::SIModeRegisterDefaults Mode) {
3290   // Set SP denorm mode to this value.
3291   unsigned SPDenormMode =
3292     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3293 
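  // With S_DENORM_MODE the 4-bit immediate holds the FP32 denorm mode in bits
  // [1:0] and the FP64/FP16 mode in bits [3:2]; otherwise the 2-bit FP32 denorm
  // field of the MODE register (bits [5:4]) is written with S_SETREG.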
3294   if (ST.hasDenormModeInst()) {
3295     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3296     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3297 
3298     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3299     B.buildInstr(AMDGPU::S_DENORM_MODE)
3300       .addImm(NewDenormModeValue);
3301 
3302   } else {
3303     // Select FP32 bit field in mode register.
3304     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3305                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3306                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3307 
3308     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3309       .addImm(SPDenormMode)
3310       .addImm(SPDenormModeBitField);
3311   }
3312 }
3313 
3314 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3315                                          MachineRegisterInfo &MRI,
3316                                          MachineIRBuilder &B) const {
3317   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3318     return true;
3319 
3320   Register Res = MI.getOperand(0).getReg();
3321   Register LHS = MI.getOperand(1).getReg();
3322   Register RHS = MI.getOperand(2).getReg();
3323   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3324   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3325 
3326   uint16_t Flags = MI.getFlags();
3327 
3328   LLT S32 = LLT::scalar(32);
3329   LLT S1 = LLT::scalar(1);
3330 
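  // Full f32 fdiv expansion: scale the operands with amdgcn.div.scale, refine
  // an approximate reciprocal of the scaled denominator with a chain of FMAs
  // (temporarily enabling FP32 denormals if they are off), combine the pieces
  // with amdgcn.div.fmas, and post-process with amdgcn.div.fixup.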
3331   auto One = B.buildFConstant(S32, 1.0f);
3332 
3333   auto DenominatorScaled =
3334     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3335       .addUse(LHS)
3336       .addUse(RHS)
3337       .addImm(0)
3338       .setMIFlags(Flags);
3339   auto NumeratorScaled =
3340     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3341       .addUse(LHS)
3342       .addUse(RHS)
3343       .addImm(1)
3344       .setMIFlags(Flags);
3345 
3346   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3347     .addUse(DenominatorScaled.getReg(0))
3348     .setMIFlags(Flags);
3349   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3350 
3351   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3352   // aren't modeled as reading it.
3353   if (!Mode.allFP32Denormals())
3354     toggleSPDenormMode(true, B, ST, Mode);
3355 
3356   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3357   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3358   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3359   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3360   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3361   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3362 
3363   if (!Mode.allFP32Denormals())
3364     toggleSPDenormMode(false, B, ST, Mode);
3365 
3366   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3367     .addUse(Fma4.getReg(0))
3368     .addUse(Fma1.getReg(0))
3369     .addUse(Fma3.getReg(0))
3370     .addUse(NumeratorScaled.getReg(1))
3371     .setMIFlags(Flags);
3372 
3373   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3374     .addUse(Fmas.getReg(0))
3375     .addUse(RHS)
3376     .addUse(LHS)
3377     .setMIFlags(Flags);
3378 
3379   MI.eraseFromParent();
3380   return true;
3381 }
3382 
3383 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3384                                          MachineRegisterInfo &MRI,
3385                                          MachineIRBuilder &B) const {
3386   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3387     return true;
3388 
3389   Register Res = MI.getOperand(0).getReg();
3390   Register LHS = MI.getOperand(1).getReg();
3391   Register RHS = MI.getOperand(2).getReg();
3392 
3393   uint16_t Flags = MI.getFlags();
3394 
3395   LLT S64 = LLT::scalar(64);
3396   LLT S1 = LLT::scalar(1);
3397 
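  // f64 fdiv follows the same div.scale / FMA refinement / div.fmas /
  // div.fixup structure as the f32 path, without the denorm mode toggling.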
3398   auto One = B.buildFConstant(S64, 1.0);
3399 
3400   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3401     .addUse(LHS)
3402     .addUse(RHS)
3403     .addImm(0)
3404     .setMIFlags(Flags);
3405 
3406   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3407 
3408   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3409     .addUse(DivScale0.getReg(0))
3410     .setMIFlags(Flags);
3411 
3412   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3413   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3414   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3415 
3416   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3417     .addUse(LHS)
3418     .addUse(RHS)
3419     .addImm(1)
3420     .setMIFlags(Flags);
3421 
3422   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3423   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3424   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3425 
3426   Register Scale;
3427   if (!ST.hasUsableDivScaleConditionOutput()) {
3428     // Workaround a hardware bug on SI where the condition output from div_scale
3429     // is not usable.
3430 
3431     LLT S32 = LLT::scalar(32);
3432 
3433     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3434     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3435     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3436     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3437 
3438     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3439                               Scale1Unmerge.getReg(1));
3440     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3441                               Scale0Unmerge.getReg(1));
3442     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3443   } else {
3444     Scale = DivScale1.getReg(1);
3445   }
3446 
3447   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3448     .addUse(Fma4.getReg(0))
3449     .addUse(Fma3.getReg(0))
3450     .addUse(Mul.getReg(0))
3451     .addUse(Scale)
3452     .setMIFlags(Flags);
3453 
3454   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3455     .addUse(Fmas.getReg(0))
3456     .addUse(RHS)
3457     .addUse(LHS)
3458     .setMIFlags(Flags);
3459 
3460   MI.eraseFromParent();
3461   return true;
3462 }
3463 
3464 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3465                                                  MachineRegisterInfo &MRI,
3466                                                  MachineIRBuilder &B) const {
3467   Register Res = MI.getOperand(0).getReg();
3468   Register LHS = MI.getOperand(2).getReg();
3469   Register RHS = MI.getOperand(3).getReg();
3470   uint16_t Flags = MI.getFlags();
3471 
3472   LLT S32 = LLT::scalar(32);
3473   LLT S1 = LLT::scalar(1);
3474 
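  // If |rhs| is larger than 0x1p+96, scale the denominator down by 0x1p-32
  // before taking its reciprocal so the rcp result stays out of the denormal
  // range; multiplying the final product by the same select value undoes the
  // scaling (the scale is 1.0 otherwise).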
3475   auto Abs = B.buildFAbs(S32, RHS, Flags);
3476   const APFloat C0Val(1.0f);
3477 
3478   auto C0 = B.buildConstant(S32, 0x6f800000);
3479   auto C1 = B.buildConstant(S32, 0x2f800000);
3480   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3481 
3482   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3483   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3484 
3485   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3486 
3487   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3488     .addUse(Mul0.getReg(0))
3489     .setMIFlags(Flags);
3490 
3491   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3492 
3493   B.buildFMul(Res, Sel, Mul1, Flags);
3494 
3495   MI.eraseFromParent();
3496   return true;
3497 }
3498 
3499 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3500 // FIXME: Why do we handle this one but not other removed instructions?
3501 //
3502 // Reciprocal square root.  The clamp prevents infinite results, clamping
3503 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3504 // +-max_float.
3505 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3506                                                     MachineRegisterInfo &MRI,
3507                                                     MachineIRBuilder &B) const {
3508   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3509     return true;
3510 
3511   Register Dst = MI.getOperand(0).getReg();
3512   Register Src = MI.getOperand(2).getReg();
3513   auto Flags = MI.getFlags();
3514 
3515   LLT Ty = MRI.getType(Dst);
3516 
3517   const fltSemantics *FltSemantics;
3518   if (Ty == LLT::scalar(32))
3519     FltSemantics = &APFloat::IEEEsingle();
3520   else if (Ty == LLT::scalar(64))
3521     FltSemantics = &APFloat::IEEEdouble();
3522   else
3523     return false;
3524 
3525   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3526     .addUse(Src)
3527     .setMIFlags(Flags);
3528 
3529   // We don't need to concern ourselves with the snan handling difference; the
3530   // rsq has already quieted (or not), so use the variant that directly selects.
3531   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3532   const bool UseIEEE = MFI->getMode().IEEE;
3533 
3534   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3535   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3536                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3537 
3538   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3539 
3540   if (UseIEEE)
3541     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3542   else
3543     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3544   MI.eraseFromParent();
3545   return true;
3546 }
3547 
3548 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3549   switch (IID) {
3550   case Intrinsic::amdgcn_ds_fadd:
3551     return AMDGPU::G_ATOMICRMW_FADD;
3552   case Intrinsic::amdgcn_ds_fmin:
3553     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3554   case Intrinsic::amdgcn_ds_fmax:
3555     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3556   default:
3557     llvm_unreachable("not a DS FP intrinsic");
3558   }
3559 }
3560 
3561 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3562                                                       MachineInstr &MI,
3563                                                       Intrinsic::ID IID) const {
3564   GISelChangeObserver &Observer = Helper.Observer;
3565   Observer.changingInstr(MI);
3566 
3567   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3568 
3569   // The remaining operands were used to set fields in the MemOperand on
3570   // construction.
3571   for (int I = 6; I > 3; --I)
3572     MI.RemoveOperand(I);
3573 
3574   MI.RemoveOperand(1); // Remove the intrinsic ID.
3575   Observer.changedInstr(MI);
3576   return true;
3577 }
3578 
3579 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3580                                             MachineRegisterInfo &MRI,
3581                                             MachineIRBuilder &B) const {
3582   uint64_t Offset =
3583     ST.getTargetLowering()->getImplicitParameterOffset(
3584       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
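  // The implicit arguments live directly after the explicit kernel arguments,
  // so the pointer is the kernarg segment pointer plus this fixed offset.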
3585   LLT DstTy = MRI.getType(DstReg);
3586   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3587 
3588   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3589   if (!loadInputValue(KernargPtrReg, B,
3590                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3591     return false;
3592 
3593   // FIXME: This should be nuw
3594   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3595   return true;
3596 }
3597 
3598 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3599                                                  MachineRegisterInfo &MRI,
3600                                                  MachineIRBuilder &B) const {
3601   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3602   if (!MFI->isEntryFunction()) {
3603     return legalizePreloadedArgIntrin(MI, MRI, B,
3604                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3605   }
3606 
3607   Register DstReg = MI.getOperand(0).getReg();
3608   if (!getImplicitArgPtr(DstReg, MRI, B))
3609     return false;
3610 
3611   MI.eraseFromParent();
3612   return true;
3613 }
3614 
3615 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3616                                               MachineRegisterInfo &MRI,
3617                                               MachineIRBuilder &B,
3618                                               unsigned AddrSpace) const {
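  // The pointer is in the given segment iff the high 32 bits of the 64-bit
  // address equal the segment's aperture base, so compare them against the
  // aperture register.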
3619   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3620   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3621   Register Hi32 = Unmerge.getReg(1);
3622 
3623   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3624   MI.eraseFromParent();
3625   return true;
3626 }
3627 
3628 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3629 // offset (the offset that is included in bounds checking and swizzling, to be
3630 // split between the instruction's voffset and immoffset fields) and soffset
3631 // (the offset that is excluded from bounds checking and swizzling, to go in
3632 // the instruction's soffset field).  This function takes the first kind of
3633 // offset and figures out how to split it between voffset and immoffset.
3634 std::tuple<Register, unsigned, unsigned>
3635 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3636                                         Register OrigOffset) const {
3637   const unsigned MaxImm = 4095;
3638   Register BaseReg;
3639   unsigned TotalConstOffset;
3640   const LLT S32 = LLT::scalar(32);
3641   MachineRegisterInfo &MRI = *B.getMRI();
3642 
3643   std::tie(BaseReg, TotalConstOffset) =
3644       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3645 
3646   unsigned ImmOffset = TotalConstOffset;
3647 
3648   // If BaseReg is a pointer, convert it to int.
3649   if (MRI.getType(BaseReg).isPointer())
3650     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3651 
3652   // If the immediate value is too big for the immoffset field, keep only the
3653   // low 12 bits in the immoffset field and fold the remaining multiple of 4096
3654   // into the value that is copied/added for the voffset field, so that it
3655   // stands more chance of being CSEd with the copy/add for another similar
3656   // load/store. However, do not do that rounding down to a multiple of 4096 if
3657   // that is a negative number, as it appears to be illegal to have a negative
3658   // offset in the vgpr, even if adding the immediate offset makes it positive.
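  // For example, a constant offset of 5000 becomes ImmOffset = 904, with 4096
  // folded into the voffset add.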
3659   unsigned Overflow = ImmOffset & ~MaxImm;
3660   ImmOffset -= Overflow;
3661   if ((int32_t)Overflow < 0) {
3662     Overflow += ImmOffset;
3663     ImmOffset = 0;
3664   }
3665 
3666   if (Overflow != 0) {
3667     if (!BaseReg) {
3668       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3669     } else {
3670       auto OverflowVal = B.buildConstant(S32, Overflow);
3671       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3672     }
3673   }
3674 
3675   if (!BaseReg)
3676     BaseReg = B.buildConstant(S32, 0).getReg(0);
3677 
3678   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3679 }
3680 
3681 /// Handle register layout difference for f16 images for some subtargets.
3682 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3683                                              MachineRegisterInfo &MRI,
3684                                              Register Reg,
3685                                              bool ImageStore) const {
3686   const LLT S16 = LLT::scalar(16);
3687   const LLT S32 = LLT::scalar(32);
3688   LLT StoreVT = MRI.getType(Reg);
3689   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3690 
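  // With unpacked d16 VMEM each 16-bit element must occupy the low half of its
  // own 32-bit register, so any-extend every element to s32.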
3691   if (ST.hasUnpackedD16VMem()) {
3692     auto Unmerge = B.buildUnmerge(S16, Reg);
3693 
3694     SmallVector<Register, 4> WideRegs;
3695     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3696       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3697 
3698     int NumElts = StoreVT.getNumElements();
3699 
3700     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3701   }
3702 
3703   if (ImageStore && ST.hasImageStoreD16Bug()) {
3704     if (StoreVT.getNumElements() == 2) {
3705       SmallVector<Register, 4> PackedRegs;
3706       Reg = B.buildBitcast(S32, Reg).getReg(0);
3707       PackedRegs.push_back(Reg);
3708       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3709       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3710     }
3711 
3712     if (StoreVT.getNumElements() == 3) {
3713       SmallVector<Register, 4> PackedRegs;
3714       auto Unmerge = B.buildUnmerge(S16, Reg);
3715       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3716         PackedRegs.push_back(Unmerge.getReg(I));
3717       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3718       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3719       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3720     }
3721 
3722     if (StoreVT.getNumElements() == 4) {
3723       SmallVector<Register, 4> PackedRegs;
3724       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3725       auto Unmerge = B.buildUnmerge(S32, Reg);
3726       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3727         PackedRegs.push_back(Unmerge.getReg(I));
3728       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3729       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3730     }
3731 
3732     llvm_unreachable("invalid data type");
3733   }
3734 
3735   return Reg;
3736 }
3737 
3738 Register AMDGPULegalizerInfo::fixStoreSourceType(
3739   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3740   MachineRegisterInfo *MRI = B.getMRI();
3741   LLT Ty = MRI->getType(VData);
3742 
3743   const LLT S16 = LLT::scalar(16);
3744 
3745   // Fixup illegal register types for i8 stores.
3746   if (Ty == LLT::scalar(8) || Ty == S16) {
3747     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3748     return AnyExt;
3749   }
3750 
3751   if (Ty.isVector()) {
3752     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3753       if (IsFormat)
3754         return handleD16VData(B, *MRI, VData);
3755     }
3756   }
3757 
3758   return VData;
3759 }
3760 
3761 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3762                                               MachineRegisterInfo &MRI,
3763                                               MachineIRBuilder &B,
3764                                               bool IsTyped,
3765                                               bool IsFormat) const {
3766   Register VData = MI.getOperand(1).getReg();
3767   LLT Ty = MRI.getType(VData);
3768   LLT EltTy = Ty.getScalarType();
3769   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3770   const LLT S32 = LLT::scalar(32);
3771 
3772   VData = fixStoreSourceType(B, VData, IsFormat);
3773   Register RSrc = MI.getOperand(2).getReg();
3774 
3775   MachineMemOperand *MMO = *MI.memoperands_begin();
3776   const int MemSize = MMO->getSize();
3777 
3778   unsigned ImmOffset;
3779   unsigned TotalOffset;
3780 
3781   // The typed intrinsics add an immediate after the registers.
3782   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3783 
3784   // The struct intrinsic variants add one additional operand over raw.
3785   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3786   Register VIndex;
3787   int OpOffset = 0;
3788   if (HasVIndex) {
3789     VIndex = MI.getOperand(3).getReg();
3790     OpOffset = 1;
3791   }
3792 
3793   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3794   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3795 
3796   unsigned Format = 0;
3797   if (IsTyped) {
3798     Format = MI.getOperand(5 + OpOffset).getImm();
3799     ++OpOffset;
3800   }
3801 
3802   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3803 
3804   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3805   if (TotalOffset != 0)
3806     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3807 
3808   unsigned Opc;
3809   if (IsTyped) {
3810     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3811                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3812   } else if (IsFormat) {
3813     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3814                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3815   } else {
3816     switch (MemSize) {
3817     case 1:
3818       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3819       break;
3820     case 2:
3821       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3822       break;
3823     default:
3824       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3825       break;
3826     }
3827   }
3828 
3829   if (!VIndex)
3830     VIndex = B.buildConstant(S32, 0).getReg(0);
3831 
3832   auto MIB = B.buildInstr(Opc)
3833     .addUse(VData)              // vdata
3834     .addUse(RSrc)               // rsrc
3835     .addUse(VIndex)             // vindex
3836     .addUse(VOffset)            // voffset
3837     .addUse(SOffset)            // soffset
3838     .addImm(ImmOffset);         // offset(imm)
3839 
3840   if (IsTyped)
3841     MIB.addImm(Format);
3842 
3843   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3844      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3845      .addMemOperand(MMO);
3846 
3847   MI.eraseFromParent();
3848   return true;
3849 }
3850 
3851 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3852                                              MachineRegisterInfo &MRI,
3853                                              MachineIRBuilder &B,
3854                                              bool IsFormat,
3855                                              bool IsTyped) const {
3856   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3857   MachineMemOperand *MMO = *MI.memoperands_begin();
3858   const int MemSize = MMO->getSize();
3859   const LLT S32 = LLT::scalar(32);
3860 
3861   Register Dst = MI.getOperand(0).getReg();
3862   Register RSrc = MI.getOperand(2).getReg();
3863 
3864   // The typed intrinsics add an immediate after the registers.
3865   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3866 
3867   // The struct intrinsic variants add one additional operand over raw.
3868   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3869   Register VIndex;
3870   int OpOffset = 0;
3871   if (HasVIndex) {
3872     VIndex = MI.getOperand(3).getReg();
3873     OpOffset = 1;
3874   }
3875 
3876   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3877   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3878 
3879   unsigned Format = 0;
3880   if (IsTyped) {
3881     Format = MI.getOperand(5 + OpOffset).getImm();
3882     ++OpOffset;
3883   }
3884 
3885   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3886   unsigned ImmOffset;
3887   unsigned TotalOffset;
3888 
3889   LLT Ty = MRI.getType(Dst);
3890   LLT EltTy = Ty.getScalarType();
3891   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3892   const bool Unpacked = ST.hasUnpackedD16VMem();
3893 
3894   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3895   if (TotalOffset != 0)
3896     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3897 
3898   unsigned Opc;
3899 
3900   if (IsTyped) {
3901     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3902                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3903   } else if (IsFormat) {
3904     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3905                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3906   } else {
3907     switch (MemSize) {
3908     case 1:
3909       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3910       break;
3911     case 2:
3912       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3913       break;
3914     default:
3915       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3916       break;
3917     }
3918   }
3919 
3920   Register LoadDstReg;
3921 
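  // Sub-dword non-d16 loads and scalar d16 loads are widened to a 32-bit result
  // and truncated afterwards; packed d16 vectors on unpacked subtargets are
  // loaded as a 32-bit-per-element vector and repacked below.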
3922   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3923   LLT UnpackedTy = Ty.changeElementSize(32);
3924 
3925   if (IsExtLoad)
3926     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3927   else if (Unpacked && IsD16 && Ty.isVector())
3928     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3929   else
3930     LoadDstReg = Dst;
3931 
3932   if (!VIndex)
3933     VIndex = B.buildConstant(S32, 0).getReg(0);
3934 
3935   auto MIB = B.buildInstr(Opc)
3936     .addDef(LoadDstReg)         // vdata
3937     .addUse(RSrc)               // rsrc
3938     .addUse(VIndex)             // vindex
3939     .addUse(VOffset)            // voffset
3940     .addUse(SOffset)            // soffset
3941     .addImm(ImmOffset);         // offset(imm)
3942 
3943   if (IsTyped)
3944     MIB.addImm(Format);
3945 
3946   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3947      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3948      .addMemOperand(MMO);
3949 
3950   if (LoadDstReg != Dst) {
3951     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3952 
3953     // The result of an extending load was widened to 32 bits; truncate it back.
3954     if (IsExtLoad)
3955       B.buildTrunc(Dst, LoadDstReg);
3956     else {
3957       // Repack to original 16-bit vector result
3958       // FIXME: G_TRUNC should work, but legalization currently fails
3959       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3960       SmallVector<Register, 4> Repack;
3961       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3962         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3963       B.buildMerge(Dst, Repack);
3964     }
3965   }
3966 
3967   MI.eraseFromParent();
3968   return true;
3969 }
3970 
3971 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3972                                                MachineIRBuilder &B,
3973                                                bool IsInc) const {
3974   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3975                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3976   B.buildInstr(Opc)
3977     .addDef(MI.getOperand(0).getReg())
3978     .addUse(MI.getOperand(2).getReg())
3979     .addUse(MI.getOperand(3).getReg())
3980     .cloneMemRefs(MI);
3981   MI.eraseFromParent();
3982   return true;
3983 }
3984 
3985 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3986   switch (IntrID) {
3987   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3988   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3989     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3990   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3991   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3992     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3993   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3994   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3995     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3996   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3997   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3998     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3999   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4000   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4001     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4002   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4003   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4004     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4005   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4006   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4007     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4008   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4009   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4011   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4012   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4013     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4014   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4015   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4016     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4017   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4019     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4020   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4021   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4022     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4023   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4024   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4025     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4026   case Intrinsic::amdgcn_buffer_atomic_fadd:
4027   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4028   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4029     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4030   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4031   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4032     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4033   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4034   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4035     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4036   default:
4037     llvm_unreachable("unhandled atomic opcode");
4038   }
4039 }
4040 
4041 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4042                                                MachineIRBuilder &B,
4043                                                Intrinsic::ID IID) const {
4044   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4045                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4046   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4047 
4048   Register Dst;
4049 
4050   int OpOffset = 0;
4051   if (HasReturn) {
4052     // A few FP atomics do not support return values.
4053     Dst = MI.getOperand(0).getReg();
4054   } else {
4055     OpOffset = -1;
4056   }
4057 
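  // Operand layout after the intrinsic ID: vdata, [cmp (cmpswap only)], rsrc,
  // [vindex (struct variants only)], voffset, soffset, cachepolicy immediate.
  // OpOffset tracks which optional operands are present.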
4058   Register VData = MI.getOperand(2 + OpOffset).getReg();
4059   Register CmpVal;
4060 
4061   if (IsCmpSwap) {
4062     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4063     ++OpOffset;
4064   }
4065 
4066   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4067   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4068 
4069   // The struct intrinsic variants add one additional operand over raw.
4070   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4071   Register VIndex;
4072   if (HasVIndex) {
4073     VIndex = MI.getOperand(4 + OpOffset).getReg();
4074     ++OpOffset;
4075   }
4076 
4077   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4078   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4079   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4080 
4081   MachineMemOperand *MMO = *MI.memoperands_begin();
4082 
4083   unsigned ImmOffset;
4084   unsigned TotalOffset;
4085   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4086   if (TotalOffset != 0)
4087     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4088 
4089   if (!VIndex)
4090     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4091 
4092   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4093 
4094   if (HasReturn)
4095     MIB.addDef(Dst);
4096 
4097   MIB.addUse(VData); // vdata
4098 
4099   if (IsCmpSwap)
4100     MIB.addReg(CmpVal);
4101 
4102   MIB.addUse(RSrc)               // rsrc
4103      .addUse(VIndex)             // vindex
4104      .addUse(VOffset)            // voffset
4105      .addUse(SOffset)            // soffset
4106      .addImm(ImmOffset)          // offset(imm)
4107      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4108      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4109      .addMemOperand(MMO);
4110 
4111   MI.eraseFromParent();
4112   return true;
4113 }
4114 
4115 /// Pack the s16 typed address and gradient operands of \p MI into dword sized
4116 /// <2 x s16> vectors, appending the results to \p PackedAddrs.
4117 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4118                                       SmallVectorImpl<Register> &PackedAddrs,
4119                                       unsigned ArgOffset,
4120                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4121                                       bool IsA16, bool IsG16) {
4122   const LLT S16 = LLT::scalar(16);
4123   const LLT V2S16 = LLT::vector(2, 16);
4124   auto EndIdx = Intr->VAddrEnd;
4125 
4126   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4127     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4128     if (!SrcOp.isReg())
4129       continue; // _L to _LZ may have eliminated this.
4130 
4131     Register AddrReg = SrcOp.getReg();
4132 
4133     if (I < Intr->GradientStart) {
4134       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4135       PackedAddrs.push_back(AddrReg);
4136     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4137                (I >= Intr->CoordStart && !IsA16)) {
4138       // Handle any gradient or coordinate operands that should not be packed
4139       PackedAddrs.push_back(AddrReg);
4140     } else {
4141       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4142       // derivatives dx/dh and dx/dv are packed with undef.
4143       if (((I + 1) >= EndIdx) ||
4144           ((Intr->NumGradients / 2) % 2 == 1 &&
4145            (I == static_cast<unsigned>(Intr->GradientStart +
4146                                        (Intr->NumGradients / 2) - 1) ||
4147             I == static_cast<unsigned>(Intr->GradientStart +
4148                                        Intr->NumGradients - 1))) ||
4149           // Check for _L to _LZ optimization
4150           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4151         PackedAddrs.push_back(
4152             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4153                 .getReg(0));
4154       } else {
4155         PackedAddrs.push_back(
4156             B.buildBuildVector(
4157                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4158                 .getReg(0));
4159         ++I;
4160       }
4161     }
4162   }
4163 }
4164 
4165 /// Convert from separate vaddr components to a single vector address register,
4166 /// and replace the remaining operands with $noreg.
4167 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4168                                      int DimIdx, int NumVAddrs) {
4169   const LLT S32 = LLT::scalar(32);
4170 
4171   SmallVector<Register, 8> AddrRegs;
4172   for (int I = 0; I != NumVAddrs; ++I) {
4173     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4174     if (SrcOp.isReg()) {
4175       AddrRegs.push_back(SrcOp.getReg());
4176       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4177     }
4178   }
4179 
4180   int NumAddrRegs = AddrRegs.size();
4181   if (NumAddrRegs != 1) {
4182     // Round up to 8 elements for v5-v7
4183     // FIXME: Missing intermediate sized register classes and instructions.
4184     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4185       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4186       auto Undef = B.buildUndef(S32);
4187       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4188       NumAddrRegs = RoundedNumRegs;
4189     }
4190 
4191     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4192     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4193   }
4194 
4195   for (int I = 1; I != NumVAddrs; ++I) {
4196     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4197     if (SrcOp.isReg())
4198       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4199   }
4200 }
4201 
4202 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4203 ///
4204 /// Depending on the subtarget, load/store with 16-bit element data need to be
4205 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4206 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4207 /// registers.
4208 ///
4209 /// We don't want to directly select image instructions just yet, but we also
4210 /// want to expose all register repacking to the legalizer/combiners. We also
4211 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4212 /// defining a multitude of intermediate image instructions, directly hack on
4213 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4214 /// padding now unnecessary arguments with $noreg.
4215 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4216     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4217     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4218 
4219   const unsigned NumDefs = MI.getNumExplicitDefs();
4220   const unsigned ArgOffset = NumDefs + 1;
4221   bool IsTFE = NumDefs == 2;
4222   // We are only processing the operands of d16 image operations on subtargets
4223   // that use the unpacked register layout, or need to repack the TFE result.
4224 
4225   // TODO: Do we need to guard against already legalized intrinsics?
4226   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4227       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4228 
4229   MachineRegisterInfo *MRI = B.getMRI();
4230   const LLT S32 = LLT::scalar(32);
4231   const LLT S16 = LLT::scalar(16);
4232   const LLT V2S16 = LLT::vector(2, 16);
4233 
4234   unsigned DMask = 0;
4235 
4236   // Check for 16 bit addresses and pack if true.
4237   LLT GradTy =
4238       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4239   LLT AddrTy =
4240       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4241   const bool IsG16 = GradTy == S16;
4242   const bool IsA16 = AddrTy == S16;
4243 
4244   int DMaskLanes = 0;
4245   if (!BaseOpcode->Atomic) {
4246     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4247     if (BaseOpcode->Gather4) {
4248       DMaskLanes = 4;
4249     } else if (DMask != 0) {
4250       DMaskLanes = countPopulation(DMask);
4251     } else if (!IsTFE && !BaseOpcode->Store) {
4252       // If dmask is 0, this is a no-op load. This can be eliminated.
4253       B.buildUndef(MI.getOperand(0));
4254       MI.eraseFromParent();
4255       return true;
4256     }
4257   }
4258 
4259   Observer.changingInstr(MI);
4260   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4261 
4262   unsigned NewOpcode = NumDefs == 0 ?
4263     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4264 
4265   // Track that we legalized this
4266   MI.setDesc(B.getTII().get(NewOpcode));
4267 
4268   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4269   // dmask to be at least 1, otherwise the instruction will fail.
4270   if (IsTFE && DMask == 0) {
4271     DMask = 0x1;
4272     DMaskLanes = 1;
4273     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4274   }
4275 
4276   if (BaseOpcode->Atomic) {
4277     Register VData0 = MI.getOperand(2).getReg();
4278     LLT Ty = MRI->getType(VData0);
4279 
4280     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4281     if (Ty.isVector())
4282       return false;
4283 
4284     if (BaseOpcode->AtomicX2) {
4285       Register VData1 = MI.getOperand(3).getReg();
4286       // The two values are packed in one register.
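      // For example, for an s32 atomic cmpswap, the data and compare values
      // become a single <2 x s32> operand, and the now-unused second operand
      // is cleared below.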
4287       LLT PackedTy = LLT::vector(2, Ty);
4288       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4289       MI.getOperand(2).setReg(Concat.getReg(0));
4290       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4291     }
4292   }
4293 
4294   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4295 
  // Optimize _L to _LZ when the LOD is a constant zero (or negative).
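  // For example (illustrative), an image.sample.l intrinsic whose LOD operand
  // is a constant zero is rewritten to the corresponding image.sample.lz
  // intrinsic, and the LOD operand is dropped.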
4297   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4298           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4299     const ConstantFP *ConstantLod;
4300 
4301     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4302                  m_GFCst(ConstantLod))) {
4303       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4304         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4305         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4306             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4307                                                       Intr->Dim);
4308 
4309         // The starting indexes should remain in the same place.
4310         --CorrectedNumVAddrs;
4311 
4312         MI.getOperand(MI.getNumExplicitDefs())
4313             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4314         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4315         Intr = NewImageDimIntr;
4316       }
4317     }
4318   }
4319 
  // Optimize _mip away when 'lod' is zero.
4321   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4322     int64_t ConstantLod;
4323     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4324                  m_ICst(ConstantLod))) {
4325       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4328         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4329         --CorrectedNumVAddrs;
4330       }
4331     }
4332   }
4333 
4334   // Rewrite the addressing register layout before doing anything else.
4335   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but are tied to the A16 control, so both
    // gradients and addresses must be 16-bit.
4338     return false;
4339   }
4340 
4341   if (IsA16 && !ST.hasA16()) {
4342     // A16 not supported
4343     return false;
4344   }
4345 
4346   if (IsA16 || IsG16) {
4347     if (Intr->NumVAddrs > 1) {
4348       SmallVector<Register, 4> PackedRegs;
4349 
4350       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4351                                 IsG16);
4352 
4353       // See also below in the non-a16 branch
4354       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4355 
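      // Without the NSA encoding, all address dwords must occupy a single
      // contiguous register, so merge the packed dwords into one wide vector
      // operand.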
4356       if (!UseNSA && PackedRegs.size() > 1) {
4357         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4358         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4359         PackedRegs[0] = Concat.getReg(0);
4360         PackedRegs.resize(1);
4361       }
4362 
4363       const unsigned NumPacked = PackedRegs.size();
4364       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4365         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4366         if (!SrcOp.isReg()) {
4367           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4368           continue;
4369         }
4370 
4371         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4372 
4373         if (I - Intr->VAddrStart < NumPacked)
4374           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4375         else
4376           SrcOp.setReg(AMDGPU::NoRegister);
4377       }
4378     }
4379   } else {
4380     // If the register allocator cannot place the address registers contiguously
4381     // without introducing moves, then using the non-sequential address encoding
4382     // is always preferable, since it saves VALU instructions and is usually a
4383     // wash in terms of code size or even better.
4384     //
4385     // However, we currently have no way of hinting to the register allocator
4386     // that MIMG addresses should be placed contiguously when it is possible to
4387     // do so, so force non-NSA for the common 2-address case as a heuristic.
4388     //
4389     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4390     // allocation when possible.
4391     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4392 
4393     if (!UseNSA && Intr->NumVAddrs > 1)
4394       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4395                                Intr->NumVAddrs);
4396   }
4397 
4398   int Flags = 0;
4399   if (IsA16)
4400     Flags |= 1;
4401   if (IsG16)
4402     Flags |= 2;
4403   MI.addOperand(MachineOperand::CreateImm(Flags));
4404 
4405   if (BaseOpcode->Store) { // No TFE for stores?
4406     // TODO: Handle dmask trim
4407     Register VData = MI.getOperand(1).getReg();
4408     LLT Ty = MRI->getType(VData);
4409     if (!Ty.isVector() || Ty.getElementType() != S16)
4410       return true;
4411 
4412     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4413     if (RepackedReg != VData) {
4414       MI.getOperand(1).setReg(RepackedReg);
4415     }
4416 
4417     return true;
4418   }
4419 
4420   Register DstReg = MI.getOperand(0).getReg();
4421   LLT Ty = MRI->getType(DstReg);
4422   const LLT EltTy = Ty.getScalarType();
4423   const bool IsD16 = Ty.getScalarType() == S16;
4424   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4425 
4426   // Confirm that the return type is large enough for the dmask specified
4427   if (NumElts < DMaskLanes)
4428     return false;
4429 
4430   if (NumElts > 4 || DMaskLanes > 4)
4431     return false;
4432 
4433   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4434   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4435 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4439   LLT RoundedTy;
4440 
  // S32 vector to cover all data, plus the TFE result element.
4442   LLT TFETy;
4443 
4444   // Register type to use for each loaded component. Will be S32 or V2S16.
4445   LLT RegTy;
4446 
4447   if (IsD16 && ST.hasUnpackedD16VMem()) {
4448     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4449     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4450     RegTy = S32;
4451   } else {
4452     unsigned EltSize = EltTy.getSizeInBits();
4453     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4454     unsigned RoundedSize = 32 * RoundedElts;
4455     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4456     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4457     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4458   }
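  // For example, a packed-D16 load with three dmask lanes returning <3 x s16>
  // gives RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = <2 x s16>
  // (or s32 when TFE is in use).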
4459 
4460   // The return type does not need adjustment.
4461   // TODO: Should we change s16 case to s32 or <2 x s16>?
4462   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4463     return true;
4464 
4465   Register Dst1Reg;
4466 
4467   // Insert after the instruction.
4468   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4469 
4470   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4471   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4472   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4473   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4474 
4475   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4476 
4477   MI.getOperand(0).setReg(NewResultReg);
4478 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4483 
4484   if (IsTFE) {
4485     Dst1Reg = MI.getOperand(1).getReg();
4486     if (MRI->getType(Dst1Reg) != S32)
4487       return false;
4488 
4489     // TODO: Make sure the TFE operand bit is set.
4490     MI.RemoveOperand(1);
4491 
4492     // Handle the easy case that requires no repack instructions.
4493     if (Ty == S32) {
4494       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4495       return true;
4496     }
4497   }
4498 
4499   // Now figure out how to copy the new result register back into the old
4500   // result.
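  // ResultRegs is seeded with Dst1Reg so that, in the TFE case, the final
  // element of the unmerge below writes the error dword directly into its def.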
4501   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4502 
4503   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4504 
4505   if (ResultNumRegs == 1) {
4506     assert(!IsTFE);
4507     ResultRegs[0] = NewResultReg;
4508   } else {
4509     // We have to repack into a new vector of some kind.
4510     for (int I = 0; I != NumDataRegs; ++I)
4511       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4512     B.buildUnmerge(ResultRegs, NewResultReg);
4513 
4514     // Drop the final TFE element to get the data part. The TFE result is
4515     // directly written to the right place already.
4516     if (IsTFE)
4517       ResultRegs.resize(NumDataRegs);
4518   }
4519 
4520   // For an s16 scalar result, we form an s32 result with a truncate regardless
4521   // of packed vs. unpacked.
4522   if (IsD16 && !Ty.isVector()) {
4523     B.buildTrunc(DstReg, ResultRegs[0]);
4524     return true;
4525   }
4526 
4527   // Avoid a build/concat_vector of 1 entry.
4528   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4529     B.buildBitcast(DstReg, ResultRegs[0]);
4530     return true;
4531   }
4532 
4533   assert(Ty.isVector());
4534 
4535   if (IsD16) {
4536     // For packed D16 results with TFE enabled, all the data components are
4537     // S32. Cast back to the expected type.
4538     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4541     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4542       for (Register &Reg : ResultRegs)
4543         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4544     } else if (ST.hasUnpackedD16VMem()) {
4545       for (Register &Reg : ResultRegs)
4546         Reg = B.buildTrunc(S16, Reg).getReg(0);
4547     }
4548   }
4549 
4550   auto padWithUndef = [&](LLT Ty, int NumElts) {
4551     if (NumElts == 0)
4552       return;
4553     Register Undef = B.buildUndef(Ty).getReg(0);
4554     for (int I = 0; I != NumElts; ++I)
4555       ResultRegs.push_back(Undef);
4556   };
4557 
4558   // Pad out any elements eliminated due to the dmask.
4559   LLT ResTy = MRI->getType(ResultRegs[0]);
4560   if (!ResTy.isVector()) {
4561     padWithUndef(ResTy, NumElts - ResultRegs.size());
4562     B.buildBuildVector(DstReg, ResultRegs);
4563     return true;
4564   }
4565 
4566   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4567   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4568 
  // Deal with the one annoying legal case: a <3 x s16> result.
4570   const LLT V3S16 = LLT::vector(3, 16);
4571   if (Ty == V3S16) {
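    // Pad out to three v2s16 registers, concatenate to <6 x s16>, then unmerge
    // into the <3 x s16> result plus an unused <3 x s16> half.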
4572     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4573     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4574     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4575     return true;
4576   }
4577 
4578   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4579   B.buildConcatVectors(DstReg, ResultRegs);
4580   return true;
4581 }
4582 
4583 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4584   LegalizerHelper &Helper, MachineInstr &MI) const {
4585   MachineIRBuilder &B = Helper.MIRBuilder;
4586   GISelChangeObserver &Observer = Helper.Observer;
4587 
4588   Register Dst = MI.getOperand(0).getReg();
4589   LLT Ty = B.getMRI()->getType(Dst);
4590   unsigned Size = Ty.getSizeInBits();
4591   MachineFunction &MF = B.getMF();
4592 
4593   Observer.changingInstr(MI);
4594 
4595   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4596     Ty = getBitcastRegisterType(Ty);
4597     Helper.bitcastDst(MI, Ty, 0);
4598     Dst = MI.getOperand(0).getReg();
4599     B.setInsertPt(B.getMBB(), MI);
4600   }
4601 
4602   // FIXME: We don't really need this intermediate instruction. The intrinsic
4603   // should be fixed to have a memory operand. Since it's readnone, we're not
4604   // allowed to add one.
4605   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4606   MI.RemoveOperand(1); // Remove intrinsic ID
4607 
4608   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4609   // TODO: Should this use datalayout alignment?
4610   const unsigned MemSize = (Size + 7) / 8;
4611   const Align MemAlign(4);
4612   MachineMemOperand *MMO = MF.getMachineMemOperand(
4613       MachinePointerInfo(),
4614       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4615           MachineMemOperand::MOInvariant,
4616       MemSize, MemAlign);
4617   MI.addMemOperand(MF, MMO);
4618 
4619   // There are no 96-bit result scalar loads, but widening to 128-bit should
4620   // always be legal. We may need to restore this to a 96-bit result if it turns
4621   // out this needs to be converted to a vector load during RegBankSelect.
4622   if (!isPowerOf2_32(Size)) {
4623     if (Ty.isVector())
4624       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4625     else
4626       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4627   }
4628 
4629   Observer.changedInstr(MI);
4630   return true;
4631 }
4632 
4633 // TODO: Move to selection
4634 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4635                                                 MachineRegisterInfo &MRI,
4636                                                 MachineIRBuilder &B) const {
4637   if (!ST.isTrapHandlerEnabled() ||
4638       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4639     return legalizeTrapEndpgm(MI, MRI, B);
4640 
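  // For code object V2 and V3 the queue pointer is passed to the trap handler;
  // for V4 it is only needed if the subtarget cannot provide the doorbell ID.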
4641   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4642     switch (*HsaAbiVer) {
4643     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4644     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4645       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4646     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4647       return ST.supportsGetDoorbellID() ?
4648           legalizeTrapHsa(MI, MRI, B) :
4649           legalizeTrapHsaQueuePtr(MI, MRI, B);
4650     }
4651   }
4652 
4653   llvm_unreachable("Unknown trap handler");
4654 }
4655 
4656 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4657     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4658   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4659   MI.eraseFromParent();
4660   return true;
4661 }
4662 
4663 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4664     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input, and insert the
  // trap instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4667   Register LiveIn =
4668     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4669   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4670     return false;
4671 
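  // The trap handler ABI expects the queue pointer in SGPR0:SGPR1, passed as
  // an implicit use of s_trap.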
4672   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4673   B.buildCopy(SGPR01, LiveIn);
4674   B.buildInstr(AMDGPU::S_TRAP)
4675       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4676       .addReg(SGPR01, RegState::Implicit);
4677 
4678   MI.eraseFromParent();
4679   return true;
4680 }
4681 
4682 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4683     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4684   B.buildInstr(AMDGPU::S_TRAP)
4685       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4686   MI.eraseFromParent();
4687   return true;
4688 }
4689 
4690 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4691     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning accordingly.
4694   if (!ST.isTrapHandlerEnabled() ||
4695       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4696     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4697                                      "debugtrap handler not supported",
4698                                      MI.getDebugLoc(), DS_Warning);
4699     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4700     Ctx.diagnose(NoTrap);
4701   } else {
4702     // Insert debug-trap instruction
4703     B.buildInstr(AMDGPU::S_TRAP)
4704         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4705   }
4706 
4707   MI.eraseFromParent();
4708   return true;
4709 }
4710 
4711 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4712                                                MachineIRBuilder &B) const {
4713   MachineRegisterInfo &MRI = *B.getMRI();
4714   const LLT S16 = LLT::scalar(16);
4715   const LLT S32 = LLT::scalar(32);
4716 
4717   Register DstReg = MI.getOperand(0).getReg();
4718   Register NodePtr = MI.getOperand(2).getReg();
4719   Register RayExtent = MI.getOperand(3).getReg();
4720   Register RayOrigin = MI.getOperand(4).getReg();
4721   Register RayDir = MI.getOperand(5).getReg();
4722   Register RayInvDir = MI.getOperand(6).getReg();
4723   Register TDescr = MI.getOperand(7).getReg();
4724 
4725   if (!ST.hasGFX10_AEncoding()) {
4726     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4727                                         "intrinsic not supported on subtarget",
4728                                         MI.getDebugLoc());
4729     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4730     return false;
4731   }
4732 
4733   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4734   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
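  // Pick the NSA form of the image_bvh_intersect_ray opcode: 64-bit node
  // pointers use the BVH64 variants, and 16-bit ray direction data uses the
  // _a16 variants.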
4735   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4736                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4737                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4738                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4739 
4740   SmallVector<Register, 12> Ops;
4741   if (Is64) {
4742     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4743     Ops.push_back(Unmerge.getReg(0));
4744     Ops.push_back(Unmerge.getReg(1));
4745   } else {
4746     Ops.push_back(NodePtr);
4747   }
4748   Ops.push_back(RayExtent);
4749 
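  // Split a 128-bit vector operand into dwords; only the first three lanes
  // (x, y, z) are appended to the operand list.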
4750   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4751     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4752     Ops.push_back(Unmerge.getReg(0));
4753     Ops.push_back(Unmerge.getReg(1));
4754     Ops.push_back(Unmerge.getReg(2));
4755   };
4756 
4757   packLanes(RayOrigin);
4758   if (IsA16) {
4759     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4760     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
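    // Pack the six 16-bit components into three dwords:
    //   R1 = (dir.x, dir.y), R2 = (dir.z, invdir.x), R3 = (invdir.y, invdir.z)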
4761     Register R1 = MRI.createGenericVirtualRegister(S32);
4762     Register R2 = MRI.createGenericVirtualRegister(S32);
4763     Register R3 = MRI.createGenericVirtualRegister(S32);
4764     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4765     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4766     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4767     Ops.push_back(R1);
4768     Ops.push_back(R2);
4769     Ops.push_back(R3);
4770   } else {
4771     packLanes(RayDir);
4772     packLanes(RayInvDir);
4773   }
4774 
4775   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4776     .addDef(DstReg)
4777     .addImm(Opcode);
4778 
4779   for (Register R : Ops) {
4780     MIB.addUse(R);
4781   }
4782 
4783   MIB.addUse(TDescr)
4784      .addImm(IsA16 ? 1 : 0)
4785      .cloneMemRefs(MI);
4786 
4787   MI.eraseFromParent();
4788   return true;
4789 }
4790 
4791 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4792                                             MachineInstr &MI) const {
4793   MachineIRBuilder &B = Helper.MIRBuilder;
4794   MachineRegisterInfo &MRI = *B.getMRI();
4795 
  // Replace the G_BRCOND user of these intrinsics with the exec-manipulating
  // branch pseudos.
4797   auto IntrID = MI.getIntrinsicID();
4798   switch (IntrID) {
4799   case Intrinsic::amdgcn_if:
4800   case Intrinsic::amdgcn_else: {
4801     MachineInstr *Br = nullptr;
4802     MachineBasicBlock *UncondBrTarget = nullptr;
4803     bool Negated = false;
4804     if (MachineInstr *BrCond =
4805             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4806       const SIRegisterInfo *TRI
4807         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4808 
4809       Register Def = MI.getOperand(1).getReg();
4810       Register Use = MI.getOperand(3).getReg();
4811 
4812       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4813 
4814       if (Negated)
4815         std::swap(CondBrTarget, UncondBrTarget);
4816 
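      // Collapse the intrinsic and its G_BRCOND user into a single SI_IF or
      // SI_ELSE pseudo that branches to the unconditional target; the branch
      // to the conditional target is re-emitted below if needed.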
4817       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4818       if (IntrID == Intrinsic::amdgcn_if) {
4819         B.buildInstr(AMDGPU::SI_IF)
4820           .addDef(Def)
4821           .addUse(Use)
4822           .addMBB(UncondBrTarget);
4823       } else {
4824         B.buildInstr(AMDGPU::SI_ELSE)
4825             .addDef(Def)
4826             .addUse(Use)
4827             .addMBB(UncondBrTarget);
4828       }
4829 
4830       if (Br) {
4831         Br->getOperand(0).setMBB(CondBrTarget);
4832       } else {
4833         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4834         // since we're swapping branch targets it needs to be reinserted.
4835         // FIXME: IRTranslator should probably not do this
4836         B.buildBr(*CondBrTarget);
4837       }
4838 
4839       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4840       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4841       MI.eraseFromParent();
4842       BrCond->eraseFromParent();
4843       return true;
4844     }
4845 
4846     return false;
4847   }
4848   case Intrinsic::amdgcn_loop: {
4849     MachineInstr *Br = nullptr;
4850     MachineBasicBlock *UncondBrTarget = nullptr;
4851     bool Negated = false;
4852     if (MachineInstr *BrCond =
4853             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4854       const SIRegisterInfo *TRI
4855         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4856 
4857       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4858       Register Reg = MI.getOperand(2).getReg();
4859 
4860       if (Negated)
4861         std::swap(CondBrTarget, UncondBrTarget);
4862 
4863       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4864       B.buildInstr(AMDGPU::SI_LOOP)
4865         .addUse(Reg)
4866         .addMBB(UncondBrTarget);
4867 
4868       if (Br)
4869         Br->getOperand(0).setMBB(CondBrTarget);
4870       else
4871         B.buildBr(*CondBrTarget);
4872 
4873       MI.eraseFromParent();
4874       BrCond->eraseFromParent();
4875       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4876       return true;
4877     }
4878 
4879     return false;
4880   }
4881   case Intrinsic::amdgcn_kernarg_segment_ptr:
4882     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4883       // This only makes sense to call in a kernel, so just lower to null.
4884       B.buildConstant(MI.getOperand(0).getReg(), 0);
4885       MI.eraseFromParent();
4886       return true;
4887     }
4888 
4889     return legalizePreloadedArgIntrin(
4890       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4891   case Intrinsic::amdgcn_implicitarg_ptr:
4892     return legalizeImplicitArgPtr(MI, MRI, B);
4893   case Intrinsic::amdgcn_workitem_id_x:
4894     return legalizePreloadedArgIntrin(MI, MRI, B,
4895                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4896   case Intrinsic::amdgcn_workitem_id_y:
4897     return legalizePreloadedArgIntrin(MI, MRI, B,
4898                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4899   case Intrinsic::amdgcn_workitem_id_z:
4900     return legalizePreloadedArgIntrin(MI, MRI, B,
4901                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4902   case Intrinsic::amdgcn_workgroup_id_x:
4903     return legalizePreloadedArgIntrin(MI, MRI, B,
4904                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4905   case Intrinsic::amdgcn_workgroup_id_y:
4906     return legalizePreloadedArgIntrin(MI, MRI, B,
4907                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4908   case Intrinsic::amdgcn_workgroup_id_z:
4909     return legalizePreloadedArgIntrin(MI, MRI, B,
4910                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4911   case Intrinsic::amdgcn_dispatch_ptr:
4912     return legalizePreloadedArgIntrin(MI, MRI, B,
4913                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4914   case Intrinsic::amdgcn_queue_ptr:
4915     return legalizePreloadedArgIntrin(MI, MRI, B,
4916                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4917   case Intrinsic::amdgcn_implicit_buffer_ptr:
4918     return legalizePreloadedArgIntrin(
4919       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4920   case Intrinsic::amdgcn_dispatch_id:
4921     return legalizePreloadedArgIntrin(MI, MRI, B,
4922                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4923   case Intrinsic::amdgcn_fdiv_fast:
4924     return legalizeFDIVFastIntrin(MI, MRI, B);
4925   case Intrinsic::amdgcn_is_shared:
4926     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4927   case Intrinsic::amdgcn_is_private:
4928     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4929   case Intrinsic::amdgcn_wavefrontsize: {
4930     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4931     MI.eraseFromParent();
4932     return true;
4933   }
4934   case Intrinsic::amdgcn_s_buffer_load:
4935     return legalizeSBufferLoad(Helper, MI);
4936   case Intrinsic::amdgcn_raw_buffer_store:
4937   case Intrinsic::amdgcn_struct_buffer_store:
4938     return legalizeBufferStore(MI, MRI, B, false, false);
4939   case Intrinsic::amdgcn_raw_buffer_store_format:
4940   case Intrinsic::amdgcn_struct_buffer_store_format:
4941     return legalizeBufferStore(MI, MRI, B, false, true);
4942   case Intrinsic::amdgcn_raw_tbuffer_store:
4943   case Intrinsic::amdgcn_struct_tbuffer_store:
4944     return legalizeBufferStore(MI, MRI, B, true, true);
4945   case Intrinsic::amdgcn_raw_buffer_load:
4946   case Intrinsic::amdgcn_struct_buffer_load:
4947     return legalizeBufferLoad(MI, MRI, B, false, false);
4948   case Intrinsic::amdgcn_raw_buffer_load_format:
4949   case Intrinsic::amdgcn_struct_buffer_load_format:
4950     return legalizeBufferLoad(MI, MRI, B, true, false);
4951   case Intrinsic::amdgcn_raw_tbuffer_load:
4952   case Intrinsic::amdgcn_struct_tbuffer_load:
4953     return legalizeBufferLoad(MI, MRI, B, true, true);
4954   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4955   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4956   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4957   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4958   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4959   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4960   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4961   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4962   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4963   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4964   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4965   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4966   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4967   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4968   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4969   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4970   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4971   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4972   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4973   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4974   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4975   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4976   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4977   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4978   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4979   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4980   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4981   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4982   case Intrinsic::amdgcn_buffer_atomic_fadd:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4985   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4986   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4987     return legalizeBufferAtomic(MI, B, IntrID);
4988   case Intrinsic::amdgcn_atomic_inc:
4989     return legalizeAtomicIncDec(MI, B, true);
4990   case Intrinsic::amdgcn_atomic_dec:
4991     return legalizeAtomicIncDec(MI, B, false);
4992   case Intrinsic::trap:
4993     return legalizeTrapIntrinsic(MI, MRI, B);
4994   case Intrinsic::debugtrap:
4995     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4996   case Intrinsic::amdgcn_rsq_clamp:
4997     return legalizeRsqClampIntrinsic(MI, MRI, B);
4998   case Intrinsic::amdgcn_ds_fadd:
4999   case Intrinsic::amdgcn_ds_fmin:
5000   case Intrinsic::amdgcn_ds_fmax:
5001     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5002   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5003     return legalizeBVHIntrinsic(MI, B);
5004   default: {
5005     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5006             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5007       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5008     return true;
5009   }
5010   }
5011 
5012   return true;
5013 }
5014