1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
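// e.g. <3 x s16> -> <4 x s16> and <5 x s32> -> <8 x s32>.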
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
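// e.g. s48 -> s64 and s65 -> s128.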
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
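// Reduce the number of vector elements to produce pieces of at most 64 bits,
// e.g. a <3 x s32> (96 bits) is broken into <2 x s32> pieces.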
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size rounds up to the
// next multiple of 32 bits.
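// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).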
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
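  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>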
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
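// Sizes that fit in a whole number of 32-bit registers, up to the largest
// register tuple (1024 bits).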
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
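// Element types that pack cleanly into registers: 16-bit elements or elements
// that are a whole multiple of 32 bits.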
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
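// Vector types that map directly onto register tuples: 32, 64, 128 or
// 256-bit elements, or an even number of 16-bit elements.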
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
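// A truncating store of a scalar wider than 32 bits, where the memory size
// is smaller than the register size.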
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338 
339   LLT EltTy = Ty.getElementType();
340   if (EltTy.isPointer())
341     return true;
342 
343   unsigned EltSize = EltTy.getSizeInBits();
344   return EltSize != 32 && EltSize != 64;
345 }
346 
347 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
348                              unsigned Opcode) {
349   const LLT Ty = Query.Types[0];
350   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
351          !loadStoreBitcastWorkaround(Ty);
352 }
353 
354 /// Return true if a load or store of the type should be lowered with a bitcast
355 /// to a different type.
356 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
357                                        const unsigned MemSizeInBits) {
358   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
361 
362   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
363     return true;
364   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
365          !isRegisterVectorElementType(Ty.getElementType());
366 }
367 
368 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
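/// For example, a 48-bit load known to be 64-bit aligned may be widened to a
/// 64-bit load.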
371 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
372                             unsigned AlignInBits, unsigned AddrSpace,
373                             unsigned Opcode) {
374   // We don't want to widen cases that are naturally legal.
375   if (isPowerOf2_32(SizeInBits))
376     return false;
377 
378   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
379   // end up widening these for a scalar load during RegBankSelect, since there
380   // aren't 96-bit scalar loads.
381   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
382     return false;
383 
384   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
385     return false;
386 
387   // A load is known dereferenceable up to the alignment, so it's legal to widen
388   // to it.
389   //
390   // TODO: Could check dereferenceable for less aligned cases.
391   unsigned RoundedSize = NextPowerOf2(SizeInBits);
392   if (AlignInBits < RoundedSize)
393     return false;
394 
395   // Do not widen if it would introduce a slow unaligned load.
396   const SITargetLowering *TLI = ST.getTargetLowering();
397   bool Fast = false;
398   return TLI->allowsMisalignedMemoryAccessesImpl(
399              RoundedSize, AddrSpace, Align(AlignInBits / 8),
400              MachineMemOperand::MOLoad, &Fast) &&
401          Fast;
402 }
403 
404 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
405                             unsigned Opcode) {
406   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
407     return false;
408 
409   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
410                          Query.MMODescrs[0].AlignInBits,
411                          Query.Types[1].getAddressSpace(), Opcode);
412 }
413 
414 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
415                                          const GCNTargetMachine &TM)
  : ST(ST_) {
417   using namespace TargetOpcode;
418 
419   auto GetAddrSpacePtr = [&TM](unsigned AS) {
420     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
421   };
422 
423   const LLT S1 = LLT::scalar(1);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S16 = LLT::vector(2, 16);
433   const LLT V4S16 = LLT::vector(4, 16);
434 
435   const LLT V2S32 = LLT::vector(2, 32);
436   const LLT V3S32 = LLT::vector(3, 32);
437   const LLT V4S32 = LLT::vector(4, 32);
438   const LLT V5S32 = LLT::vector(5, 32);
439   const LLT V6S32 = LLT::vector(6, 32);
440   const LLT V7S32 = LLT::vector(7, 32);
441   const LLT V8S32 = LLT::vector(8, 32);
442   const LLT V9S32 = LLT::vector(9, 32);
443   const LLT V10S32 = LLT::vector(10, 32);
444   const LLT V11S32 = LLT::vector(11, 32);
445   const LLT V12S32 = LLT::vector(12, 32);
446   const LLT V13S32 = LLT::vector(13, 32);
447   const LLT V14S32 = LLT::vector(14, 32);
448   const LLT V15S32 = LLT::vector(15, 32);
449   const LLT V16S32 = LLT::vector(16, 32);
450   const LLT V32S32 = LLT::vector(32, 32);
451 
452   const LLT V2S64 = LLT::vector(2, 64);
453   const LLT V3S64 = LLT::vector(3, 64);
454   const LLT V4S64 = LLT::vector(4, 64);
455   const LLT V5S64 = LLT::vector(5, 64);
456   const LLT V6S64 = LLT::vector(6, 64);
457   const LLT V7S64 = LLT::vector(7, 64);
458   const LLT V8S64 = LLT::vector(8, 64);
459   const LLT V16S64 = LLT::vector(16, 64);
460 
461   std::initializer_list<LLT> AllS32Vectors =
462     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
463      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
464   std::initializer_list<LLT> AllS64Vectors =
465     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
466 
467   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
468   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
469   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
470   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
471   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
472   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
473   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
474 
475   const LLT CodePtr = FlatPtr;
476 
477   const std::initializer_list<LLT> AddrSpaces64 = {
478     GlobalPtr, ConstantPtr, FlatPtr
479   };
480 
481   const std::initializer_list<LLT> AddrSpaces32 = {
482     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
483   };
484 
485   const std::initializer_list<LLT> FPTypesBase = {
486     S32, S64
487   };
488 
489   const std::initializer_list<LLT> FPTypes16 = {
490     S32, S64, S16
491   };
492 
493   const std::initializer_list<LLT> FPTypesPK16 = {
494     S32, S64, S16, V2S16
495   };
496 
497   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
498 
499   setAction({G_BRCOND, S1}, Legal); // VCC branches
500   setAction({G_BRCOND, S32}, Legal); // SCC branches
501 
502   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
503   // elements for v3s16
504   getActionDefinitionsBuilder(G_PHI)
505     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
506     .legalFor(AllS32Vectors)
507     .legalFor(AllS64Vectors)
508     .legalFor(AddrSpaces64)
509     .legalFor(AddrSpaces32)
510     .legalIf(isPointer(0))
511     .clampScalar(0, S16, S256)
512     .widenScalarToNextPow2(0, 32)
513     .clampMaxNumElements(0, S32, 16)
514     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515     .scalarize(0);
516 
517   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
518     // Full set of gfx9 features.
519     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
520       .legalFor({S32, S16, V2S16})
521       .clampScalar(0, S16, S32)
522       .clampMaxNumElements(0, S16, 2)
523       .scalarize(0)
524       .widenScalarToNextPow2(0, 32);
525 
526     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
527       .legalFor({S32, S16, V2S16}) // Clamp modifier
528       .minScalarOrElt(0, S16)
529       .clampMaxNumElements(0, S16, 2)
530       .scalarize(0)
531       .widenScalarToNextPow2(0, 32)
532       .lower();
533   } else if (ST.has16BitInsts()) {
534     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
535       .legalFor({S32, S16})
536       .clampScalar(0, S16, S32)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
539 
540     // Technically the saturating operations require clamp bit support, but this
541     // was introduced at the same time as 16-bit operations.
542     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
543       .legalFor({S32, S16}) // Clamp modifier
544       .minScalar(0, S16)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 16)
547       .lower();
548 
549     // We're just lowering this, but it helps get a better result to try to
550     // coerce to the desired type first.
551     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
552       .minScalar(0, S16)
553       .scalarize(0)
554       .lower();
555   } else {
556     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
557       .legalFor({S32})
558       .clampScalar(0, S32, S32)
559       .scalarize(0);
560 
561     if (ST.hasIntClamp()) {
562       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
563         .legalFor({S32}) // Clamp modifier.
564         .scalarize(0)
565         .minScalarOrElt(0, S32)
566         .lower();
567     } else {
568       // Clamp bit support was added in VI, along with 16-bit operations.
569       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
570         .minScalar(0, S32)
571         .scalarize(0)
572         .lower();
573     }
574 
575     // FIXME: DAG expansion gets better results. The widening uses the smaller
576     // range values and goes for the min/max lowering directly.
577     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
578       .minScalar(0, S32)
579       .scalarize(0)
580       .lower();
581   }
582 
583   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
584     .customFor({S32, S64})
585     .clampScalar(0, S32, S64)
586     .widenScalarToNextPow2(0, 32)
587     .scalarize(0);
588 
589   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
590     .legalFor({S32})
591     .clampScalar(0, S32, S32)
592     .scalarize(0);
593 
594   // Report legal for any types we can handle anywhere. For the cases only legal
595   // on the SALU, RegBankSelect will be able to re-legalize.
596   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
597     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
598     .clampScalar(0, S32, S64)
599     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
600     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
601     .widenScalarToNextPow2(0)
602     .scalarize(0);
603 
604   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
605                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
606     .legalFor({{S32, S1}, {S32, S32}})
607     .minScalar(0, S32)
608     // TODO: .scalarize(0)
609     .lower();
610 
611   getActionDefinitionsBuilder(G_BITCAST)
612     // Don't worry about the size constraint.
613     .legalIf(all(isRegisterType(0), isRegisterType(1)))
614     .lower();
615 
617   getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
620     .legalIf(isPointer(0))
621     .clampScalar(0, S32, S64)
622     .widenScalarToNextPow2(0);
623 
624   getActionDefinitionsBuilder(G_FCONSTANT)
625     .legalFor({S32, S64, S16})
626     .clampScalar(0, S16, S64);
627 
628   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
629       .legalIf(isRegisterType(0))
630       // s1 and s16 are special cases because they have legal operations on
631       // them, but don't really occupy registers in the normal way.
632       .legalFor({S1, S16})
633       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
634       .clampScalarOrElt(0, S32, MaxScalar)
635       .widenScalarToNextPow2(0, 32)
636       .clampMaxNumElements(0, S32, 16);
637 
638   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
639 
640   // If the amount is divergent, we have to do a wave reduction to get the
641   // maximum value, so this is expanded during RegBankSelect.
642   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
643     .legalFor({{PrivatePtr, S32}});
644 
645   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
646     .customIf(typeIsNot(0, PrivatePtr));
647 
648   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
649 
650   auto &FPOpActions = getActionDefinitionsBuilder(
651     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
652     .legalFor({S32, S64});
653   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
654     .customFor({S32, S64});
655   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
656     .customFor({S32, S64});
657 
658   if (ST.has16BitInsts()) {
659     if (ST.hasVOP3PInsts())
660       FPOpActions.legalFor({S16, V2S16});
661     else
662       FPOpActions.legalFor({S16});
663 
664     TrigActions.customFor({S16});
665     FDIVActions.customFor({S16});
666   }
667 
668   auto &MinNumMaxNum = getActionDefinitionsBuilder({
669       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
670 
671   if (ST.hasVOP3PInsts()) {
672     MinNumMaxNum.customFor(FPTypesPK16)
673       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
674       .clampMaxNumElements(0, S16, 2)
675       .clampScalar(0, S16, S64)
676       .scalarize(0);
677   } else if (ST.has16BitInsts()) {
678     MinNumMaxNum.customFor(FPTypes16)
679       .clampScalar(0, S16, S64)
680       .scalarize(0);
681   } else {
682     MinNumMaxNum.customFor(FPTypesBase)
683       .clampScalar(0, S32, S64)
684       .scalarize(0);
685   }
686 
687   if (ST.hasVOP3PInsts())
688     FPOpActions.clampMaxNumElements(0, S16, 2);
689 
690   FPOpActions
691     .scalarize(0)
692     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
693 
694   TrigActions
695     .scalarize(0)
696     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
697 
698   FDIVActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   getActionDefinitionsBuilder({G_FNEG, G_FABS})
703     .legalFor(FPTypesPK16)
704     .clampMaxNumElements(0, S16, 2)
705     .scalarize(0)
706     .clampScalar(0, S16, S64);
707 
708   if (ST.has16BitInsts()) {
709     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
710       .legalFor({S32, S64, S16})
711       .scalarize(0)
712       .clampScalar(0, S16, S64);
713   } else {
714     getActionDefinitionsBuilder(G_FSQRT)
715       .legalFor({S32, S64})
716       .scalarize(0)
717       .clampScalar(0, S32, S64);
718 
719     if (ST.hasFractBug()) {
720       getActionDefinitionsBuilder(G_FFLOOR)
721         .customFor({S64})
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     } else {
726       getActionDefinitionsBuilder(G_FFLOOR)
727         .legalFor({S32, S64})
728         .scalarize(0)
729         .clampScalar(0, S32, S64);
730     }
731   }
732 
733   getActionDefinitionsBuilder(G_FPTRUNC)
734     .legalFor({{S32, S64}, {S16, S32}})
735     .scalarize(0)
736     .lower();
737 
738   getActionDefinitionsBuilder(G_FPEXT)
739     .legalFor({{S64, S32}, {S32, S16}})
740     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
741     .scalarize(0);
742 
743   getActionDefinitionsBuilder(G_FSUB)
744       // Use actual fsub instruction
745       .legalFor({S32})
746       // Must use fadd + fneg
747       .lowerFor({S64, S16, V2S16})
748       .scalarize(0)
749       .clampScalar(0, S32, S64);
750 
751   // Whether this is legal depends on the floating point mode for the function.
752   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
753   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
754     FMad.customFor({S32, S16});
755   else if (ST.hasMadMacF32Insts())
756     FMad.customFor({S32});
757   else if (ST.hasMadF16())
758     FMad.customFor({S16});
759   FMad.scalarize(0)
760       .lower();
761 
762   auto &FRem = getActionDefinitionsBuilder(G_FREM);
763   if (ST.has16BitInsts()) {
764     FRem.customFor({S16, S32, S64});
765   } else {
766     FRem.minScalar(0, S32)
767         .customFor({S32, S64});
768   }
769   FRem.scalarize(0);
770 
771   // TODO: Do we need to clamp maximum bitwidth?
772   getActionDefinitionsBuilder(G_TRUNC)
773     .legalIf(isScalar(0))
774     .legalFor({{V2S16, V2S32}})
775     .clampMaxNumElements(0, S16, 2)
776     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
777     // situations (like an invalid implicit use), we don't want to infinite loop
778     // in the legalizer.
779     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
780     .alwaysLegal();
781 
782   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
783     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
784                {S32, S1}, {S64, S1}, {S16, S1}})
785     .scalarize(0)
786     .clampScalar(0, S32, S64)
787     .widenScalarToNextPow2(1, 32);
788 
789   // TODO: Split s1->s64 during regbankselect for VALU.
790   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
791     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
792     .lowerFor({{S32, S64}})
793     .lowerIf(typeIs(1, S1))
794     .customFor({{S64, S64}});
795   if (ST.has16BitInsts())
796     IToFP.legalFor({{S16, S16}});
797   IToFP.clampScalar(1, S32, S64)
798        .minScalar(0, S32)
799        .scalarize(0)
800        .widenScalarToNextPow2(1);
801 
802   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
803     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
804     .customFor({{S64, S64}})
805     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
806   if (ST.has16BitInsts())
807     FPToI.legalFor({{S16, S16}});
808   else
809     FPToI.minScalar(1, S32);
810 
811   FPToI.minScalar(0, S32)
812        .scalarize(0)
813        .lower();
814 
815   // Lower roundeven into G_FRINT
816   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
817     .scalarize(0)
818     .lower();
819 
820   if (ST.has16BitInsts()) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S16, S32, S64})
823       .clampScalar(0, S16, S64)
824       .scalarize(0);
825   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32, S64})
828       .clampScalar(0, S32, S64)
829       .scalarize(0);
830   } else {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S32})
833       .customFor({S64})
834       .clampScalar(0, S32, S64)
835       .scalarize(0);
836   }
837 
838   getActionDefinitionsBuilder(G_PTR_ADD)
839     .legalIf(all(isPointer(0), sameSize(0, 1)))
840     .scalarize(0)
841     .scalarSameSizeAs(1, 0);
842 
843   getActionDefinitionsBuilder(G_PTRMASK)
844     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
845     .scalarSameSizeAs(1, 0)
846     .scalarize(0);
847 
848   auto &CmpBuilder =
849     getActionDefinitionsBuilder(G_ICMP)
850     // The compare output type differs based on the register bank of the output,
851     // so make both s1 and s32 legal.
852     //
853     // Scalar compares producing output in scc will be promoted to s32, as that
854     // is the allocatable register type that will be needed for the copy from
855     // scc. This will be promoted during RegBankSelect, and we assume something
856     // before that won't try to use s32 result types.
857     //
858     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
859     // bank.
860     .legalForCartesianProduct(
861       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
862     .legalForCartesianProduct(
863       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
864   if (ST.has16BitInsts()) {
865     CmpBuilder.legalFor({{S1, S16}});
866   }
867 
868   CmpBuilder
869     .widenScalarToNextPow2(1)
870     .clampScalar(1, S32, S64)
871     .scalarize(0)
872     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
873 
874   getActionDefinitionsBuilder(G_FCMP)
875     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
876     .widenScalarToNextPow2(1)
877     .clampScalar(1, S32, S64)
878     .scalarize(0);
879 
880   // FIXME: fpow has a selection pattern that should move to custom lowering.
881   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
882   if (ST.has16BitInsts())
883     Exp2Ops.legalFor({S32, S16});
884   else
885     Exp2Ops.legalFor({S32});
886   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
887   Exp2Ops.scalarize(0);
888 
889   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
890   if (ST.has16BitInsts())
891     ExpOps.customFor({{S32}, {S16}});
892   else
893     ExpOps.customFor({S32});
894   ExpOps.clampScalar(0, MinScalarFPTy, S32)
895         .scalarize(0);
896 
897   getActionDefinitionsBuilder(G_FPOWI)
898     .clampScalar(0, MinScalarFPTy, S32)
899     .lower();
900 
901   // The 64-bit versions produce 32-bit results, but only on the SALU.
902   getActionDefinitionsBuilder(G_CTPOP)
903     .legalFor({{S32, S32}, {S32, S64}})
904     .clampScalar(0, S32, S32)
905     .clampScalar(1, S32, S64)
906     .scalarize(0)
907     .widenScalarToNextPow2(0, 32)
908     .widenScalarToNextPow2(1, 32);
909 
910   // The hardware instructions return a different result on 0 than the generic
911   // instructions expect. The hardware produces -1, but these produce the
912   // bitwidth.
913   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
914     .scalarize(0)
915     .clampScalar(0, S32, S32)
916     .clampScalar(1, S32, S64)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   getActionDefinitionsBuilder(G_BITREVERSE)
931     .legalFor({S32})
932     .clampScalar(0, S32, S32)
933     .scalarize(0);
934 
935   if (ST.has16BitInsts()) {
936     getActionDefinitionsBuilder(G_BSWAP)
937       .legalFor({S16, S32, V2S16})
938       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
940       // narrowScalar limitation.
941       .widenScalarToNextPow2(0)
942       .clampScalar(0, S16, S32)
943       .scalarize(0);
944 
945     if (ST.hasVOP3PInsts()) {
946       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
947         .legalFor({S32, S16, V2S16})
948         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
949         .clampMaxNumElements(0, S16, 2)
950         .minScalar(0, S16)
951         .widenScalarToNextPow2(0)
952         .scalarize(0)
953         .lower();
954     } else {
955       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
956         .legalFor({S32, S16})
957         .widenScalarToNextPow2(0)
958         .minScalar(0, S16)
959         .scalarize(0)
960         .lower();
961     }
962   } else {
963     // TODO: Should have same legality without v_perm_b32
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S32})
966       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .maxScalar(0, S32)
971       .scalarize(0)
972       .lower();
973 
974     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
975       .legalFor({S32})
976       .minScalar(0, S32)
977       .widenScalarToNextPow2(0)
978       .scalarize(0)
979       .lower();
980   }
981 
982   getActionDefinitionsBuilder(G_INTTOPTR)
983     // List the common cases
984     .legalForCartesianProduct(AddrSpaces64, {S64})
985     .legalForCartesianProduct(AddrSpaces32, {S32})
986     .scalarize(0)
987     // Accept any address space as long as the size matches
988     .legalIf(sameSize(0, 1))
989     .widenScalarIf(smallerThan(1, 0),
990       [](const LegalityQuery &Query) {
991         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
992       })
993     .narrowScalarIf(largerThan(1, 0),
994       [](const LegalityQuery &Query) {
995         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
996       });
997 
998   getActionDefinitionsBuilder(G_PTRTOINT)
999     // List the common cases
1000     .legalForCartesianProduct(AddrSpaces64, {S64})
1001     .legalForCartesianProduct(AddrSpaces32, {S32})
1002     .scalarize(0)
1003     // Accept any address space as long as the size matches
1004     .legalIf(sameSize(0, 1))
1005     .widenScalarIf(smallerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       })
1009     .narrowScalarIf(
1010       largerThan(0, 1),
1011       [](const LegalityQuery &Query) {
1012         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1013       });
1014 
1015   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1016     .scalarize(0)
1017     .custom();
1018 
1019   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1020                                     bool IsLoad) -> bool {
1021     const LLT DstTy = Query.Types[0];
1022 
1023     // Split vector extloads.
1024     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1025     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1026 
1027     if (MemSize < DstTy.getSizeInBits())
1028       MemSize = std::max(MemSize, AlignBits);
1029 
1030     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1031       return true;
1032 
1033     const LLT PtrTy = Query.Types[1];
1034     unsigned AS = PtrTy.getAddressSpace();
1035     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1036       return true;
1037 
1038     // Catch weird sized loads that don't evenly divide into the access sizes
1039     // TODO: May be able to widen depending on alignment etc.
1040     unsigned NumRegs = (MemSize + 31) / 32;
1041     if (NumRegs == 3) {
1042       if (!ST.hasDwordx3LoadStores())
1043         return true;
1044     } else {
1045       // If the alignment allows, these should have been widened.
1046       if (!isPowerOf2_32(NumRegs))
1047         return true;
1048     }
1049 
1050     if (AlignBits < MemSize) {
1051       const SITargetLowering *TLI = ST.getTargetLowering();
1052       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1053                                                       Align(AlignBits / 8));
1054     }
1055 
1056     return false;
1057   };
1058 
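  // Required minimum alignments (in bits) for the global and constant accesses
  // listed below. A value of 0 means no alignment requirement, used when
  // unaligned buffer access is enabled.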
1059   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1060   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1061   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1062 
1063   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1064   // LDS
1065   // TODO: Unsupported flat for SI.
1066 
1067   for (unsigned Op : {G_LOAD, G_STORE}) {
1068     const bool IsStore = Op == G_STORE;
1069 
1070     auto &Actions = getActionDefinitionsBuilder(Op);
1071     // Explicitly list some common cases.
1072     // TODO: Does this help compile time at all?
1073     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1074                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1075                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1076                                       {S64, GlobalPtr, 64, GlobalAlign32},
1077                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1078                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1079                                       {S32, GlobalPtr, 8, GlobalAlign8},
1080                                       {S32, GlobalPtr, 16, GlobalAlign16},
1081 
1082                                       {S32, LocalPtr, 32, 32},
1083                                       {S64, LocalPtr, 64, 32},
1084                                       {V2S32, LocalPtr, 64, 32},
1085                                       {S32, LocalPtr, 8, 8},
1086                                       {S32, LocalPtr, 16, 16},
1087                                       {V2S16, LocalPtr, 32, 32},
1088 
1089                                       {S32, PrivatePtr, 32, 32},
1090                                       {S32, PrivatePtr, 8, 8},
1091                                       {S32, PrivatePtr, 16, 16},
1092                                       {V2S16, PrivatePtr, 32, 32},
1093 
1094                                       {S32, ConstantPtr, 32, GlobalAlign32},
1095                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1096                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1097                                       {S64, ConstantPtr, 64, GlobalAlign32},
1098                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1099     Actions.legalIf(
1100       [=](const LegalityQuery &Query) -> bool {
1101         return isLoadStoreLegal(ST, Query, Op);
1102       });
1103 
1104     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1105     // 64-bits.
1106     //
1107     // TODO: Should generalize bitcast action into coerce, which will also cover
1108     // inserting addrspacecasts.
1109     Actions.customIf(typeIs(1, Constant32Ptr));
1110 
1111     // Turn any illegal element vectors into something easier to deal
1112     // with. These will ultimately produce 32-bit scalar shifts to extract the
1113     // parts anyway.
1114     //
1115     // For odd 16-bit element vectors, prefer to split those into pieces with
1116     // 16-bit vector parts.
1117     Actions.bitcastIf(
1118       [=](const LegalityQuery &Query) -> bool {
1119         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1120                                           Query.MMODescrs[0].SizeInBits);
1121       }, bitcastToRegisterType(0));
1122 
1123     if (!IsStore) {
1124       // Widen suitably aligned loads by loading extra bytes. The standard
1125       // legalization actions can't properly express widening memory operands.
1126       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1127         return shouldWidenLoad(ST, Query, G_LOAD);
1128       });
1129     }
1130 
1131     // FIXME: load/store narrowing should be moved to lower action
1132     Actions
1133         .narrowScalarIf(
1134             [=](const LegalityQuery &Query) -> bool {
1135               return !Query.Types[0].isVector() &&
1136                      needToSplitMemOp(Query, Op == G_LOAD);
1137             },
1138             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139               const LLT DstTy = Query.Types[0];
1140               const LLT PtrTy = Query.Types[1];
1141 
1142               const unsigned DstSize = DstTy.getSizeInBits();
1143               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1144 
1145               // Split extloads.
1146               if (DstSize > MemSize)
1147                 return std::make_pair(0, LLT::scalar(MemSize));
1148 
1149               if (!isPowerOf2_32(DstSize)) {
1150                 // We're probably decomposing an odd sized store. Try to split
1151                 // to the widest type. TODO: Account for alignment. As-is it
1152                 // should be OK, since the new parts will be further legalized.
1153                 unsigned FloorSize = PowerOf2Floor(DstSize);
1154                 return std::make_pair(0, LLT::scalar(FloorSize));
1155               }
1156 
1157               if (DstSize > 32 && (DstSize % 32 != 0)) {
1158                 // FIXME: Need a way to specify non-extload of larger size if
1159                 // suitably aligned.
1160                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1161               }
1162 
1163               unsigned MaxSize = maxSizeForAddrSpace(ST,
1164                                                      PtrTy.getAddressSpace(),
1165                                                      Op == G_LOAD);
1166               if (MemSize > MaxSize)
1167                 return std::make_pair(0, LLT::scalar(MaxSize));
1168 
1169               unsigned Align = Query.MMODescrs[0].AlignInBits;
1170               return std::make_pair(0, LLT::scalar(Align));
1171             })
1172         .fewerElementsIf(
1173             [=](const LegalityQuery &Query) -> bool {
1174               return Query.Types[0].isVector() &&
1175                      needToSplitMemOp(Query, Op == G_LOAD);
1176             },
1177             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1178               const LLT DstTy = Query.Types[0];
1179               const LLT PtrTy = Query.Types[1];
1180 
1181               LLT EltTy = DstTy.getElementType();
1182               unsigned MaxSize = maxSizeForAddrSpace(ST,
1183                                                      PtrTy.getAddressSpace(),
1184                                                      Op == G_LOAD);
1185 
1186               // FIXME: Handle widened to power of 2 results better. This ends
1187               // up scalarizing.
1188               // FIXME: 3 element stores scalarized on SI
1189 
1190               // Split if it's too large for the address space.
1191               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1192                 unsigned NumElts = DstTy.getNumElements();
1193                 unsigned EltSize = EltTy.getSizeInBits();
1194 
1195                 if (MaxSize % EltSize == 0) {
1196                   return std::make_pair(
1197                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1198                 }
1199 
1200                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1201 
1202                 // FIXME: Refine when odd breakdowns handled
1203                 // The scalars will need to be re-legalized.
1204                 if (NumPieces == 1 || NumPieces >= NumElts ||
1205                     NumElts % NumPieces != 0)
1206                   return std::make_pair(0, EltTy);
1207 
1208                 return std::make_pair(0,
1209                                       LLT::vector(NumElts / NumPieces, EltTy));
1210               }
1211 
1212               // FIXME: We could probably handle weird extending loads better.
1213               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1214               if (DstTy.getSizeInBits() > MemSize)
1215                 return std::make_pair(0, EltTy);
1216 
1217               unsigned EltSize = EltTy.getSizeInBits();
1218               unsigned DstSize = DstTy.getSizeInBits();
1219               if (!isPowerOf2_32(DstSize)) {
1220                 // We're probably decomposing an odd sized store. Try to split
1221                 // to the widest type. TODO: Account for alignment. As-is it
1222                 // should be OK, since the new parts will be further legalized.
1223                 unsigned FloorSize = PowerOf2Floor(DstSize);
1224                 return std::make_pair(
1225                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1226               }
1227 
1228               // Need to split because of alignment.
1229               unsigned Align = Query.MMODescrs[0].AlignInBits;
1230               if (EltSize > Align &&
1231                   (EltSize / Align < DstTy.getNumElements())) {
1232                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1233               }
1234 
1235               // May need relegalization for the scalars.
1236               return std::make_pair(0, EltTy);
1237             })
1238     .lowerIfMemSizeNotPow2()
1239     .minScalar(0, S32);
1240 
1241     if (IsStore)
1242       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1243 
1244     Actions
1245         .widenScalarToNextPow2(0)
1246         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1247         .lower();
1248   }
1249 
1250   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1251                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1252                                                   {S32, GlobalPtr, 16, 2 * 8},
1253                                                   {S32, LocalPtr, 8, 8},
1254                                                   {S32, LocalPtr, 16, 16},
1255                                                   {S32, PrivatePtr, 8, 8},
1256                                                   {S32, PrivatePtr, 16, 16},
1257                                                   {S32, ConstantPtr, 8, 8},
1258                                                   {S32, ConstantPtr, 16, 2 * 8}});
1259   if (ST.hasFlatAddressSpace()) {
1260     ExtLoads.legalForTypesWithMemDesc(
1261         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1262   }
1263 
1264   ExtLoads.clampScalar(0, S32, S32)
1265           .widenScalarToNextPow2(0)
1266           .unsupportedIfMemSizeNotPow2()
1267           .lower();
1268 
1269   auto &Atomics = getActionDefinitionsBuilder(
1270     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1271      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1272      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1273      G_ATOMICRMW_UMIN})
1274     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1275                {S64, GlobalPtr}, {S64, LocalPtr},
1276                {S32, RegionPtr}, {S64, RegionPtr}});
1277   if (ST.hasFlatAddressSpace()) {
1278     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1279   }
1280 
1281   if (ST.hasLDSFPAtomics()) {
1282     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1283       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1284   }
1285 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1288   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1289     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1290                 {S32, FlatPtr}, {S64, FlatPtr}})
1291     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1292                {S32, RegionPtr}, {S64, RegionPtr}});
1293   // TODO: Pointer types, any 32-bit or 64-bit vector
1294 
1295   // Condition should be s32 for scalar, s1 for vector.
1296   getActionDefinitionsBuilder(G_SELECT)
1297     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1298           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1299           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1300     .clampScalar(0, S16, S64)
1301     .scalarize(1)
1302     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1303     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1304     .clampMaxNumElements(0, S32, 2)
1305     .clampMaxNumElements(0, LocalPtr, 2)
1306     .clampMaxNumElements(0, PrivatePtr, 2)
1307     .scalarize(0)
1308     .widenScalarToNextPow2(0)
1309     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1310 
1311   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1312   // be more flexible with the shift amount type.
1313   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1314     .legalFor({{S32, S32}, {S64, S32}});
1315   if (ST.has16BitInsts()) {
1316     if (ST.hasVOP3PInsts()) {
1317       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1318             .clampMaxNumElements(0, S16, 2);
1319     } else
1320       Shifts.legalFor({{S16, S16}});
1321 
1322     // TODO: Support 16-bit shift amounts for all types
1323     Shifts.widenScalarIf(
1324       [=](const LegalityQuery &Query) {
1325         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1326         // 32-bit amount.
1327         const LLT ValTy = Query.Types[0];
1328         const LLT AmountTy = Query.Types[1];
1329         return ValTy.getSizeInBits() <= 16 &&
1330                AmountTy.getSizeInBits() < 16;
1331       }, changeTo(1, S16));
1332     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1333     Shifts.clampScalar(1, S32, S32);
1334     Shifts.clampScalar(0, S16, S64);
1335     Shifts.widenScalarToNextPow2(0, 16);
1336 
1337     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1338       .minScalar(0, S16)
1339       .scalarize(0)
1340       .lower();
1341   } else {
1342     // Make sure we legalize the shift amount type first, as the general
1343     // expansion for the shifted type will produce much worse code if it hasn't
1344     // been truncated already.
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S32, S64);
1347     Shifts.widenScalarToNextPow2(0, 32);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S32)
1351       .scalarize(0)
1352       .lower();
1353   }
1354   Shifts.scalarize(0);
1355 
1356   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
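    // For G_EXTRACT_VECTOR_ELT, type 0 is the result element and type 1 the
    // source vector; for G_INSERT_VECTOR_ELT the vector is type 0 and the
    // element type 1. The index is type 2 for both.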
1357     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1358     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1359     unsigned IdxTypeIdx = 2;
1360 
1361     getActionDefinitionsBuilder(Op)
1362       .customIf([=](const LegalityQuery &Query) {
1363           const LLT EltTy = Query.Types[EltTypeIdx];
1364           const LLT VecTy = Query.Types[VecTypeIdx];
1365           const LLT IdxTy = Query.Types[IdxTypeIdx];
1366           const unsigned EltSize = EltTy.getSizeInBits();
1367           return (EltSize == 32 || EltSize == 64) &&
1368                   VecTy.getSizeInBits() % 32 == 0 &&
1369                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1370                   IdxTy.getSizeInBits() == 32;
1371         })
1372       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1373                  bitcastToVectorElement32(VecTypeIdx))
1374       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1375       .bitcastIf(
1376         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1377         [=](const LegalityQuery &Query) {
1378           // For > 64-bit element types, try to turn this into a 64-bit
1379           // element vector since we may be able to do better indexing
1380           // if this is scalar. If not, fall back to 32.
1381           const LLT EltTy = Query.Types[EltTypeIdx];
1382           const LLT VecTy = Query.Types[VecTypeIdx];
1383           const unsigned DstEltSize = EltTy.getSizeInBits();
1384           const unsigned VecSize = VecTy.getSizeInBits();
1385 
1386           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1387           return std::make_pair(
1388             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1389         })
1390       .clampScalar(EltTypeIdx, S32, S64)
1391       .clampScalar(VecTypeIdx, S32, S64)
1392       .clampScalar(IdxTypeIdx, S32, S32)
1393       .clampMaxNumElements(VecTypeIdx, S32, 32)
1394       // TODO: Clamp elements for 64-bit vectors?
1395       // It should only be necessary with variable indexes.
1396       // As a last resort, lower to the stack
1397       .lower();
1398   }
1399 
1400   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1401     .unsupportedIf([=](const LegalityQuery &Query) {
1402         const LLT &EltTy = Query.Types[1].getElementType();
1403         return Query.Types[0] != EltTy;
1404       });
1405 
1406   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1407     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1408     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1409 
1410     // FIXME: Doesn't handle extract of illegal sizes.
1411     getActionDefinitionsBuilder(Op)
1412       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1413       // FIXME: Multiples of 16 should not be legal.
1414       .legalIf([=](const LegalityQuery &Query) {
1415           const LLT BigTy = Query.Types[BigTyIdx];
1416           const LLT LitTy = Query.Types[LitTyIdx];
1417           return (BigTy.getSizeInBits() % 32 == 0) &&
1418                  (LitTy.getSizeInBits() % 16 == 0);
1419         })
1420       .widenScalarIf(
1421         [=](const LegalityQuery &Query) {
1422           const LLT BigTy = Query.Types[BigTyIdx];
1423           return (BigTy.getScalarSizeInBits() < 16);
1424         },
1425         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1426       .widenScalarIf(
1427         [=](const LegalityQuery &Query) {
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (LitTy.getScalarSizeInBits() < 16);
1430         },
1431         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1432       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1433       .widenScalarToNextPow2(BigTyIdx, 32);
1434 
1435   }
1436 
1437   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1438     .legalForCartesianProduct(AllS32Vectors, {S32})
1439     .legalForCartesianProduct(AllS64Vectors, {S64})
1440     .clampNumElements(0, V16S32, V32S32)
1441     .clampNumElements(0, V2S64, V16S64)
1442     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1443 
1444   if (ST.hasScalarPackInsts()) {
1445     BuildVector
1446       // FIXME: Should probably widen s1 vectors straight to s32
1447       .minScalarOrElt(0, S16)
1448       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1449       .minScalar(1, S32);
1450 
1451     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1452       .legalFor({V2S16, S32})
1453       .lower();
1454     BuildVector.minScalarOrElt(0, S32);
1455   } else {
1456     BuildVector.customFor({V2S16, S16});
1457     BuildVector.minScalarOrElt(0, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .customFor({V2S16, S32})
1461       .lower();
1462   }
1463 
1464   BuildVector.legalIf(isRegisterType(0));
1465 
1466   // FIXME: Clamp maximum size
1467   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1468     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1469     .clampMaxNumElements(0, S32, 32)
1470     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1471     .clampMaxNumElements(0, S16, 64);
1472 
1473   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1474   // pre-legalize.
1475   if (ST.hasVOP3PInsts()) {
1476     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1477       .customFor({V2S16, V2S16})
1478       .lower();
1479   } else
1480     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1481 
1482   // Merge/Unmerge
1483   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1484     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1485     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1486 
1487     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1488       const LLT Ty = Query.Types[TypeIdx];
1489       if (Ty.isVector()) {
1490         const LLT &EltTy = Ty.getElementType();
1491         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1492           return true;
1493         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1494           return true;
1495       }
1496       return false;
1497     };
1498 
1499     auto &Builder = getActionDefinitionsBuilder(Op)
1500       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1501       .lowerFor({{S16, V2S16}})
1502       .lowerIf([=](const LegalityQuery &Query) {
1503           const LLT BigTy = Query.Types[BigTyIdx];
1504           return BigTy.getSizeInBits() == 32;
1505         })
1506       // Try to widen to s16 first for small types.
1507       // TODO: Only do this on targets with legal s16 shifts
1508       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1509       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1510       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1511       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1512                            elementTypeIs(1, S16)),
1513                        changeTo(1, V2S16))
1514       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1515       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1516       // valid.
1517       .clampScalar(LitTyIdx, S32, S512)
1518       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1519       // Break up vectors with weird elements into scalars
1520       .fewerElementsIf(
1521         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1522         scalarize(0))
1523       .fewerElementsIf(
1524         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1525         scalarize(1))
1526       .clampScalar(BigTyIdx, S32, MaxScalar);
1527 
1528     if (Op == G_MERGE_VALUES) {
1529       Builder.widenScalarIf(
1530         // TODO: Use 16-bit shifts if legal for 8-bit values?
1531         [=](const LegalityQuery &Query) {
1532           const LLT Ty = Query.Types[LitTyIdx];
1533           return Ty.getSizeInBits() < 32;
1534         },
1535         changeTo(LitTyIdx, S32));
1536     }
1537 
1538     Builder.widenScalarIf(
1539       [=](const LegalityQuery &Query) {
1540         const LLT Ty = Query.Types[BigTyIdx];
1541         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1542           Ty.getSizeInBits() % 16 != 0;
1543       },
1544       [=](const LegalityQuery &Query) {
1545         // Pick the next power of 2, or a multiple of 64 over 128,
1546         // whichever is smaller.
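        // For example (the predicate above only fires for sizes that are not a
        // power of 2 and not a multiple of 16): s65 widens to s128, the next
        // power of 2, while s322 widens to alignTo<64>(323) = s384 rather than
        // to s512.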
1547         const LLT &Ty = Query.Types[BigTyIdx];
1548         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1549         if (NewSizeInBits >= 256) {
1550           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1551           if (RoundedTo < NewSizeInBits)
1552             NewSizeInBits = RoundedTo;
1553         }
1554         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1555       })
1556       // Any vectors left are the wrong size. Scalarize them.
1557       .scalarize(0)
1558       .scalarize(1);
1559   }
1560 
1561   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1562   // RegBankSelect.
1563   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1564     .legalFor({{S32}, {S64}});
1565 
1566   if (ST.hasVOP3PInsts()) {
1567     SextInReg.lowerFor({{V2S16}})
1568       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1569       // get more vector shift opportunities, since we'll get those when
1570       // expanded.
1571       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1572   } else if (ST.has16BitInsts()) {
1573     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1574   } else {
1575     // Prefer to promote to s32 before lowering if we don't have 16-bit
1576     // shifts. This avoids a lot of intermediate truncate and extend operations.
1577     SextInReg.lowerFor({{S32}, {S64}});
1578   }
1579 
1580   SextInReg
1581     .scalarize(0)
1582     .clampScalar(0, S32, S64)
1583     .lower();
1584 
1585   getActionDefinitionsBuilder(G_FSHR)
1586     .legalFor({{S32, S32}})
1587     .scalarize(0)
1588     .lower();
1589 
1590   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1591     .legalFor({S64});
1592 
1593   getActionDefinitionsBuilder(G_FENCE)
1594     .alwaysLegal();
1595 
1596   getActionDefinitionsBuilder({
1597       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1598       G_FCOPYSIGN,
1599 
1600       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1601       G_ATOMICRMW_NAND,
1602       G_ATOMICRMW_FSUB,
1603       G_READ_REGISTER,
1604       G_WRITE_REGISTER,
1605 
1606       G_SADDO, G_SSUBO,
1607 
1608       // TODO: Implement
1609       G_FMINIMUM, G_FMAXIMUM,
1610       G_FSHL
1611     }).lower();
1612 
1613   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1614         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1615         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1616     .unsupported();
1617 
1618   computeTables();
1619   verify(*ST.getInstrInfo());
1620 }
1621 
1622 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1623                                          MachineInstr &MI) const {
1624   MachineIRBuilder &B = Helper.MIRBuilder;
1625   MachineRegisterInfo &MRI = *B.getMRI();
1626 
1627   switch (MI.getOpcode()) {
1628   case TargetOpcode::G_ADDRSPACE_CAST:
1629     return legalizeAddrSpaceCast(MI, MRI, B);
1630   case TargetOpcode::G_FRINT:
1631     return legalizeFrint(MI, MRI, B);
1632   case TargetOpcode::G_FCEIL:
1633     return legalizeFceil(MI, MRI, B);
1634   case TargetOpcode::G_FREM:
1635     return legalizeFrem(MI, MRI, B);
1636   case TargetOpcode::G_INTRINSIC_TRUNC:
1637     return legalizeIntrinsicTrunc(MI, MRI, B);
1638   case TargetOpcode::G_SITOFP:
1639     return legalizeITOFP(MI, MRI, B, true);
1640   case TargetOpcode::G_UITOFP:
1641     return legalizeITOFP(MI, MRI, B, false);
1642   case TargetOpcode::G_FPTOSI:
1643     return legalizeFPTOI(MI, MRI, B, true);
1644   case TargetOpcode::G_FPTOUI:
1645     return legalizeFPTOI(MI, MRI, B, false);
1646   case TargetOpcode::G_FMINNUM:
1647   case TargetOpcode::G_FMAXNUM:
1648   case TargetOpcode::G_FMINNUM_IEEE:
1649   case TargetOpcode::G_FMAXNUM_IEEE:
1650     return legalizeMinNumMaxNum(Helper, MI);
1651   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1652     return legalizeExtractVectorElt(MI, MRI, B);
1653   case TargetOpcode::G_INSERT_VECTOR_ELT:
1654     return legalizeInsertVectorElt(MI, MRI, B);
1655   case TargetOpcode::G_SHUFFLE_VECTOR:
1656     return legalizeShuffleVector(MI, MRI, B);
1657   case TargetOpcode::G_FSIN:
1658   case TargetOpcode::G_FCOS:
1659     return legalizeSinCos(MI, MRI, B);
1660   case TargetOpcode::G_GLOBAL_VALUE:
1661     return legalizeGlobalValue(MI, MRI, B);
1662   case TargetOpcode::G_LOAD:
1663     return legalizeLoad(Helper, MI);
1664   case TargetOpcode::G_FMAD:
1665     return legalizeFMad(MI, MRI, B);
1666   case TargetOpcode::G_FDIV:
1667     return legalizeFDIV(MI, MRI, B);
1668   case TargetOpcode::G_UDIV:
1669   case TargetOpcode::G_UREM:
1670     return legalizeUDIV_UREM(MI, MRI, B);
1671   case TargetOpcode::G_SDIV:
1672   case TargetOpcode::G_SREM:
1673     return legalizeSDIV_SREM(MI, MRI, B);
1674   case TargetOpcode::G_ATOMIC_CMPXCHG:
1675     return legalizeAtomicCmpXChg(MI, MRI, B);
1676   case TargetOpcode::G_FLOG:
1677     return legalizeFlog(MI, B, numbers::ln2f);
1678   case TargetOpcode::G_FLOG10:
1679     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1680   case TargetOpcode::G_FEXP:
1681     return legalizeFExp(MI, B);
1682   case TargetOpcode::G_FPOW:
1683     return legalizeFPow(MI, B);
1684   case TargetOpcode::G_FFLOOR:
1685     return legalizeFFloor(MI, MRI, B);
1686   case TargetOpcode::G_BUILD_VECTOR:
1687     return legalizeBuildVector(MI, MRI, B);
1688   default:
1689     return false;
1690   }
1691 
1692   llvm_unreachable("expected switch to return");
1693 }
1694 
1695 Register AMDGPULegalizerInfo::getSegmentAperture(
1696   unsigned AS,
1697   MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   MachineFunction &MF = B.getMF();
1700   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1701   const LLT S32 = LLT::scalar(32);
1702 
1703   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1704 
1705   if (ST.hasApertureRegs()) {
1706     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1707     // getreg.
1708     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1709         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1710         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1711     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1712         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1713         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1714     unsigned Encoding =
1715         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1716         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1717         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1718 
1719     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720 
1721     B.buildInstr(AMDGPU::S_GETREG_B32)
1722       .addDef(GetReg)
1723       .addImm(Encoding);
1724     MRI.setType(GetReg, S32);
1725 
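    // S_GETREG_B32 returns the aperture field in the low bits of its result;
    // shifting it left by the field width (WidthM1 + 1) moves it into the high
    // bits, producing the 32-bit aperture base that becomes the upper half of
    // a flat pointer.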
1726     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1727     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1728   }
1729 
1730   Register QueuePtr = MRI.createGenericVirtualRegister(
1731     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1732 
1733   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1734     return Register();
1735 
1736   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1737   // private_segment_aperture_base_hi.
1738   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1739 
1740   // TODO: can we be smarter about machine pointer info?
1741   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1742   MachineMemOperand *MMO = MF.getMachineMemOperand(
1743       PtrInfo,
1744       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1745           MachineMemOperand::MOInvariant,
1746       4, commonAlignment(Align(64), StructOffset));
1747 
1748   Register LoadAddr;
1749 
1750   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1751   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1752 }
1753 
1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1755   MachineInstr &MI, MachineRegisterInfo &MRI,
1756   MachineIRBuilder &B) const {
1757   MachineFunction &MF = B.getMF();
1758 
1759   const LLT S32 = LLT::scalar(32);
1760   Register Dst = MI.getOperand(0).getReg();
1761   Register Src = MI.getOperand(1).getReg();
1762 
1763   LLT DstTy = MRI.getType(Dst);
1764   LLT SrcTy = MRI.getType(Src);
1765   unsigned DestAS = DstTy.getAddressSpace();
1766   unsigned SrcAS = SrcTy.getAddressSpace();
1767 
1768   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1769   // vector element.
1770   assert(!DstTy.isVector());
1771 
1772   const AMDGPUTargetMachine &TM
1773     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1774 
1775   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1776     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1777     return true;
1778   }
1779 
1780   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1781     // Truncate.
1782     B.buildExtract(Dst, Src, 0);
1783     MI.eraseFromParent();
1784     return true;
1785   }
1786 
1787   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1789     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1790 
1791     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1792     // another pointer. Merge operands are required to be the same type, but
1793     // creating an extra ptrtoint would be kind of pointless.
1794     auto HighAddr = B.buildConstant(
1795       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1796     B.buildMerge(Dst, {Src, HighAddr});
1797     MI.eraseFromParent();
1798     return true;
1799   }
1800 
1801   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1802     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1803            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1804     unsigned NullVal = TM.getNullPointerValue(DestAS);
1805 
1806     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1807     auto FlatNull = B.buildConstant(SrcTy, 0);
1808 
1809     // Extract low 32-bits of the pointer.
1810     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1811 
1812     auto CmpRes =
1813         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1814     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1815 
1816     MI.eraseFromParent();
1817     return true;
1818   }
1819 
1820   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1821     return false;
1822 
1823   if (!ST.hasFlatAddressSpace())
1824     return false;
1825 
1826   auto SegmentNull =
1827       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1828   auto FlatNull =
1829       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1830 
1831   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1832   if (!ApertureReg.isValid())
1833     return false;
1834 
1835   auto CmpRes =
1836       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1837 
1838   // Coerce the type of the low half of the result so we can use merge_values.
1839   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1840 
1841   // TODO: Should we allow mismatched types but matching sizes in merges to
1842   // avoid the ptrtoint?
1843   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1844   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1845 
1846   MI.eraseFromParent();
1847   return true;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeFrint(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   Register Src = MI.getOperand(1).getReg();
1854   LLT Ty = MRI.getType(Src);
1855   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1856 
1857   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1858   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1859 
1860   auto C1 = B.buildFConstant(Ty, C1Val);
1861   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1862 
1863   // TODO: Should this propagate fast-math-flags?
1864   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1865   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1866 
1867   auto C2 = B.buildFConstant(Ty, C2Val);
1868   auto Fabs = B.buildFAbs(Ty, Src);
1869 
1870   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1871   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1872   MI.eraseFromParent();
1873   return true;
1874 }
1875 
1876 bool AMDGPULegalizerInfo::legalizeFceil(
1877   MachineInstr &MI, MachineRegisterInfo &MRI,
1878   MachineIRBuilder &B) const {
1879 
1880   const LLT S1 = LLT::scalar(1);
1881   const LLT S64 = LLT::scalar(64);
1882 
1883   Register Src = MI.getOperand(1).getReg();
1884   assert(MRI.getType(Src) == S64);
1885 
1886   // result = trunc(src)
1887   // if (src > 0.0 && src != result)
1888   //   result += 1.0
1889 
1890   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1891 
1892   const auto Zero = B.buildFConstant(S64, 0.0);
1893   const auto One = B.buildFConstant(S64, 1.0);
1894   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1895   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1896   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1897   auto Add = B.buildSelect(S64, And, One, Zero);
1898 
1899   // TODO: Should this propagate fast-math-flags?
1900   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::legalizeFrem(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B) const {
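  // Expand frem(x, y) as x - trunc(x / y) * y, with the final
  // multiply-subtract done as a single fma.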
1907   Register DstReg = MI.getOperand(0).getReg();
1908   Register Src0Reg = MI.getOperand(1).getReg();
1909   Register Src1Reg = MI.getOperand(2).getReg();
1910   auto Flags = MI.getFlags();
1911   LLT Ty = MRI.getType(DstReg);
1912 
1913   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1914   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1915   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1916   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1917   MI.eraseFromParent();
1918   return true;
1919 }
1920 
1921 static MachineInstrBuilder extractF64Exponent(Register Hi,
1922                                               MachineIRBuilder &B) {
1923   const unsigned FractBits = 52;
1924   const unsigned ExpBits = 11;
1925   LLT S32 = LLT::scalar(32);
1926 
1927   auto Const0 = B.buildConstant(S32, FractBits - 32);
1928   auto Const1 = B.buildConstant(S32, ExpBits);
1929 
1930   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1931     .addUse(Hi)
1932     .addUse(Const0.getReg(0))
1933     .addUse(Const1.getReg(0));
1934 
1935   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1936 }
1937 
1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1939   MachineInstr &MI, MachineRegisterInfo &MRI,
1940   MachineIRBuilder &B) const {
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S32 = LLT::scalar(32);
1943   const LLT S64 = LLT::scalar(64);
1944 
1945   Register Src = MI.getOperand(1).getReg();
1946   assert(MRI.getType(Src) == S64);
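  // The truncation is performed by masking off the fraction bits that lie
  // below the exponent: if the exponent is negative the result is just the
  // sign bit (i.e. +/-0), and if the exponent is greater than 51 the value is
  // already an integer, so the source is passed through unchanged.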
1947 
1948   // TODO: Should this use extract since the low half is unused?
1949   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1950   Register Hi = Unmerge.getReg(1);
1951 
1952   // Extract the upper half, since this is where we will find the sign and
1953   // exponent.
1954   auto Exp = extractF64Exponent(Hi, B);
1955 
1956   const unsigned FractBits = 52;
1957 
1958   // Extract the sign bit.
1959   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1960   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1961 
1962   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1963 
1964   const auto Zero32 = B.buildConstant(S32, 0);
1965 
1966   // Extend back to 64-bits.
1967   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1968 
1969   auto Shr = B.buildAShr(S64, FractMask, Exp);
1970   auto Not = B.buildNot(S64, Shr);
1971   auto Tmp0 = B.buildAnd(S64, Src, Not);
1972   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1973 
1974   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1975   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1976 
1977   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1978   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1979   MI.eraseFromParent();
1980   return true;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeITOFP(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B, bool Signed) const {
1986 
1987   Register Dst = MI.getOperand(0).getReg();
1988   Register Src = MI.getOperand(1).getReg();
1989 
1990   const LLT S64 = LLT::scalar(64);
1991   const LLT S32 = LLT::scalar(32);
1992 
1993   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
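  // Convert the two 32-bit halves separately: the high half is converted
  // with the signedness of the original operation and scaled by 2^32 via
  // ldexp, then the unsigned conversion of the low half is added in.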
1994 
1995   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1996 
1997   auto CvtHi = Signed ?
1998     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1999     B.buildUITOFP(S64, Unmerge.getReg(1));
2000 
2001   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2002 
2003   auto ThirtyTwo = B.buildConstant(S32, 32);
2004   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2005     .addUse(CvtHi.getReg(0))
2006     .addUse(ThirtyTwo.getReg(0));
2007 
2008   // TODO: Should this propagate fast-math-flags?
2009   B.buildFAdd(Dst, LdExp, CvtLo);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 // TODO: Copied from DAG implementation. Verify logic and document how this
2015 // actually works.
2016 bool AMDGPULegalizerInfo::legalizeFPTOI(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   MachineIRBuilder &B, bool Signed) const {
2019 
2020   Register Dst = MI.getOperand(0).getReg();
2021   Register Src = MI.getOperand(1).getReg();
2022 
2023   const LLT S64 = LLT::scalar(64);
2024   const LLT S32 = LLT::scalar(32);
2025 
2026   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2027 
2028   unsigned Flags = MI.getFlags();
2029 
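  // K0 is 2^-32 and K1 is -(2^32). The high result half is
  // floor(trunc(x) * 2^-32); the low half is trunc(x) - hi * 2^32, computed
  // with an fma. Both halves are converted to 32-bit integers and merged.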
2030   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2031   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2032   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2033 
2034   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2035   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2036   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2037 
2038   auto Hi = Signed ?
2039     B.buildFPTOSI(S32, FloorMul) :
2040     B.buildFPTOUI(S32, FloorMul);
2041   auto Lo = B.buildFPTOUI(S32, Fma);
2042 
2043   B.buildMerge(Dst, { Lo, Hi });
2044   MI.eraseFromParent();
2045 
2046   return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2050                                                MachineInstr &MI) const {
2051   MachineFunction &MF = Helper.MIRBuilder.getMF();
2052   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2053 
2054   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2055                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2056 
2057   // With ieee_mode disabled, the instructions have the correct behavior
2058   // already for G_FMINNUM/G_FMAXNUM.
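  // To summarize the contract: with IEEE mode off, G_FMINNUM/G_FMAXNUM are
  // left as-is and the *_IEEE variants are not expected here (return false);
  // with IEEE mode on, the *_IEEE variants are already correct and the plain
  // variants are expanded with lowerFMinNumMaxNum().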
2059   if (!MFI->getMode().IEEE)
2060     return !IsIEEEOp;
2061 
2062   if (IsIEEEOp)
2063     return true;
2064 
2065   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2066 }
2067 
2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2069   MachineInstr &MI, MachineRegisterInfo &MRI,
2070   MachineIRBuilder &B) const {
2071   // TODO: Should move some of this into LegalizerHelper.
2072 
2073   // TODO: Promote dynamic indexing of s16 to s32
2074 
2075   // FIXME: Artifact combiner probably should have replaced the truncated
2076   // constant before this, so we shouldn't need
2077   // getConstantVRegValWithLookThrough.
2078   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2079     MI.getOperand(2).getReg(), MRI);
2080   if (!IdxVal) // Dynamic case will be selected to register indexing.
2081     return true;
2082 
2083   Register Dst = MI.getOperand(0).getReg();
2084   Register Vec = MI.getOperand(1).getReg();
2085 
2086   LLT VecTy = MRI.getType(Vec);
2087   LLT EltTy = VecTy.getElementType();
2088   assert(EltTy == MRI.getType(Dst));
2089 
2090   if (IdxVal->Value < VecTy.getNumElements())
2091     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2092   else
2093     B.buildUndef(Dst);
2094 
2095   MI.eraseFromParent();
2096   return true;
2097 }
2098 
2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2100   MachineInstr &MI, MachineRegisterInfo &MRI,
2101   MachineIRBuilder &B) const {
2102   // TODO: Should move some of this into LegalizerHelper.
2103 
2104   // TODO: Promote dynamic indexing of s16 to s32
2105 
2106   // FIXME: Artifact combiner probably should have replaced the truncated
2107   // constant before this, so we shouldn't need
2108   // getConstantVRegValWithLookThrough.
2109   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2110     MI.getOperand(3).getReg(), MRI);
2111   if (!IdxVal) // Dynamic case will be selected to register indexing.
2112     return true;
2113 
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Vec = MI.getOperand(1).getReg();
2116   Register Ins = MI.getOperand(2).getReg();
2117 
2118   LLT VecTy = MRI.getType(Vec);
2119   LLT EltTy = VecTy.getElementType();
2120   assert(EltTy == MRI.getType(Ins));
2121 
2122   if (IdxVal->Value < VecTy.getNumElements())
2123     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2124   else
2125     B.buildUndef(Dst);
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2132   MachineInstr &MI, MachineRegisterInfo &MRI,
2133   MachineIRBuilder &B) const {
2134   const LLT V2S16 = LLT::vector(2, 16);
2135 
2136   Register Dst = MI.getOperand(0).getReg();
2137   Register Src0 = MI.getOperand(1).getReg();
2138   LLT DstTy = MRI.getType(Dst);
2139   LLT SrcTy = MRI.getType(Src0);
2140 
2141   if (SrcTy == V2S16 && DstTy == V2S16 &&
2142       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2143     return true;
2144 
2145   MachineIRBuilder HelperBuilder(MI);
2146   GISelObserverWrapper DummyObserver;
2147   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2148   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeSinCos(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B) const {
2154 
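  // The hardware sin/cos intrinsics take an operand pre-scaled by 1/(2*pi).
  // On subtargets with the reduced trig input range, the scaled value is also
  // passed through amdgcn.fract to bring it into [0, 1).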
2155   Register DstReg = MI.getOperand(0).getReg();
2156   Register SrcReg = MI.getOperand(1).getReg();
2157   LLT Ty = MRI.getType(DstReg);
2158   unsigned Flags = MI.getFlags();
2159 
2160   Register TrigVal;
2161   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2162   if (ST.hasTrigReducedRange()) {
2163     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2164     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2165       .addUse(MulVal.getReg(0))
2166       .setMIFlags(Flags).getReg(0);
2167   } else
2168     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2169 
2170   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2171     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2172   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2173     .addUse(TrigVal)
2174     .setMIFlags(Flags);
2175   MI.eraseFromParent();
2176   return true;
2177 }
2178 
2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2180                                                   MachineIRBuilder &B,
2181                                                   const GlobalValue *GV,
2182                                                   int64_t Offset,
2183                                                   unsigned GAFlags) const {
2184   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2185   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2186   // to the following code sequence:
2187   //
2188   // For constant address space:
2189   //   s_getpc_b64 s[0:1]
2190   //   s_add_u32 s0, s0, $symbol
2191   //   s_addc_u32 s1, s1, 0
2192   //
2193   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2194   //   a fixup or relocation is emitted to replace $symbol with a literal
2195   //   constant, which is a pc-relative offset from the encoding of the $symbol
2196   //   operand to the global variable.
2197   //
2198   // For global address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2201   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   fixups or relocations are emitted to replace $symbol@*@lo and
2205   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2206   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2207   //   operand to the global variable.
2208   //
2209   // What we want here is an offset from the value returned by s_getpc
2210   // (which is the address of the s_add_u32 instruction) to the global
2211   // variable, but since the encoding of $symbol starts 4 bytes after the start
2212   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2213   // small. This requires us to add 4 to the global variable offset in order to
2214   // compute the correct address.
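  //
  // For the GOT case, for example, this function therefore emits roughly:
  //   %pc = SI_PC_ADD_REL_OFFSET @gv + 4 [gotpcrel32@lo], @gv + 4 [gotpcrel32@hi]
  // and the caller then loads the real address from the resulting GOT slot.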
2215 
2216   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2217 
2218   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2219     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2220 
2221   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2222     .addDef(PCReg);
2223 
2224   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2225   if (GAFlags == SIInstrInfo::MO_NONE)
2226     MIB.addImm(0);
2227   else
2228     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2229 
2230   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2231 
2232   if (PtrTy.getSizeInBits() == 32)
2233     B.buildExtract(DstReg, PCReg, 0);
2234   return true;
2235 }
2236 
2237 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2238   MachineInstr &MI, MachineRegisterInfo &MRI,
2239   MachineIRBuilder &B) const {
2240   Register DstReg = MI.getOperand(0).getReg();
2241   LLT Ty = MRI.getType(DstReg);
2242   unsigned AS = Ty.getAddressSpace();
2243 
2244   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2245   MachineFunction &MF = B.getMF();
2246   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2247 
2248   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2249     if (!MFI->isEntryFunction()) {
2250       const Function &Fn = MF.getFunction();
2251       DiagnosticInfoUnsupported BadLDSDecl(
2252         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2253         DS_Warning);
2254       Fn.getContext().diagnose(BadLDSDecl);
2255 
2256       // We currently don't have a way to correctly allocate LDS objects that
2257       // aren't directly associated with a kernel. We do force inlining of
2258       // functions that use local objects. However, if these dead functions are
2259       // not eliminated, we don't want a compile time error. Just emit a warning
2260       // and a trap, since there should be no callable path here.
2261       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2262       B.buildUndef(DstReg);
2263       MI.eraseFromParent();
2264       return true;
2265     }
2266 
2267     // TODO: We could emit code to handle the initialization somewhere.
2268     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2269       const SITargetLowering *TLI = ST.getTargetLowering();
2270       if (!TLI->shouldUseLDSConstAddress(GV)) {
2271         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2272         return true; // Leave in place.
2273       }
2274 
2275       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2276         Type *Ty = GV->getValueType();
2277         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2278         // zero-sized type in other languages to declare dynamic shared
2279         // memory whose size is not known at compile time. It is allocated
2280         // by the runtime and placed directly after the statically allocated
2281         // ones. They all share the same offset.
2282         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2283           // Adjust alignment for that dynamic shared memory array.
2284           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2285           LLT S32 = LLT::scalar(32);
2286           auto Sz =
2287               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2288           B.buildIntToPtr(DstReg, Sz);
2289           MI.eraseFromParent();
2290           return true;
2291         }
2292       }
2293 
2294       B.buildConstant(
2295           DstReg,
2296           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2297       MI.eraseFromParent();
2298       return true;
2299     }
2300 
2301     const Function &Fn = MF.getFunction();
2302     DiagnosticInfoUnsupported BadInit(
2303       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2304     Fn.getContext().diagnose(BadInit);
2305     return true;
2306   }
2307 
2308   const SITargetLowering *TLI = ST.getTargetLowering();
2309 
2310   if (TLI->shouldEmitFixup(GV)) {
2311     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2312     MI.eraseFromParent();
2313     return true;
2314   }
2315 
2316   if (TLI->shouldEmitPCReloc(GV)) {
2317     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2318     MI.eraseFromParent();
2319     return true;
2320   }
2321 
2322   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2323   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2324 
2325   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2326       MachinePointerInfo::getGOT(MF),
2327       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2328           MachineMemOperand::MOInvariant,
2329       8 /*Size*/, Align(8));
2330 
2331   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2332 
2333   if (Ty.getSizeInBits() == 32) {
2334     // Truncate if this is a 32-bit constant address.
2335     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2336     B.buildExtract(DstReg, Load, 0);
2337   } else
2338     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2339 
2340   MI.eraseFromParent();
2341   return true;
2342 }
2343 
2344 static LLT widenToNextPowerOf2(LLT Ty) {
2345   if (Ty.isVector())
2346     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2347   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2348 }
2349 
2350 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2351                                        MachineInstr &MI) const {
2352   MachineIRBuilder &B = Helper.MIRBuilder;
2353   MachineRegisterInfo &MRI = *B.getMRI();
2354   GISelChangeObserver &Observer = Helper.Observer;
2355 
2356   Register PtrReg = MI.getOperand(1).getReg();
2357   LLT PtrTy = MRI.getType(PtrReg);
2358   unsigned AddrSpace = PtrTy.getAddressSpace();
2359 
2360   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2361     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2363     Observer.changingInstr(MI);
2364     MI.getOperand(1).setReg(Cast.getReg(0));
2365     Observer.changedInstr(MI);
2366     return true;
2367   }
2368 
2369   Register ValReg = MI.getOperand(0).getReg();
2370   LLT ValTy = MRI.getType(ValReg);
2371 
2372   MachineMemOperand *MMO = *MI.memoperands_begin();
2373   const unsigned ValSize = ValTy.getSizeInBits();
2374   const unsigned MemSize = 8 * MMO->getSize();
2375   const Align MemAlign = MMO->getAlign();
2376   const unsigned AlignInBits = 8 * MemAlign.value();
2377 
2378   // Widen non-power-of-2 loads to the alignment if needed
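  // For example, an s96 load with sufficient alignment may be widened to an
  // s128 load, with the extra bits discarded below via G_TRUNC (or G_EXTRACT /
  // an unmerge for the vector cases).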
2379   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2380     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2381 
2382     // This was already the correct extending load result type, so just adjust
2383     // the memory type.
2384     if (WideMemSize == ValSize) {
2385       MachineFunction &MF = B.getMF();
2386 
2387       MachineMemOperand *WideMMO =
2388           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2389       Observer.changingInstr(MI);
2390       MI.setMemRefs(MF, {WideMMO});
2391       Observer.changedInstr(MI);
2392       return true;
2393     }
2394 
2395     // Don't bother handling edge case that should probably never be produced.
2396     if (ValSize > WideMemSize)
2397       return false;
2398 
2399     LLT WideTy = widenToNextPowerOf2(ValTy);
2400 
2401     Register WideLoad;
2402     if (!WideTy.isVector()) {
2403       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2404       B.buildTrunc(ValReg, WideLoad).getReg(0);
2405     } else {
2406       // Extract the subvector.
2407 
2408       if (isRegisterType(ValTy)) {
2409         // If this a case where G_EXTRACT is legal, use it.
2410         // (e.g. <3 x s32> -> <4 x s32>)
2411         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2412         B.buildExtract(ValReg, WideLoad, 0);
2413       } else {
2414         // For cases where the widened type isn't a nice register value, unmerge
2415         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2416         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2417         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2418         B.setInsertPt(B.getMBB(), MI.getIterator());
2419         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2420       }
2421     }
2422 
2423     MI.eraseFromParent();
2424     return true;
2425   }
2426 
2427   return false;
2428 }
2429 
2430 bool AMDGPULegalizerInfo::legalizeFMad(
2431   MachineInstr &MI, MachineRegisterInfo &MRI,
2432   MachineIRBuilder &B) const {
2433   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2434   assert(Ty.isScalar());
2435 
2436   MachineFunction &MF = B.getMF();
2437   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2438 
2439   // TODO: Always legal with future ftz flag.
2440   // FIXME: Do we need just output?
2441   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2442     return true;
2443   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2444     return true;
2445 
2446   MachineIRBuilder HelperBuilder(MI);
2447   GISelObserverWrapper DummyObserver;
2448   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2449   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2450 }
2451 
2452 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2453   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2454   Register DstReg = MI.getOperand(0).getReg();
2455   Register PtrReg = MI.getOperand(1).getReg();
2456   Register CmpVal = MI.getOperand(2).getReg();
2457   Register NewVal = MI.getOperand(3).getReg();
2458 
2459   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2460          "this should not have been custom lowered");
2461 
2462   LLT ValTy = MRI.getType(CmpVal);
2463   LLT VecTy = LLT::vector(2, ValTy);
2464 
2465   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2466 
2467   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2468     .addDef(DstReg)
2469     .addUse(PtrReg)
2470     .addUse(PackedVal)
2471     .setMemRefs(MI.memoperands());
2472 
2473   MI.eraseFromParent();
2474   return true;
2475 }
2476 
2477 bool AMDGPULegalizerInfo::legalizeFlog(
2478   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2479   Register Dst = MI.getOperand(0).getReg();
2480   Register Src = MI.getOperand(1).getReg();
2481   LLT Ty = B.getMRI()->getType(Dst);
2482   unsigned Flags = MI.getFlags();
2483 
2484   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2485   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2486 
2487   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2488   MI.eraseFromParent();
2489   return true;
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2493                                        MachineIRBuilder &B) const {
2494   Register Dst = MI.getOperand(0).getReg();
2495   Register Src = MI.getOperand(1).getReg();
2496   unsigned Flags = MI.getFlags();
2497   LLT Ty = B.getMRI()->getType(Dst);
2498 
2499   auto K = B.buildFConstant(Ty, numbers::log2e);
2500   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2501   B.buildFExp2(Dst, Mul, Flags);
2502   MI.eraseFromParent();
2503   return true;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2507                                        MachineIRBuilder &B) const {
2508   Register Dst = MI.getOperand(0).getReg();
2509   Register Src0 = MI.getOperand(1).getReg();
2510   Register Src1 = MI.getOperand(2).getReg();
2511   unsigned Flags = MI.getFlags();
2512   LLT Ty = B.getMRI()->getType(Dst);
2513   const LLT S16 = LLT::scalar(16);
2514   const LLT S32 = LLT::scalar(32);
2515 
2516   if (Ty == S32) {
2517     auto Log = B.buildFLog2(S32, Src0, Flags);
2518     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2519       .addUse(Log.getReg(0))
2520       .addUse(Src1)
2521       .setMIFlags(Flags);
2522     B.buildFExp2(Dst, Mul, Flags);
2523   } else if (Ty == S16) {
2524     // There's no f16 fmul_legacy, so we need to convert for it.
2525     auto Log = B.buildFLog2(S16, Src0, Flags);
2526     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2527     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2528     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2529       .addUse(Ext0.getReg(0))
2530       .addUse(Ext1.getReg(0))
2531       .setMIFlags(Flags);
2532 
2533     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2534   } else
2535     return false;
2536 
2537   MI.eraseFromParent();
2538   return true;
2539 }
2540 
2541 // Find a source register, ignoring any possible source modifiers.
2542 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2543   Register ModSrc = OrigSrc;
2544   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2545     ModSrc = SrcFNeg->getOperand(1).getReg();
2546     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2547       ModSrc = SrcFAbs->getOperand(1).getReg();
2548   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2549     ModSrc = SrcFAbs->getOperand(1).getReg();
2550   return ModSrc;
2551 }
2552 
2553 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2554                                          MachineRegisterInfo &MRI,
2555                                          MachineIRBuilder &B) const {
2556 
2557   const LLT S1 = LLT::scalar(1);
2558   const LLT S64 = LLT::scalar(64);
2559   Register Dst = MI.getOperand(0).getReg();
2560   Register OrigSrc = MI.getOperand(1).getReg();
2561   unsigned Flags = MI.getFlags();
2562   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2563          "this should not have been custom lowered");
2564 
2565   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2566   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2567   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2568   // V_FRACT bug is:
2569   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2570   //
2571   // Convert floor(x) to (x - fract(x))
2572 
2573   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2574     .addUse(OrigSrc)
2575     .setMIFlags(Flags);
2576 
2577   // Give source modifier matching some assistance before obscuring a foldable
2578   // pattern.
2579 
2580   // TODO: We can avoid the neg on the fract? The input sign to fract
2581   // shouldn't matter?
2582   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2583 
2584   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2585 
2586   Register Min = MRI.createGenericVirtualRegister(S64);
2587 
2588   // We don't need to concern ourselves with the snan handling difference, so
2589   // use the one which will directly select.
2590   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2591   if (MFI->getMode().IEEE)
2592     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2593   else
2594     B.buildFMinNum(Min, Fract, Const, Flags);
2595 
2596   Register CorrectedFract = Min;
2597   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2598     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2599     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2600   }
2601 
2602   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2603   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2604 
2605   MI.eraseFromParent();
2606   return true;
2607 }
2608 
2609 // Turn an illegal packed v2s16 build vector into bit operations.
2610 // TODO: This should probably be a bitcast action in LegalizerHelper.
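// For example, (G_BUILD_VECTOR <2 x s16>, s16:$a, s16:$b) becomes a 32-bit
// G_MERGE_VALUES of $a and $b followed by a G_BITCAST back to <2 x s16>.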
2611 bool AMDGPULegalizerInfo::legalizeBuildVector(
2612   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2613   Register Dst = MI.getOperand(0).getReg();
2614   const LLT S32 = LLT::scalar(32);
2615   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2616 
2617   Register Src0 = MI.getOperand(1).getReg();
2618   Register Src1 = MI.getOperand(2).getReg();
2619   assert(MRI.getType(Src0) == LLT::scalar(16));
2620 
2621   auto Merge = B.buildMerge(S32, {Src0, Src1});
2622   B.buildBitcast(Dst, Merge);
2623 
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 // Check that this is a G_XOR x, -1
2629 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2630   if (MI.getOpcode() != TargetOpcode::G_XOR)
2631     return false;
2632   auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
2633   return ConstVal && *ConstVal == -1;
2634 }
2635 
2636 // Return the use branch instruction, otherwise null if the usage is invalid.
2637 static MachineInstr *
2638 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2639                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2640   Register CondDef = MI.getOperand(0).getReg();
2641   if (!MRI.hasOneNonDBGUse(CondDef))
2642     return nullptr;
2643 
2644   MachineBasicBlock *Parent = MI.getParent();
2645   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2646 
2647   if (isNot(MRI, *UseMI)) {
2648     Register NegatedCond = UseMI->getOperand(0).getReg();
2649     if (!MRI.hasOneNonDBGUse(NegatedCond))
2650       return nullptr;
2651 
2652     // We're deleting the def of this value, so we need to remove it.
2653     UseMI->eraseFromParent();
2654 
2655     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2656     Negated = true;
2657   }
2658 
2659   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2660     return nullptr;
2661 
2662   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2663   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2664   if (Next == Parent->end()) {
2665     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2666     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2667       return nullptr;
2668     UncondBrTarget = &*NextMBB;
2669   } else {
2670     if (Next->getOpcode() != AMDGPU::G_BR)
2671       return nullptr;
2672     Br = &*Next;
2673     UncondBrTarget = Br->getOperand(0).getMBB();
2674   }
2675 
2676   return UseMI;
2677 }
2678 
2679 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2680                                          const ArgDescriptor *Arg,
2681                                          const TargetRegisterClass *ArgRC,
2682                                          LLT ArgTy) const {
2683   MCRegister SrcReg = Arg->getRegister();
2684   assert(SrcReg.isPhysical() && "Physical register expected");
2685   assert(DstReg.isVirtual() && "Virtual register expected");
2686 
2687   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2688                                              ArgTy);
2689   if (Arg->isMasked()) {
2690     // TODO: Should we try to emit this once in the entry block?
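    // A masked argument occupies only a bit-field of the incoming physical
    // register (e.g. the packed workitem IDs share a single register), so the
    // field is shifted down and the remaining bits are masked off.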
2691     const LLT S32 = LLT::scalar(32);
2692     const unsigned Mask = Arg->getMask();
2693     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2694 
2695     Register AndMaskSrc = LiveIn;
2696 
2697     if (Shift != 0) {
2698       auto ShiftAmt = B.buildConstant(S32, Shift);
2699       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2700     }
2701 
2702     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2703   } else {
2704     B.buildCopy(DstReg, LiveIn);
2705   }
2706 
2707   return true;
2708 }
2709 
2710 bool AMDGPULegalizerInfo::loadInputValue(
2711     Register DstReg, MachineIRBuilder &B,
2712     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2713   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2714   const ArgDescriptor *Arg;
2715   const TargetRegisterClass *ArgRC;
2716   LLT ArgTy;
2717   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2718 
2719   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2720     return false; // TODO: Handle these
2721   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2722 }
2723 
2724 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2725     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2726     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2727   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2728     return false;
2729 
2730   MI.eraseFromParent();
2731   return true;
2732 }
2733 
2734 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2735                                        MachineRegisterInfo &MRI,
2736                                        MachineIRBuilder &B) const {
2737   Register Dst = MI.getOperand(0).getReg();
2738   LLT DstTy = MRI.getType(Dst);
2739   LLT S16 = LLT::scalar(16);
2740   LLT S32 = LLT::scalar(32);
2741   LLT S64 = LLT::scalar(64);
2742 
2743   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2744     return true;
2745 
2746   if (DstTy == S16)
2747     return legalizeFDIV16(MI, MRI, B);
2748   if (DstTy == S32)
2749     return legalizeFDIV32(MI, MRI, B);
2750   if (DstTy == S64)
2751     return legalizeFDIV64(MI, MRI, B);
2752 
2753   return false;
2754 }
2755 
2756 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2757                                                   Register DstReg,
2758                                                   Register X,
2759                                                   Register Y,
2760                                                   bool IsDiv) const {
2761   const LLT S1 = LLT::scalar(1);
2762   const LLT S32 = LLT::scalar(32);
2763 
2764   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2765   // algorithm used here.
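  // In short: z ~= 2^32 / y is formed from a float reciprocal and sharpened
  // with one Newton-Raphson style step; the quotient estimate q = umulh(x, z)
  // and remainder r = x - q * y are then corrected by at most two conditional
  // adjustments.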
2766 
2767   // Initial estimate of inv(y).
2768   auto FloatY = B.buildUITOFP(S32, Y);
2769   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2770   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2771   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2772   auto Z = B.buildFPTOUI(S32, ScaledY);
2773 
2774   // One round of UNR.
2775   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2776   auto NegYZ = B.buildMul(S32, NegY, Z);
2777   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2778 
2779   // Quotient/remainder estimate.
2780   auto Q = B.buildUMulH(S32, X, Z);
2781   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2782 
2783   // First quotient/remainder refinement.
2784   auto One = B.buildConstant(S32, 1);
2785   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2786   if (IsDiv)
2787     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2788   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2789 
2790   // Second quotient/remainder refinement.
2791   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2792   if (IsDiv)
2793     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2794   else
2795     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2796 }
2797 
2798 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2799                                               MachineRegisterInfo &MRI,
2800                                               MachineIRBuilder &B) const {
2801   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2802   Register DstReg = MI.getOperand(0).getReg();
2803   Register Num = MI.getOperand(1).getReg();
2804   Register Den = MI.getOperand(2).getReg();
2805   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2806   MI.eraseFromParent();
2807   return true;
2808 }
2809 
2810 // Build an integer reciprocal sequence around V_RCP_IFLAG_F32
2811 //
2812 // Return lo, hi of result
2813 //
2814 // %cvt.lo = G_UITOFP Val.lo
2815 // %cvt.hi = G_UITOFP Val.hi
2816 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2817 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2818 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2819 // %mul2 = G_FMUL %mul1, 2**(-32)
2820 // %trunc = G_INTRINSIC_TRUNC %mul2
2821 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2822 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2823 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2824                                                        Register Val) {
2825   const LLT S32 = LLT::scalar(32);
2826   auto Unmerge = B.buildUnmerge(S32, Val);
2827 
2828   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2829   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2830 
2831   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2832                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2833 
2834   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2835   auto Mul1 =
2836       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2837 
2838   // 2**(-32)
2839   auto Mul2 =
2840       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2841   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2842 
2843   // -(2**32)
2844   auto Mad2 = B.buildFMAD(S32, Trunc,
2845                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2846 
2847   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2848   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2849 
2850   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2851 }
2852 
2853 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2854                                                   Register DstReg,
2855                                                   Register Numer,
2856                                                   Register Denom,
2857                                                   bool IsDiv) const {
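  // This is the 64-bit counterpart of legalizeUDIV_UREM32Impl: start from the
  // reciprocal estimate produced by emitReciprocalU64, refine it with two
  // multiply/add rounds carried out in 32-bit pieces, then form the quotient
  // and remainder and apply conditional corrections.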
2858   const LLT S32 = LLT::scalar(32);
2859   const LLT S64 = LLT::scalar(64);
2860   const LLT S1 = LLT::scalar(1);
2861   Register RcpLo, RcpHi;
2862 
2863   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2864 
2865   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2866 
2867   auto Zero64 = B.buildConstant(S64, 0);
2868   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2869 
2870   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2871   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2872 
2873   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2874   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2875   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2876 
2877   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2878   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2879   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2880   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2881 
2882   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2883   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2884   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2885   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2886   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2887 
2888   auto Zero32 = B.buildConstant(S32, 0);
2889   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2890   auto Add2_HiC =
2891       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2892   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2893   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2894 
2895   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2896   Register NumerLo = UnmergeNumer.getReg(0);
2897   Register NumerHi = UnmergeNumer.getReg(1);
2898 
2899   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2900   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2901   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2902   Register Mul3_Lo = UnmergeMul3.getReg(0);
2903   Register Mul3_Hi = UnmergeMul3.getReg(1);
2904   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2905   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2906   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2907   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2908 
2909   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2910   Register DenomLo = UnmergeDenom.getReg(0);
2911   Register DenomHi = UnmergeDenom.getReg(1);
2912 
2913   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2914   auto C1 = B.buildSExt(S32, CmpHi);
2915 
2916   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2917   auto C2 = B.buildSExt(S32, CmpLo);
2918 
2919   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2920   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2921 
2922   // TODO: Here and below, portions of the code could be enclosed in if/endif
2923   // blocks. Currently control flow is unconditional and we emit 4 selects after
2924   // the potential endif to substitute for PHIs.
2925 
2926   // if C3 != 0 ...
2927   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2928   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2929   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2930   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2931 
2932   auto One64 = B.buildConstant(S64, 1);
2933   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2934 
2935   auto C4 =
2936       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2937   auto C5 =
2938       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2939   auto C6 = B.buildSelect(
2940       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2941 
2942   // if (C6 != 0)
2943   auto Add4 = B.buildAdd(S64, Add3, One64);
2944   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2945 
2946   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2947   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2948   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2949 
2950   // endif C6
2951   // endif C3
2952 
2953   if (IsDiv) {
2954     auto Sel1 = B.buildSelect(
2955         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2956     B.buildSelect(DstReg,
2957                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2958   } else {
2959     auto Sel2 = B.buildSelect(
2960         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2961     B.buildSelect(DstReg,
2962                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2963   }
2964 }
2965 
2966 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2967                                             MachineRegisterInfo &MRI,
2968                                             MachineIRBuilder &B) const {
2969   const LLT S64 = LLT::scalar(64);
2970   const LLT S32 = LLT::scalar(32);
2971   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2972   Register DstReg = MI.getOperand(0).getReg();
2973   Register Num = MI.getOperand(1).getReg();
2974   Register Den = MI.getOperand(2).getReg();
2975   LLT Ty = MRI.getType(DstReg);
2976 
2977   if (Ty == S32)
2978     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2979   else if (Ty == S64)
2980     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2981   else
2982     return false;
2983 
2984   MI.eraseFromParent();
2985   return true;
2987 }
2988 
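// Expand signed G_SDIV/G_SREM in terms of the unsigned expansion: take the
// absolute values via (x + sign) ^ sign (where sign is x >> (bits - 1)), do
// the unsigned divide/remainder, and then restore the sign of the result
// (quotient sign = sign(LHS) ^ sign(RHS), remainder sign = sign(LHS)).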
2989 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2990                                             MachineRegisterInfo &MRI,
2991                                             MachineIRBuilder &B) const {
2992   const LLT S64 = LLT::scalar(64);
2993   const LLT S32 = LLT::scalar(32);
2994 
2995   Register DstReg = MI.getOperand(0).getReg();
2996   const LLT Ty = MRI.getType(DstReg);
2997   if (Ty != S32 && Ty != S64)
2998     return false;
2999 
3000   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3001 
3002   Register LHS = MI.getOperand(1).getReg();
3003   Register RHS = MI.getOperand(2).getReg();
3004 
3005   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3006   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3007   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3008 
3009   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3010   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3011 
3012   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3013   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3014 
3015   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3016   if (Ty == S32)
3017     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3018   else
3019     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3020 
3021   Register Sign;
3022   if (IsDiv)
3023     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3024   else
3025     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3026 
3027   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3028   B.buildSub(DstReg, UDivRem, Sign);
3029 
3030   MI.eraseFromParent();
3031   return true;
3032 }
3033 
3034 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3035                                                  MachineRegisterInfo &MRI,
3036                                                  MachineIRBuilder &B) const {
3037   Register Res = MI.getOperand(0).getReg();
3038   Register LHS = MI.getOperand(1).getReg();
3039   Register RHS = MI.getOperand(2).getReg();
3040 
3041   uint16_t Flags = MI.getFlags();
3042 
3043   LLT ResTy = MRI.getType(Res);
3044   LLT S32 = LLT::scalar(32);
3045   LLT S64 = LLT::scalar(64);
3046 
3047   const MachineFunction &MF = B.getMF();
3048   bool Unsafe =
3049     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3050 
3051   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3052     return false;
3053 
3054   if (!Unsafe && ResTy == S32 &&
3055       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3056     return false;
3057 
3058   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3059     // 1 / x -> RCP(x)
3060     if (CLHS->isExactlyValue(1.0)) {
3061       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3062         .addUse(RHS)
3063         .setMIFlags(Flags);
3064 
3065       MI.eraseFromParent();
3066       return true;
3067     }
3068 
3069     // -1 / x -> RCP( FNEG(x) )
3070     if (CLHS->isExactlyValue(-1.0)) {
3071       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3072       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3073         .addUse(FNeg.getReg(0))
3074         .setMIFlags(Flags);
3075 
3076       MI.eraseFromParent();
3077       return true;
3078     }
3079   }
3080 
3081   // x / y -> x * (1.0 / y)
3082   if (Unsafe) {
3083     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3084       .addUse(RHS)
3085       .setMIFlags(Flags);
3086     B.buildFMul(Res, LHS, RCP, Flags);
3087 
3088     MI.eraseFromParent();
3089     return true;
3090   }
3091 
3092   return false;
3093 }
3094 
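// Lower 16-bit G_FDIV: extend both operands to f32, multiply the numerator by
// an f32 rcp of the denominator, truncate back to f16, and run the result
// through amdgcn.div.fixup to handle the special cases.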
3095 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3096                                          MachineRegisterInfo &MRI,
3097                                          MachineIRBuilder &B) const {
3098   Register Res = MI.getOperand(0).getReg();
3099   Register LHS = MI.getOperand(1).getReg();
3100   Register RHS = MI.getOperand(2).getReg();
3101 
3102   uint16_t Flags = MI.getFlags();
3103 
3104   LLT S16 = LLT::scalar(16);
3105   LLT S32 = LLT::scalar(32);
3106 
3107   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3108   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3109 
3110   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3111     .addUse(RHSExt.getReg(0))
3112     .setMIFlags(Flags);
3113 
3114   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3115   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3116 
3117   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3118     .addUse(RDst.getReg(0))
3119     .addUse(RHS)
3120     .addUse(LHS)
3121     .setMIFlags(Flags);
3122 
3123   MI.eraseFromParent();
3124   return true;
3125 }
3126 
3127 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3128 // to enable denorm mode; otherwise restore the function's default FP32 denorm mode.
3129 static void toggleSPDenormMode(bool Enable,
3130                                MachineIRBuilder &B,
3131                                const GCNSubtarget &ST,
3132                                AMDGPU::SIModeRegisterDefaults Mode) {
3133   // Set SP denorm mode to this value.
3134   unsigned SPDenormMode =
3135     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3136 
3137   if (ST.hasDenormModeInst()) {
3138     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3139     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3140 
3141     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3142     B.buildInstr(AMDGPU::S_DENORM_MODE)
3143       .addImm(NewDenormModeValue);
3144 
3145   } else {
3146     // Select FP32 bit field in mode register.
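    // (Hardware register MODE, offset 4, width 2: the FP32 denorm control bits.)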
3147     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3148                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3149                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3150 
3151     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3152       .addImm(SPDenormMode)
3153       .addImm(SPDenormModeBitField);
3154   }
3155 }
3156 
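// Lower 32-bit G_FDIV to the full sequence: scale the numerator and
// denominator with amdgcn.div.scale, refine an approximate reciprocal of the
// scaled denominator with a chain of FMAs (with FP32 denormals temporarily
// enabled so intermediate results are not flushed to zero), combine the pieces
// with amdgcn.div.fmas, and fix up special cases with amdgcn.div.fixup.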
3157 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3158                                          MachineRegisterInfo &MRI,
3159                                          MachineIRBuilder &B) const {
3160   Register Res = MI.getOperand(0).getReg();
3161   Register LHS = MI.getOperand(1).getReg();
3162   Register RHS = MI.getOperand(2).getReg();
3163   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3164   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3165 
3166   uint16_t Flags = MI.getFlags();
3167 
3168   LLT S32 = LLT::scalar(32);
3169   LLT S1 = LLT::scalar(1);
3170 
3171   auto One = B.buildFConstant(S32, 1.0f);
3172 
3173   auto DenominatorScaled =
3174     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3175       .addUse(LHS)
3176       .addUse(RHS)
3177       .addImm(0)
3178       .setMIFlags(Flags);
3179   auto NumeratorScaled =
3180     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3181       .addUse(LHS)
3182       .addUse(RHS)
3183       .addImm(1)
3184       .setMIFlags(Flags);
3185 
3186   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3187     .addUse(DenominatorScaled.getReg(0))
3188     .setMIFlags(Flags);
3189   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3190 
3191   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3192   // aren't modeled as reading it.
3193   if (!Mode.allFP32Denormals())
3194     toggleSPDenormMode(true, B, ST, Mode);
3195 
3196   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3197   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3198   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3199   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3200   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3201   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3202 
3203   if (!Mode.allFP32Denormals())
3204     toggleSPDenormMode(false, B, ST, Mode);
3205 
3206   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3207     .addUse(Fma4.getReg(0))
3208     .addUse(Fma1.getReg(0))
3209     .addUse(Fma3.getReg(0))
3210     .addUse(NumeratorScaled.getReg(1))
3211     .setMIFlags(Flags);
3212 
3213   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3214     .addUse(Fmas.getReg(0))
3215     .addUse(RHS)
3216     .addUse(LHS)
3217     .setMIFlags(Flags);
3218 
3219   MI.eraseFromParent();
3220   return true;
3221 }
3222 
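// Lower 64-bit G_FDIV using the same div.scale / FMA-refined reciprocal /
// div.fmas / div.fixup structure as the f32 case, plus a workaround for
// subtargets where the div.scale condition output is unusable: the scaled
// operand is re-derived by comparing the high 32-bit words (which hold the
// exponents) of the inputs against those of the div.scale results.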
3223 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3224                                          MachineRegisterInfo &MRI,
3225                                          MachineIRBuilder &B) const {
3226   Register Res = MI.getOperand(0).getReg();
3227   Register LHS = MI.getOperand(1).getReg();
3228   Register RHS = MI.getOperand(2).getReg();
3229 
3230   uint16_t Flags = MI.getFlags();
3231 
3232   LLT S64 = LLT::scalar(64);
3233   LLT S1 = LLT::scalar(1);
3234 
3235   auto One = B.buildFConstant(S64, 1.0);
3236 
3237   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3238     .addUse(LHS)
3239     .addUse(RHS)
3240     .addImm(0)
3241     .setMIFlags(Flags);
3242 
3243   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3244 
3245   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3246     .addUse(DivScale0.getReg(0))
3247     .setMIFlags(Flags);
3248 
3249   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3250   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3251   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3252 
3253   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3254     .addUse(LHS)
3255     .addUse(RHS)
3256     .addImm(1)
3257     .setMIFlags(Flags);
3258 
3259   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3260   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3261   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3262 
3263   Register Scale;
3264   if (!ST.hasUsableDivScaleConditionOutput()) {
3265     // Workaround a hardware bug on SI where the condition output from div_scale
3266     // is not usable.
3267 
3268     LLT S32 = LLT::scalar(32);
3269 
3270     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3271     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3272     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3273     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3274 
3275     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3276                               Scale1Unmerge.getReg(1));
3277     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3278                               Scale0Unmerge.getReg(1));
3279     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3280   } else {
3281     Scale = DivScale1.getReg(1);
3282   }
3283 
3284   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3285     .addUse(Fma4.getReg(0))
3286     .addUse(Fma3.getReg(0))
3287     .addUse(Mul.getReg(0))
3288     .addUse(Scale)
3289     .setMIFlags(Flags);
3290 
3291   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3292     .addUse(Fmas.getReg(0))
3293     .addUse(RHS)
3294     .addUse(LHS)
3295     .setMIFlags(Flags);
3296 
3297   MI.eraseFromParent();
3298   return true;
3299 }
3300 
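// Lower the fdiv.fast intrinsic: if |RHS| is above 0x6f800000 (2**96),
// pre-scale the denominator by 0x2f800000 (2**-32), presumably so the
// reciprocal cannot underflow and get flushed; multiply the numerator by the
// reciprocal, then apply the same scale factor to the result to compensate.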
3301 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3302                                                  MachineRegisterInfo &MRI,
3303                                                  MachineIRBuilder &B) const {
3304   Register Res = MI.getOperand(0).getReg();
3305   Register LHS = MI.getOperand(2).getReg();
3306   Register RHS = MI.getOperand(3).getReg();
3307   uint16_t Flags = MI.getFlags();
3308 
3309   LLT S32 = LLT::scalar(32);
3310   LLT S1 = LLT::scalar(1);
3311 
3312   auto Abs = B.buildFAbs(S32, RHS, Flags);
3313   const APFloat C0Val(1.0f);
3314 
3315   auto C0 = B.buildConstant(S32, 0x6f800000);
3316   auto C1 = B.buildConstant(S32, 0x2f800000);
3317   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3318 
3319   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3320   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3321 
3322   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3323 
3324   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3325     .addUse(Mul0.getReg(0))
3326     .setMIFlags(Flags);
3327 
3328   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3329 
3330   B.buildFMul(Res, Sel, Mul1, Flags);
3331 
3332   MI.eraseFromParent();
3333   return true;
3334 }
3335 
3336 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3337 // FIXME: Why do we handle this one but not other removed instructions?
3338 //
3339 // Reciprocal square root.  The clamp prevents infinite results, clamping
3340 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3341 // +-max_float.
3342 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3343                                                     MachineRegisterInfo &MRI,
3344                                                     MachineIRBuilder &B) const {
3345   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3346     return true;
3347 
3348   Register Dst = MI.getOperand(0).getReg();
3349   Register Src = MI.getOperand(2).getReg();
3350   auto Flags = MI.getFlags();
3351 
3352   LLT Ty = MRI.getType(Dst);
3353 
3354   const fltSemantics *FltSemantics;
3355   if (Ty == LLT::scalar(32))
3356     FltSemantics = &APFloat::IEEEsingle();
3357   else if (Ty == LLT::scalar(64))
3358     FltSemantics = &APFloat::IEEEdouble();
3359   else
3360     return false;
3361 
3362   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3363     .addUse(Src)
3364     .setMIFlags(Flags);
3365 
3366   // We don't need to concern ourselves with the snan handling difference; the rsq
3367   // result is already quieted (or not), so use the variant that selects directly.
3368   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3369   const bool UseIEEE = MFI->getMode().IEEE;
3370 
3371   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3372   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3373                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3374 
3375   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3376 
3377   if (UseIEEE)
3378     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3379   else
3380     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3381   MI.eraseFromParent();
3382   return true;
3383 }
3384 
3385 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3386   switch (IID) {
3387   case Intrinsic::amdgcn_ds_fadd:
3388     return AMDGPU::G_ATOMICRMW_FADD;
3389   case Intrinsic::amdgcn_ds_fmin:
3390     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3391   case Intrinsic::amdgcn_ds_fmax:
3392     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3393   default:
3394     llvm_unreachable("not a DS FP intrinsic");
3395   }
3396 }
3397 
3398 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3399                                                       MachineInstr &MI,
3400                                                       Intrinsic::ID IID) const {
3401   GISelChangeObserver &Observer = Helper.Observer;
3402   Observer.changingInstr(MI);
3403 
3404   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3405 
3406   // The remaining operands were used to set fields in the MemOperand on
3407   // construction.
3408   for (int I = 6; I > 3; --I)
3409     MI.RemoveOperand(I);
3410 
3411   MI.RemoveOperand(1); // Remove the intrinsic ID.
3412   Observer.changedInstr(MI);
3413   return true;
3414 }
3415 
3416 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3417                                             MachineRegisterInfo &MRI,
3418                                             MachineIRBuilder &B) const {
3419   uint64_t Offset =
3420     ST.getTargetLowering()->getImplicitParameterOffset(
3421       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3422   LLT DstTy = MRI.getType(DstReg);
3423   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3424 
3425   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3426   if (!loadInputValue(KernargPtrReg, B,
3427                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3428     return false;
3429 
3430   // FIXME: This should be nuw
3431   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3432   return true;
3433 }
3434 
3435 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3436                                                  MachineRegisterInfo &MRI,
3437                                                  MachineIRBuilder &B) const {
3438   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3439   if (!MFI->isEntryFunction()) {
3440     return legalizePreloadedArgIntrin(MI, MRI, B,
3441                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3442   }
3443 
3444   Register DstReg = MI.getOperand(0).getReg();
3445   if (!getImplicitArgPtr(DstReg, MRI, B))
3446     return false;
3447 
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
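// Lower the is.shared/is.private style intrinsics: a flat pointer is in the
// given segment iff the high 32 bits of the address equal that segment's
// aperture base, so compare them directly.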
3452 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3453                                               MachineRegisterInfo &MRI,
3454                                               MachineIRBuilder &B,
3455                                               unsigned AddrSpace) const {
3456   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3457   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3458   Register Hi32 = Unmerge.getReg(1);
3459 
3460   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3461   MI.eraseFromParent();
3462   return true;
3463 }
3464 
3465 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3466 // offset (the offset that is included in bounds checking and swizzling, to be
3467 // split between the instruction's voffset and immoffset fields) and soffset
3468 // (the offset that is excluded from bounds checking and swizzling, to go in
3469 // the instruction's soffset field).  This function takes the first kind of
3470 // offset and figures out how to split it between voffset and immoffset.
3471 std::tuple<Register, unsigned, unsigned>
3472 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3473                                         Register OrigOffset) const {
3474   const unsigned MaxImm = 4095;
3475   Register BaseReg;
3476   unsigned TotalConstOffset;
3477   MachineInstr *OffsetDef;
3478   const LLT S32 = LLT::scalar(32);
3479 
3480   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3481     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3482 
3483   unsigned ImmOffset = TotalConstOffset;
3484 
3485   // If the immediate value is too big for the immoffset field, keep only the low
3486   // 12 bits (value & 4095) in the immoffset field so that the value that is
3487   // copied/added for the voffset field is a multiple of 4096, and it stands more
3488   // chance of being CSEd with the copy/add for another similar load/store.
3489   // However, do not do that rounding down to a multiple of 4096 if that is a
3490   // negative number, as it appears to be illegal to have a negative offset
3491   // in the vgpr, even if adding the immediate offset makes it positive.
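  // For example, a constant offset of 5000 is split into Overflow = 4096
  // (folded into the voffset register below) and ImmOffset = 904.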
3492   unsigned Overflow = ImmOffset & ~MaxImm;
3493   ImmOffset -= Overflow;
3494   if ((int32_t)Overflow < 0) {
3495     Overflow += ImmOffset;
3496     ImmOffset = 0;
3497   }
3498 
3499   if (Overflow != 0) {
3500     if (!BaseReg) {
3501       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3502     } else {
3503       auto OverflowVal = B.buildConstant(S32, Overflow);
3504       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3505     }
3506   }
3507 
3508   if (!BaseReg)
3509     BaseReg = B.buildConstant(S32, 0).getReg(0);
3510 
3511   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3512 }
3513 
3514 /// Handle register layout difference for f16 images for some subtargets.
3515 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3516                                              MachineRegisterInfo &MRI,
3517                                              Register Reg) const {
3518   if (!ST.hasUnpackedD16VMem())
3519     return Reg;
3520 
3521   const LLT S16 = LLT::scalar(16);
3522   const LLT S32 = LLT::scalar(32);
3523   LLT StoreVT = MRI.getType(Reg);
3524   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3525 
3526   auto Unmerge = B.buildUnmerge(S16, Reg);
3527 
3528   SmallVector<Register, 4> WideRegs;
3529   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3530     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3531 
3532   int NumElts = StoreVT.getNumElements();
3533 
3534   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3535 }
3536 
3537 Register AMDGPULegalizerInfo::fixStoreSourceType(
3538   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3539   MachineRegisterInfo *MRI = B.getMRI();
3540   LLT Ty = MRI->getType(VData);
3541 
3542   const LLT S16 = LLT::scalar(16);
3543 
3544   // Fixup illegal register types for i8 stores.
3545   if (Ty == LLT::scalar(8) || Ty == S16) {
3546     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3547     return AnyExt;
3548   }
3549 
3550   if (Ty.isVector()) {
3551     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3552       if (IsFormat)
3553         return handleD16VData(B, *MRI, VData);
3554     }
3555   }
3556 
3557   return VData;
3558 }
3559 
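// Lower the raw/struct buffer store intrinsics to the target's
// G_AMDGPU_BUFFER_STORE* pseudos: widen illegal small source types, split the
// offset into the voffset register and a 12-bit immediate, and rebuild the
// operand list in the fixed vdata/rsrc/vindex/voffset/soffset order.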
3560 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3561                                               MachineRegisterInfo &MRI,
3562                                               MachineIRBuilder &B,
3563                                               bool IsTyped,
3564                                               bool IsFormat) const {
3565   Register VData = MI.getOperand(1).getReg();
3566   LLT Ty = MRI.getType(VData);
3567   LLT EltTy = Ty.getScalarType();
3568   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3569   const LLT S32 = LLT::scalar(32);
3570 
3571   VData = fixStoreSourceType(B, VData, IsFormat);
3572   Register RSrc = MI.getOperand(2).getReg();
3573 
3574   MachineMemOperand *MMO = *MI.memoperands_begin();
3575   const int MemSize = MMO->getSize();
3576 
3577   unsigned ImmOffset;
3578   unsigned TotalOffset;
3579 
3580   // The typed intrinsics add an immediate after the registers.
3581   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3582 
3583   // The struct intrinsic variants add one additional operand over raw.
3584   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3585   Register VIndex;
3586   int OpOffset = 0;
3587   if (HasVIndex) {
3588     VIndex = MI.getOperand(3).getReg();
3589     OpOffset = 1;
3590   }
3591 
3592   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3593   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3594 
3595   unsigned Format = 0;
3596   if (IsTyped) {
3597     Format = MI.getOperand(5 + OpOffset).getImm();
3598     ++OpOffset;
3599   }
3600 
3601   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3602 
3603   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3604   if (TotalOffset != 0)
3605     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3606 
3607   unsigned Opc;
3608   if (IsTyped) {
3609     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3610                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3611   } else if (IsFormat) {
3612     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3613                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3614   } else {
3615     switch (MemSize) {
3616     case 1:
3617       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3618       break;
3619     case 2:
3620       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3621       break;
3622     default:
3623       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3624       break;
3625     }
3626   }
3627 
3628   if (!VIndex)
3629     VIndex = B.buildConstant(S32, 0).getReg(0);
3630 
3631   auto MIB = B.buildInstr(Opc)
3632     .addUse(VData)              // vdata
3633     .addUse(RSrc)               // rsrc
3634     .addUse(VIndex)             // vindex
3635     .addUse(VOffset)            // voffset
3636     .addUse(SOffset)            // soffset
3637     .addImm(ImmOffset);         // offset(imm)
3638 
3639   if (IsTyped)
3640     MIB.addImm(Format);
3641 
3642   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3643      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3644      .addMemOperand(MMO);
3645 
3646   MI.eraseFromParent();
3647   return true;
3648 }
3649 
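// Lower the raw/struct buffer load intrinsics to G_AMDGPU_BUFFER_LOAD*
// pseudos. Loads that extend (sub-dword scalars) or that use the unpacked d16
// layout are emitted into a wider temporary register and then truncated or
// repacked into the original result type afterwards.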
3650 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3651                                              MachineRegisterInfo &MRI,
3652                                              MachineIRBuilder &B,
3653                                              bool IsFormat,
3654                                              bool IsTyped) const {
3655   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3656   MachineMemOperand *MMO = *MI.memoperands_begin();
3657   const int MemSize = MMO->getSize();
3658   const LLT S32 = LLT::scalar(32);
3659 
3660   Register Dst = MI.getOperand(0).getReg();
3661   Register RSrc = MI.getOperand(2).getReg();
3662 
3663   // The typed intrinsics add an immediate after the registers.
3664   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3665 
3666   // The struct intrinsic variants add one additional operand over raw.
3667   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3668   Register VIndex;
3669   int OpOffset = 0;
3670   if (HasVIndex) {
3671     VIndex = MI.getOperand(3).getReg();
3672     OpOffset = 1;
3673   }
3674 
3675   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3676   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3677 
3678   unsigned Format = 0;
3679   if (IsTyped) {
3680     Format = MI.getOperand(5 + OpOffset).getImm();
3681     ++OpOffset;
3682   }
3683 
3684   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3685   unsigned ImmOffset;
3686   unsigned TotalOffset;
3687 
3688   LLT Ty = MRI.getType(Dst);
3689   LLT EltTy = Ty.getScalarType();
3690   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3691   const bool Unpacked = ST.hasUnpackedD16VMem();
3692 
3693   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3694   if (TotalOffset != 0)
3695     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3696 
3697   unsigned Opc;
3698 
3699   if (IsTyped) {
3700     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3701                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3702   } else if (IsFormat) {
3703     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3704                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3705   } else {
3706     switch (MemSize) {
3707     case 1:
3708       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3709       break;
3710     case 2:
3711       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3712       break;
3713     default:
3714       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3715       break;
3716     }
3717   }
3718 
3719   Register LoadDstReg;
3720 
3721   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3722   LLT UnpackedTy = Ty.changeElementSize(32);
3723 
3724   if (IsExtLoad)
3725     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3726   else if (Unpacked && IsD16 && Ty.isVector())
3727     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3728   else
3729     LoadDstReg = Dst;
3730 
3731   if (!VIndex)
3732     VIndex = B.buildConstant(S32, 0).getReg(0);
3733 
3734   auto MIB = B.buildInstr(Opc)
3735     .addDef(LoadDstReg)         // vdata
3736     .addUse(RSrc)               // rsrc
3737     .addUse(VIndex)             // vindex
3738     .addUse(VOffset)            // voffset
3739     .addUse(SOffset)            // soffset
3740     .addImm(ImmOffset);         // offset(imm)
3741 
3742   if (IsTyped)
3743     MIB.addImm(Format);
3744 
3745   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3746      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3747      .addMemOperand(MMO);
3748 
3749   if (LoadDstReg != Dst) {
3750     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3751 
3752     // The result register was widened for the extending load; truncate back.
3753     if (IsExtLoad)
3754       B.buildTrunc(Dst, LoadDstReg);
3755     else {
3756       // Repack to original 16-bit vector result
3757       // FIXME: G_TRUNC should work, but legalization currently fails
3758       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3759       SmallVector<Register, 4> Repack;
3760       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3761         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3762       B.buildMerge(Dst, Repack);
3763     }
3764   }
3765 
3766   MI.eraseFromParent();
3767   return true;
3768 }
3769 
3770 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3771                                                MachineIRBuilder &B,
3772                                                bool IsInc) const {
3773   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3774                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3775   B.buildInstr(Opc)
3776     .addDef(MI.getOperand(0).getReg())
3777     .addUse(MI.getOperand(2).getReg())
3778     .addUse(MI.getOperand(3).getReg())
3779     .cloneMemRefs(MI);
3780   MI.eraseFromParent();
3781   return true;
3782 }
3783 
3784 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3785   switch (IntrID) {
3786   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3787   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3788     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3789   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3790   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3791     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3792   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3793   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3794     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3795   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3796   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3797     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3798   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3799   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3800     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3801   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3802   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3803     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3804   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3805   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3806     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3807   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3808   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3809     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3810   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3811   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3812     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3813   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3814   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3815     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3816   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3817   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3818     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3819   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3820   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3821     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3822   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3823   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3824     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3825   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3826   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3827     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3828   default:
3829     llvm_unreachable("unhandled atomic opcode");
3830   }
3831 }
3832 
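// Lower the raw/struct buffer atomic intrinsics to the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo, accounting for the optional return value,
// the extra compare operand of cmpswap, and the struct variants' vindex.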
3833 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3834                                                MachineIRBuilder &B,
3835                                                Intrinsic::ID IID) const {
3836   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3837                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3838   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3839 
3840   Register Dst;
3841 
3842   int OpOffset = 0;
3843   if (HasReturn) {
3844     // A few FP atomics do not support return values.
3845     Dst = MI.getOperand(0).getReg();
3846   } else {
3847     OpOffset = -1;
3848   }
3849 
3850   Register VData = MI.getOperand(2 + OpOffset).getReg();
3851   Register CmpVal;
3852 
3853   if (IsCmpSwap) {
3854     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3855     ++OpOffset;
3856   }
3857 
3858   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3859   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3860 
3861   // The struct intrinsic variants add one additional operand over raw.
3862   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3863   Register VIndex;
3864   if (HasVIndex) {
3865     VIndex = MI.getOperand(4 + OpOffset).getReg();
3866     ++OpOffset;
3867   }
3868 
3869   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3870   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3871   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3872 
3873   MachineMemOperand *MMO = *MI.memoperands_begin();
3874 
3875   unsigned ImmOffset;
3876   unsigned TotalOffset;
3877   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3878   if (TotalOffset != 0)
3879     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3880 
3881   if (!VIndex)
3882     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3883 
3884   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3885 
3886   if (HasReturn)
3887     MIB.addDef(Dst);
3888 
3889   MIB.addUse(VData); // vdata
3890 
3891   if (IsCmpSwap)
3892     MIB.addReg(CmpVal);
3893 
3894   MIB.addUse(RSrc)               // rsrc
3895      .addUse(VIndex)             // vindex
3896      .addUse(VOffset)            // voffset
3897      .addUse(SOffset)            // soffset
3898      .addImm(ImmOffset)          // offset(imm)
3899      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3900      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3901      .addMemOperand(MMO);
3902 
3903   MI.eraseFromParent();
3904   return true;
3905 }
3906 
3907 /// Pack the 16-bit address operands of \p MI (from \p AddrIdx up to \p EndIdx)
3908 /// into dword sized <2 x s16> registers, appending the results to \p PackedAddrs.
3909 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3910                                         SmallVectorImpl<Register> &PackedAddrs,
3911                                         int AddrIdx, int DimIdx, int EndIdx,
3912                                         int NumGradients) {
3913   const LLT S16 = LLT::scalar(16);
3914   const LLT V2S16 = LLT::vector(2, 16);
3915 
3916   for (int I = AddrIdx; I < EndIdx; ++I) {
3917     MachineOperand &SrcOp = MI.getOperand(I);
3918     if (!SrcOp.isReg())
3919       continue; // _L to _LZ may have eliminated this.
3920 
3921     Register AddrReg = SrcOp.getReg();
3922 
3923     if (I < DimIdx) {
3924       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3925       PackedAddrs.push_back(AddrReg);
3926     } else {
3927       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3928       // derivatives dx/dh and dx/dv are packed with undef.
3929       if (((I + 1) >= EndIdx) ||
3930           ((NumGradients / 2) % 2 == 1 &&
3931            (I == DimIdx + (NumGradients / 2) - 1 ||
3932             I == DimIdx + NumGradients - 1)) ||
3933           // Check for _L to _LZ optimization
3934           !MI.getOperand(I + 1).isReg()) {
3935         PackedAddrs.push_back(
3936             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3937                 .getReg(0));
3938       } else {
3939         PackedAddrs.push_back(
3940             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3941                 .getReg(0));
3942         ++I;
3943       }
3944     }
3945   }
3946 }
3947 
3948 /// Convert from separate vaddr components to a single vector address register,
3949 /// and replace the remaining operands with $noreg.
3950 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3951                                      int DimIdx, int NumVAddrs) {
3952   const LLT S32 = LLT::scalar(32);
3953 
3954   SmallVector<Register, 8> AddrRegs;
3955   for (int I = 0; I != NumVAddrs; ++I) {
3956     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3957     if (SrcOp.isReg()) {
3958       AddrRegs.push_back(SrcOp.getReg());
3959       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3960     }
3961   }
3962 
3963   int NumAddrRegs = AddrRegs.size();
3964   if (NumAddrRegs != 1) {
3965     // Round up to 8 elements for v5-v7
3966     // FIXME: Missing intermediate sized register classes and instructions.
3967     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3968       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3969       auto Undef = B.buildUndef(S32);
3970       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3971       NumAddrRegs = RoundedNumRegs;
3972     }
3973 
3974     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3975     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3976   }
3977 
3978   for (int I = 1; I != NumVAddrs; ++I) {
3979     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3980     if (SrcOp.isReg())
3981       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3982   }
3983 }
3984 
3985 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3986 ///
3987 /// Depending on the subtarget, load/store with 16-bit element data need to be
3988 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3989 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3990 /// registers.
3991 ///
3992 /// We don't want to directly select image instructions just yet, but also want
3993 /// to expose all register repacking to the legalizer/combiners. We also don't
3994 /// want a selected instruction entering RegBankSelect. In order to avoid
3995 /// defining a multitude of intermediate image instructions, directly hack on
3996 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3997 /// the now unnecessary arguments with $noreg.
3998 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3999     MachineInstr &MI, MachineIRBuilder &B,
4000     GISelChangeObserver &Observer,
4001     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
4002 
4003   const int NumDefs = MI.getNumExplicitDefs();
4004   bool IsTFE = NumDefs == 2;
4005   // We are only processing the operands of d16 image operations on subtargets
4006   // that use the unpacked register layout, or need to repack the TFE result.
4007 
4008   // TODO: Do we need to guard against already legalized intrinsics?
4009   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4010     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
4011 
4012   MachineRegisterInfo *MRI = B.getMRI();
4013   const LLT S32 = LLT::scalar(32);
4014   const LLT S16 = LLT::scalar(16);
4015   const LLT V2S16 = LLT::vector(2, 16);
4016 
4017   // Index of first address argument
4018   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4019 
4020   int NumVAddrs, NumGradients;
4021   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4022   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4023     getDMaskIdx(BaseOpcode, NumDefs);
4024   unsigned DMask = 0;
4025 
4026   // Check for 16 bit addresses and pack if true.
4027   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4028   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4029   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
4030   const bool IsG16 = GradTy == S16;
4031   const bool IsA16 = AddrTy == S16;
4032 
4033   int DMaskLanes = 0;
4034   if (!BaseOpcode->Atomic) {
4035     DMask = MI.getOperand(DMaskIdx).getImm();
4036     if (BaseOpcode->Gather4) {
4037       DMaskLanes = 4;
4038     } else if (DMask != 0) {
4039       DMaskLanes = countPopulation(DMask);
4040     } else if (!IsTFE && !BaseOpcode->Store) {
4041       // If dmask is 0, this is a no-op load. This can be eliminated.
4042       B.buildUndef(MI.getOperand(0));
4043       MI.eraseFromParent();
4044       return true;
4045     }
4046   }
4047 
4048   Observer.changingInstr(MI);
4049   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4050 
4051   unsigned NewOpcode = NumDefs == 0 ?
4052     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4053 
4054   // Track that we legalized this
4055   MI.setDesc(B.getTII().get(NewOpcode));
4056 
4057   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4058   // to be at least 1, otherwise the instruction will fail.
4059   if (IsTFE && DMask == 0) {
4060     DMask = 0x1;
4061     DMaskLanes = 1;
4062     MI.getOperand(DMaskIdx).setImm(DMask);
4063   }
4064 
4065   if (BaseOpcode->Atomic) {
4066     Register VData0 = MI.getOperand(2).getReg();
4067     LLT Ty = MRI->getType(VData0);
4068 
4069     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4070     if (Ty.isVector())
4071       return false;
4072 
4073     if (BaseOpcode->AtomicX2) {
4074       Register VData1 = MI.getOperand(3).getReg();
4075       // The two values are packed in one register.
4076       LLT PackedTy = LLT::vector(2, Ty);
4077       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4078       MI.getOperand(2).setReg(Concat.getReg(0));
4079       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4080     }
4081   }
4082 
4083   int CorrectedNumVAddrs = NumVAddrs;
4084 
4085   // Optimize _L to _LZ when _L is zero
4086   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4087         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4088     const ConstantFP *ConstantLod;
4089     const int LodIdx = AddrIdx + NumVAddrs - 1;
4090 
4091     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4092       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4093         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4094         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4095           LZMappingInfo->LZ, ImageDimIntr->Dim);
4096 
4097         // The starting indexes should remain in the same place.
4098         --NumVAddrs;
4099         --CorrectedNumVAddrs;
4100 
4101         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4102           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4103         MI.RemoveOperand(LodIdx);
4104       }
4105     }
4106   }
4107 
4108   // Optimize _mip away, when 'lod' is zero
4109   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4110     int64_t ConstantLod;
4111     const int LodIdx = AddrIdx + NumVAddrs - 1;
4112 
4113     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4114       if (ConstantLod == 0) {
4115         // TODO: Change the intrinsic opcode and remove the operand instead of
4116         // replacing it with 0, as is done for the _L to _LZ handling above.
4117         MI.getOperand(LodIdx).ChangeToImmediate(0);
4118         --CorrectedNumVAddrs;
4119       }
4120     }
4121   }
4122 
4123   // Rewrite the addressing register layout before doing anything else.
4124   if (IsA16 || IsG16) {
4125     if (IsA16) {
4126       // Target must support the feature and gradients need to be 16 bit too
4127       if (!ST.hasA16() || !IsG16)
4128         return false;
4129     } else if (!ST.hasG16())
4130       return false;
4131 
4132     if (NumVAddrs > 1) {
4133       SmallVector<Register, 4> PackedRegs;
4134       // Don't compress addresses for G16
4135       const int PackEndIdx =
4136           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4137       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4138                                   PackEndIdx, NumGradients);
4139 
4140       if (!IsA16) {
4141         // Add uncompressed address
4142         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4143           Register AddrReg = MI.getOperand(I).getReg();
4144           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4145           PackedRegs.push_back(AddrReg);
4146         }
4147       }
4148 
4149       // See also below in the non-a16 branch
4150       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4151 
4152       if (!UseNSA && PackedRegs.size() > 1) {
4153         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4154         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4155         PackedRegs[0] = Concat.getReg(0);
4156         PackedRegs.resize(1);
4157       }
4158 
4159       const int NumPacked = PackedRegs.size();
4160       for (int I = 0; I != NumVAddrs; ++I) {
4161         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4162         if (!SrcOp.isReg()) {
4163           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4164           continue;
4165         }
4166 
4167         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4168 
4169         if (I < NumPacked)
4170           SrcOp.setReg(PackedRegs[I]);
4171         else
4172           SrcOp.setReg(AMDGPU::NoRegister);
4173       }
4174     }
4175   } else {
4176     // If the register allocator cannot place the address registers contiguously
4177     // without introducing moves, then using the non-sequential address encoding
4178     // is always preferable, since it saves VALU instructions and is usually a
4179     // wash in terms of code size or even better.
4180     //
4181     // However, we currently have no way of hinting to the register allocator
4182     // that MIMG addresses should be placed contiguously when it is possible to
4183     // do so, so force non-NSA for the common 2-address case as a heuristic.
4184     //
4185     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4186     // allocation when possible.
4187     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4188 
4189     if (!UseNSA && NumVAddrs > 1)
4190       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4191   }
4192 
4193   int Flags = 0;
4194   if (IsA16)
4195     Flags |= 1;
4196   if (IsG16)
4197     Flags |= 2;
4198   MI.addOperand(MachineOperand::CreateImm(Flags));
4199 
4200   if (BaseOpcode->Store) { // No TFE for stores?
4201     // TODO: Handle dmask trim
4202     Register VData = MI.getOperand(1).getReg();
4203     LLT Ty = MRI->getType(VData);
4204     if (!Ty.isVector() || Ty.getElementType() != S16)
4205       return true;
4206 
4207     Register RepackedReg = handleD16VData(B, *MRI, VData);
4208     if (RepackedReg != VData) {
4209       MI.getOperand(1).setReg(RepackedReg);
4210     }
4211 
4212     return true;
4213   }
4214 
4215   Register DstReg = MI.getOperand(0).getReg();
4216   LLT Ty = MRI->getType(DstReg);
4217   const LLT EltTy = Ty.getScalarType();
4218   const bool IsD16 = Ty.getScalarType() == S16;
4219   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4220 
4221   // Confirm that the return type is large enough for the dmask specified
4222   if (NumElts < DMaskLanes)
4223     return false;
4224 
4225   if (NumElts > 4 || DMaskLanes > 4)
4226     return false;
4227 
4228   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4229   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4230 
4231   // The raw dword aligned data component of the load. The only legal cases
4232   // where this matters should be when using the packed D16 format, for
4233   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4234   LLT RoundedTy;
4235 
4236   // S32 vector to cover all data, plus TFE result element.
4237   LLT TFETy;
4238 
4239   // Register type to use for each loaded component. Will be S32 or V2S16.
4240   LLT RegTy;
4241 
4242   if (IsD16 && ST.hasUnpackedD16VMem()) {
4243     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4244     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4245     RegTy = S32;
4246   } else {
4247     unsigned EltSize = EltTy.getSizeInBits();
4248     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4249     unsigned RoundedSize = 32 * RoundedElts;
4250     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4251     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4252     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4253   }
4254 
4255   // The return type does not need adjustment.
4256   // TODO: Should we change s16 case to s32 or <2 x s16>?
4257   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4258     return true;
4259 
4260   Register Dst1Reg;
4261 
4262   // Insert after the instruction.
4263   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4264 
4265   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4266   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4267   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4268   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4269 
4270   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4271 
4272   MI.getOperand(0).setReg(NewResultReg);
4273 
4274   // In the IR, TFE is supposed to be used with a 2 element struct return
4275   // type. The instruction really returns these two values in one contiguous
4276   // register, with one additional dword beyond the loaded data. Rewrite the
4277   // return type to use a single register result.
4278 
4279   if (IsTFE) {
4280     Dst1Reg = MI.getOperand(1).getReg();
4281     if (MRI->getType(Dst1Reg) != S32)
4282       return false;
4283 
4284     // TODO: Make sure the TFE operand bit is set.
4285     MI.RemoveOperand(1);
4286 
4287     // Handle the easy case that requires no repack instructions.
4288     if (Ty == S32) {
4289       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4290       return true;
4291     }
4292   }
4293 
4294   // Now figure out how to copy the new result register back into the old
4295   // result.
4296   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4297 
4298   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4299 
4300   if (ResultNumRegs == 1) {
4301     assert(!IsTFE);
4302     ResultRegs[0] = NewResultReg;
4303   } else {
4304     // We have to repack into a new vector of some kind.
4305     for (int I = 0; I != NumDataRegs; ++I)
4306       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4307     B.buildUnmerge(ResultRegs, NewResultReg);
4308 
4309     // Drop the final TFE element to get the data part. The TFE result is
4310     // directly written to the right place already.
4311     if (IsTFE)
4312       ResultRegs.resize(NumDataRegs);
4313   }
4314 
  // For an s16 scalar result, the load produces an s32 which we truncate back
  // down to s16, regardless of packed vs. unpacked.
4317   if (IsD16 && !Ty.isVector()) {
4318     B.buildTrunc(DstReg, ResultRegs[0]);
4319     return true;
4320   }
4321 
4322   // Avoid a build/concat_vector of 1 entry.
4323   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4324     B.buildBitcast(DstReg, ResultRegs[0]);
4325     return true;
4326   }
4327 
4328   assert(Ty.isVector());
4329 
4330   if (IsD16) {
4331     // For packed D16 results with TFE enabled, all the data components are
4332     // S32. Cast back to the expected type.
4333     //
    // TODO: We don't really need to use s32 load elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
4336     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4337       for (Register &Reg : ResultRegs)
4338         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4339     } else if (ST.hasUnpackedD16VMem()) {
4340       for (Register &Reg : ResultRegs)
4341         Reg = B.buildTrunc(S16, Reg).getReg(0);
4342     }
4343   }
4344 
4345   auto padWithUndef = [&](LLT Ty, int NumElts) {
4346     if (NumElts == 0)
4347       return;
4348     Register Undef = B.buildUndef(Ty).getReg(0);
4349     for (int I = 0; I != NumElts; ++I)
4350       ResultRegs.push_back(Undef);
4351   };
4352 
4353   // Pad out any elements eliminated due to the dmask.
4354   LLT ResTy = MRI->getType(ResultRegs[0]);
4355   if (!ResTy.isVector()) {
4356     padWithUndef(ResTy, NumElts - ResultRegs.size());
4357     B.buildBuildVector(DstReg, ResultRegs);
4358     return true;
4359   }
4360 
4361   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4362   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4363 
  // Deal with the one annoying legal case: a <3 x s16> result. Pad out to
  // cover <6 x s16>, concatenate, and unmerge to recover the low <3 x s16>.
4365   const LLT V3S16 = LLT::vector(3, 16);
4366   if (Ty == V3S16) {
4367     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4368     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4369     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4370     return true;
4371   }
4372 
4373   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4374   B.buildConcatVectors(DstReg, ResultRegs);
4375   return true;
4376 }
4377 
4378 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4379   LegalizerHelper &Helper, MachineInstr &MI) const {
4380   MachineIRBuilder &B = Helper.MIRBuilder;
4381   GISelChangeObserver &Observer = Helper.Observer;
4382 
4383   Register Dst = MI.getOperand(0).getReg();
4384   LLT Ty = B.getMRI()->getType(Dst);
4385   unsigned Size = Ty.getSizeInBits();
4386   MachineFunction &MF = B.getMF();
4387 
4388   Observer.changingInstr(MI);
4389 
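  // If the result type isn't directly selectable (e.g. a vector with
  // sub-dword elements), perform the load on an equivalently sized s32-based
  // type instead and bitcast the result back.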
4390   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4391     Ty = getBitcastRegisterType(Ty);
4392     Helper.bitcastDst(MI, Ty, 0);
4393     Dst = MI.getOperand(0).getReg();
4394     B.setInsertPt(B.getMBB(), MI);
4395   }
4396 
4397   // FIXME: We don't really need this intermediate instruction. The intrinsic
4398   // should be fixed to have a memory operand. Since it's readnone, we're not
4399   // allowed to add one.
4400   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4401   MI.RemoveOperand(1); // Remove intrinsic ID
4402 
4403   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4404   // TODO: Should this use datalayout alignment?
4405   const unsigned MemSize = (Size + 7) / 8;
4406   const Align MemAlign(4);
4407   MachineMemOperand *MMO = MF.getMachineMemOperand(
4408       MachinePointerInfo(),
4409       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4410           MachineMemOperand::MOInvariant,
4411       MemSize, MemAlign);
4412   MI.addMemOperand(MF, MMO);
4413 
4414   // There are no 96-bit result scalar loads, but widening to 128-bit should
4415   // always be legal. We may need to restore this to a 96-bit result if it turns
4416   // out this needs to be converted to a vector load during RegBankSelect.
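  // For example, an s96 result is widened to s128, and a <3 x s32> result to
  // <4 x s32>.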
4417   if (!isPowerOf2_32(Size)) {
4418     if (Ty.isVector())
4419       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4420     else
4421       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4422   }
4423 
4424   Observer.changedInstr(MI);
4425   return true;
4426 }
4427 
4428 // TODO: Move to selection
4429 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4430                                                 MachineRegisterInfo &MRI,
4431                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4433   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4434       !ST.isTrapHandlerEnabled()) {
4435     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4436   } else {
    // Pass the queue pointer to the trap handler as an input, and insert the
    // trap instruction.
4438     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4439     MachineRegisterInfo &MRI = *B.getMRI();
4440 
4441     Register LiveIn =
4442       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4443     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4444       return false;
4445 
4446     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4447     B.buildCopy(SGPR01, LiveIn);
4448     B.buildInstr(AMDGPU::S_TRAP)
4449         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4450         .addReg(SGPR01, RegState::Implicit);
4451   }
4452 
4453   MI.eraseFromParent();
4454   return true;
4455 }
4456 
4457 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4458     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4461   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4462       !ST.isTrapHandlerEnabled()) {
4463     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4464                                      "debugtrap handler not supported",
4465                                      MI.getDebugLoc(), DS_Warning);
4466     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4467     Ctx.diagnose(NoTrap);
4468   } else {
4469     // Insert debug-trap instruction
4470     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4471   }
4472 
4473   MI.eraseFromParent();
4474   return true;
4475 }
4476 
4477 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4478                                             MachineInstr &MI) const {
4479   MachineIRBuilder &B = Helper.MIRBuilder;
4480   MachineRegisterInfo &MRI = *B.getMRI();
4481 
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4483   auto IntrID = MI.getIntrinsicID();
4484   switch (IntrID) {
4485   case Intrinsic::amdgcn_if:
4486   case Intrinsic::amdgcn_else: {
4487     MachineInstr *Br = nullptr;
4488     MachineBasicBlock *UncondBrTarget = nullptr;
4489     bool Negated = false;
4490     if (MachineInstr *BrCond =
4491             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4492       const SIRegisterInfo *TRI
4493         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4494 
4495       Register Def = MI.getOperand(1).getReg();
4496       Register Use = MI.getOperand(3).getReg();
4497 
4498       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4499 
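      // A Negated result from verifyCFIntrinsic means the branch was taken on
      // the inverted intrinsic result, so swap the branch targets rather than
      // inverting the mask.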
4500       if (Negated)
4501         std::swap(CondBrTarget, UncondBrTarget);
4502 
4503       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4504       if (IntrID == Intrinsic::amdgcn_if) {
4505         B.buildInstr(AMDGPU::SI_IF)
4506           .addDef(Def)
4507           .addUse(Use)
4508           .addMBB(UncondBrTarget);
4509       } else {
4510         B.buildInstr(AMDGPU::SI_ELSE)
4511           .addDef(Def)
4512           .addUse(Use)
4513           .addMBB(UncondBrTarget)
4514           .addImm(0);
4515       }
4516 
4517       if (Br) {
4518         Br->getOperand(0).setMBB(CondBrTarget);
4519       } else {
4520         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4521         // since we're swapping branch targets it needs to be reinserted.
4522         // FIXME: IRTranslator should probably not do this
4523         B.buildBr(*CondBrTarget);
4524       }
4525 
4526       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4527       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4528       MI.eraseFromParent();
4529       BrCond->eraseFromParent();
4530       return true;
4531     }
4532 
4533     return false;
4534   }
4535   case Intrinsic::amdgcn_loop: {
4536     MachineInstr *Br = nullptr;
4537     MachineBasicBlock *UncondBrTarget = nullptr;
4538     bool Negated = false;
4539     if (MachineInstr *BrCond =
4540             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4541       const SIRegisterInfo *TRI
4542         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4543 
4544       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4545       Register Reg = MI.getOperand(2).getReg();
4546 
4547       if (Negated)
4548         std::swap(CondBrTarget, UncondBrTarget);
4549 
4550       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4551       B.buildInstr(AMDGPU::SI_LOOP)
4552         .addUse(Reg)
4553         .addMBB(UncondBrTarget);
4554 
4555       if (Br)
4556         Br->getOperand(0).setMBB(CondBrTarget);
4557       else
4558         B.buildBr(*CondBrTarget);
4559 
4560       MI.eraseFromParent();
4561       BrCond->eraseFromParent();
4562       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4563       return true;
4564     }
4565 
4566     return false;
4567   }
4568   case Intrinsic::amdgcn_kernarg_segment_ptr:
4569     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4570       // This only makes sense to call in a kernel, so just lower to null.
4571       B.buildConstant(MI.getOperand(0).getReg(), 0);
4572       MI.eraseFromParent();
4573       return true;
4574     }
4575 
4576     return legalizePreloadedArgIntrin(
4577       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4578   case Intrinsic::amdgcn_implicitarg_ptr:
4579     return legalizeImplicitArgPtr(MI, MRI, B);
4580   case Intrinsic::amdgcn_workitem_id_x:
4581     return legalizePreloadedArgIntrin(MI, MRI, B,
4582                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4583   case Intrinsic::amdgcn_workitem_id_y:
4584     return legalizePreloadedArgIntrin(MI, MRI, B,
4585                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4586   case Intrinsic::amdgcn_workitem_id_z:
4587     return legalizePreloadedArgIntrin(MI, MRI, B,
4588                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4589   case Intrinsic::amdgcn_workgroup_id_x:
4590     return legalizePreloadedArgIntrin(MI, MRI, B,
4591                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4592   case Intrinsic::amdgcn_workgroup_id_y:
4593     return legalizePreloadedArgIntrin(MI, MRI, B,
4594                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4595   case Intrinsic::amdgcn_workgroup_id_z:
4596     return legalizePreloadedArgIntrin(MI, MRI, B,
4597                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4598   case Intrinsic::amdgcn_dispatch_ptr:
4599     return legalizePreloadedArgIntrin(MI, MRI, B,
4600                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4601   case Intrinsic::amdgcn_queue_ptr:
4602     return legalizePreloadedArgIntrin(MI, MRI, B,
4603                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4604   case Intrinsic::amdgcn_implicit_buffer_ptr:
4605     return legalizePreloadedArgIntrin(
4606       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4607   case Intrinsic::amdgcn_dispatch_id:
4608     return legalizePreloadedArgIntrin(MI, MRI, B,
4609                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4610   case Intrinsic::amdgcn_fdiv_fast:
4611     return legalizeFDIVFastIntrin(MI, MRI, B);
4612   case Intrinsic::amdgcn_is_shared:
4613     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4614   case Intrinsic::amdgcn_is_private:
4615     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4616   case Intrinsic::amdgcn_wavefrontsize: {
4617     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4618     MI.eraseFromParent();
4619     return true;
4620   }
4621   case Intrinsic::amdgcn_s_buffer_load:
4622     return legalizeSBufferLoad(Helper, MI);
4623   case Intrinsic::amdgcn_raw_buffer_store:
4624   case Intrinsic::amdgcn_struct_buffer_store:
4625     return legalizeBufferStore(MI, MRI, B, false, false);
4626   case Intrinsic::amdgcn_raw_buffer_store_format:
4627   case Intrinsic::amdgcn_struct_buffer_store_format:
4628     return legalizeBufferStore(MI, MRI, B, false, true);
4629   case Intrinsic::amdgcn_raw_tbuffer_store:
4630   case Intrinsic::amdgcn_struct_tbuffer_store:
4631     return legalizeBufferStore(MI, MRI, B, true, true);
4632   case Intrinsic::amdgcn_raw_buffer_load:
4633   case Intrinsic::amdgcn_struct_buffer_load:
4634     return legalizeBufferLoad(MI, MRI, B, false, false);
4635   case Intrinsic::amdgcn_raw_buffer_load_format:
4636   case Intrinsic::amdgcn_struct_buffer_load_format:
4637     return legalizeBufferLoad(MI, MRI, B, true, false);
4638   case Intrinsic::amdgcn_raw_tbuffer_load:
4639   case Intrinsic::amdgcn_struct_tbuffer_load:
4640     return legalizeBufferLoad(MI, MRI, B, true, true);
4641   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4642   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4643   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4644   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4645   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4646   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4647   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4648   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4649   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4650   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4651   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4652   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4653   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4654   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4655   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4656   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4657   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4658   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4659   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4660   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4661   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4662   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4663   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4664   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4665   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4666   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4667   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4668   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4669     return legalizeBufferAtomic(MI, B, IntrID);
4670   case Intrinsic::amdgcn_atomic_inc:
4671     return legalizeAtomicIncDec(MI, B, true);
4672   case Intrinsic::amdgcn_atomic_dec:
4673     return legalizeAtomicIncDec(MI, B, false);
4674   case Intrinsic::trap:
4675     return legalizeTrapIntrinsic(MI, MRI, B);
4676   case Intrinsic::debugtrap:
4677     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4678   case Intrinsic::amdgcn_rsq_clamp:
4679     return legalizeRsqClampIntrinsic(MI, MRI, B);
4680   case Intrinsic::amdgcn_ds_fadd:
4681   case Intrinsic::amdgcn_ds_fmin:
4682   case Intrinsic::amdgcn_ds_fmax:
4683     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4684   default: {
4685     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4686             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4687       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4688     return true;
4689   }
4690   }
4691 
4692   return true;
4693 }
4694