1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
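// (e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>).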
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
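// (e.g. s24 -> s32, s48 -> s64).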
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
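// Add one more element to a vector, e.g. <3 x s16> -> <4 x s16>.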
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
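// Split a wide vector into pieces of at most 64 bits; the mutation returns the
// narrower per-piece type, e.g. <4 x s32> -> <2 x s32>.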
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
114 // Increase the number of vector elements to reach the next multiple of 32-bit
115 // type.
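// E.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and <5 x s16> (80 bits)
// becomes <6 x s16> (96 bits).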
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
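// Compute the register-like type to bitcast to: a single scalar for 32 bits or
// less, otherwise s32 or a vector of s32, e.g. <6 x s16> -> <3 x s32>.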
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
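// Bitcast to a type with 32-bit elements, e.g. <4 x s16> -> <2 x s32>. The
// total size must already be a multiple of 32 bits.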
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338 
339   LLT EltTy = Ty.getElementType();
340   if (EltTy.isPointer())
341     return true;
342 
343   unsigned EltSize = EltTy.getSizeInBits();
344   return EltSize != 32 && EltSize != 64;
345 }
346 
347 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
348                              unsigned Opcode) {
349   const LLT Ty = Query.Types[0];
350   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
351          !loadStoreBitcastWorkaround(Ty);
352 }
353 
354 /// Return true if a load or store of the type should be lowered with a bitcast
355 /// to a different type.
356 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
357                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
361 
362   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
363     return true;
364   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
365          !isRegisterVectorElementType(Ty.getElementType());
366 }
367 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
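/// For example, a 96-bit load with 128-bit alignment on a subtarget without
/// dwordx3 memory operations is a candidate for widening to the next
/// power-of-2 size (128 bits).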
371 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
372                             unsigned AlignInBits, unsigned AddrSpace,
373                             unsigned Opcode) {
374   // We don't want to widen cases that are naturally legal.
375   if (isPowerOf2_32(SizeInBits))
376     return false;
377 
378   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
379   // end up widening these for a scalar load during RegBankSelect, since there
380   // aren't 96-bit scalar loads.
381   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
382     return false;
383 
384   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
385     return false;
386 
387   // A load is known dereferenceable up to the alignment, so it's legal to widen
388   // to it.
389   //
390   // TODO: Could check dereferenceable for less aligned cases.
391   unsigned RoundedSize = NextPowerOf2(SizeInBits);
392   if (AlignInBits < RoundedSize)
393     return false;
394 
395   // Do not widen if it would introduce a slow unaligned load.
396   const SITargetLowering *TLI = ST.getTargetLowering();
397   bool Fast = false;
398   return TLI->allowsMisalignedMemoryAccessesImpl(
399              RoundedSize, AddrSpace, Align(AlignInBits / 8),
400              MachineMemOperand::MOLoad, &Fast) &&
401          Fast;
402 }
403 
404 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
405                             unsigned Opcode) {
406   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
407     return false;
408 
409   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
410                          Query.MMODescrs[0].AlignInBits,
411                          Query.Types[1].getAddressSpace(), Opcode);
412 }
413 
414 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
415                                          const GCNTargetMachine &TM)
  : ST(ST_) {
417   using namespace TargetOpcode;
418 
419   auto GetAddrSpacePtr = [&TM](unsigned AS) {
420     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
421   };
422 
423   const LLT S1 = LLT::scalar(1);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S16 = LLT::vector(2, 16);
433   const LLT V4S16 = LLT::vector(4, 16);
434 
435   const LLT V2S32 = LLT::vector(2, 32);
436   const LLT V3S32 = LLT::vector(3, 32);
437   const LLT V4S32 = LLT::vector(4, 32);
438   const LLT V5S32 = LLT::vector(5, 32);
439   const LLT V6S32 = LLT::vector(6, 32);
440   const LLT V7S32 = LLT::vector(7, 32);
441   const LLT V8S32 = LLT::vector(8, 32);
442   const LLT V9S32 = LLT::vector(9, 32);
443   const LLT V10S32 = LLT::vector(10, 32);
444   const LLT V11S32 = LLT::vector(11, 32);
445   const LLT V12S32 = LLT::vector(12, 32);
446   const LLT V13S32 = LLT::vector(13, 32);
447   const LLT V14S32 = LLT::vector(14, 32);
448   const LLT V15S32 = LLT::vector(15, 32);
449   const LLT V16S32 = LLT::vector(16, 32);
450   const LLT V32S32 = LLT::vector(32, 32);
451 
452   const LLT V2S64 = LLT::vector(2, 64);
453   const LLT V3S64 = LLT::vector(3, 64);
454   const LLT V4S64 = LLT::vector(4, 64);
455   const LLT V5S64 = LLT::vector(5, 64);
456   const LLT V6S64 = LLT::vector(6, 64);
457   const LLT V7S64 = LLT::vector(7, 64);
458   const LLT V8S64 = LLT::vector(8, 64);
459   const LLT V16S64 = LLT::vector(16, 64);
460 
461   std::initializer_list<LLT> AllS32Vectors =
462     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
463      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
464   std::initializer_list<LLT> AllS64Vectors =
465     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
466 
467   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
468   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
469   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
470   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
471   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
472   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
473   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
474 
475   const LLT CodePtr = FlatPtr;
476 
477   const std::initializer_list<LLT> AddrSpaces64 = {
478     GlobalPtr, ConstantPtr, FlatPtr
479   };
480 
481   const std::initializer_list<LLT> AddrSpaces32 = {
482     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
483   };
484 
485   const std::initializer_list<LLT> FPTypesBase = {
486     S32, S64
487   };
488 
489   const std::initializer_list<LLT> FPTypes16 = {
490     S32, S64, S16
491   };
492 
493   const std::initializer_list<LLT> FPTypesPK16 = {
494     S32, S64, S16, V2S16
495   };
496 
497   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
498 
499   setAction({G_BRCOND, S1}, Legal); // VCC branches
500   setAction({G_BRCOND, S32}, Legal); // SCC branches
501 
502   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
503   // elements for v3s16
504   getActionDefinitionsBuilder(G_PHI)
505     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
506     .legalFor(AllS32Vectors)
507     .legalFor(AllS64Vectors)
508     .legalFor(AddrSpaces64)
509     .legalFor(AddrSpaces32)
510     .legalIf(isPointer(0))
511     .clampScalar(0, S16, S256)
512     .widenScalarToNextPow2(0, 32)
513     .clampMaxNumElements(0, S32, 16)
514     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515     .scalarize(0);
516 
517   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
518     // Full set of gfx9 features.
519     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
520       .legalFor({S32, S16, V2S16})
521       .clampScalar(0, S16, S32)
522       .clampMaxNumElements(0, S16, 2)
523       .scalarize(0)
524       .widenScalarToNextPow2(0, 32);
525 
526     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
527       .legalFor({S32, S16, V2S16}) // Clamp modifier
528       .minScalarOrElt(0, S16)
529       .clampMaxNumElements(0, S16, 2)
530       .scalarize(0)
531       .widenScalarToNextPow2(0, 32)
532       .lower();
533   } else if (ST.has16BitInsts()) {
534     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
535       .legalFor({S32, S16})
536       .clampScalar(0, S16, S32)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
539 
540     // Technically the saturating operations require clamp bit support, but this
541     // was introduced at the same time as 16-bit operations.
542     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
543       .legalFor({S32, S16}) // Clamp modifier
544       .minScalar(0, S16)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 16)
547       .lower();
548 
549     // We're just lowering this, but it helps get a better result to try to
550     // coerce to the desired type first.
551     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
552       .minScalar(0, S16)
553       .scalarize(0)
554       .lower();
555   } else {
556     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
557       .legalFor({S32})
558       .clampScalar(0, S32, S32)
559       .scalarize(0);
560 
561     if (ST.hasIntClamp()) {
562       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
563         .legalFor({S32}) // Clamp modifier.
564         .scalarize(0)
565         .minScalarOrElt(0, S32)
566         .lower();
567     } else {
568       // Clamp bit support was added in VI, along with 16-bit operations.
569       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
570         .minScalar(0, S32)
571         .scalarize(0)
572         .lower();
573     }
574 
575     // FIXME: DAG expansion gets better results. The widening uses the smaller
576     // range values and goes for the min/max lowering directly.
577     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
578       .minScalar(0, S32)
579       .scalarize(0)
580       .lower();
581   }
582 
583   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
584     .customFor({S32, S64})
585     .clampScalar(0, S32, S64)
586     .widenScalarToNextPow2(0, 32)
587     .scalarize(0);
588 
589   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
590     .legalFor({S32})
591     .clampScalar(0, S32, S32)
592     .scalarize(0);
593 
594   // Report legal for any types we can handle anywhere. For the cases only legal
595   // on the SALU, RegBankSelect will be able to re-legalize.
596   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
597     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
598     .clampScalar(0, S32, S64)
599     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
600     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
601     .widenScalarToNextPow2(0)
602     .scalarize(0);
603 
604   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
605                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
606     .legalFor({{S32, S1}, {S32, S32}})
607     .minScalar(0, S32)
608     // TODO: .scalarize(0)
609     .lower();
610 
611   getActionDefinitionsBuilder(G_BITCAST)
612     // Don't worry about the size constraint.
613     .legalIf(all(isRegisterType(0), isRegisterType(1)))
614     .lower();
615 
617   getActionDefinitionsBuilder(G_CONSTANT)
618     .legalFor({S1, S32, S64, S16, GlobalPtr,
619                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
620     .legalIf(isPointer(0))
621     .clampScalar(0, S32, S64)
622     .widenScalarToNextPow2(0);
623 
624   getActionDefinitionsBuilder(G_FCONSTANT)
625     .legalFor({S32, S64, S16})
626     .clampScalar(0, S16, S64);
627 
628   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
629       .legalIf(isRegisterType(0))
630       // s1 and s16 are special cases because they have legal operations on
631       // them, but don't really occupy registers in the normal way.
632       .legalFor({S1, S16})
633       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
634       .clampScalarOrElt(0, S32, MaxScalar)
635       .widenScalarToNextPow2(0, 32)
636       .clampMaxNumElements(0, S32, 16);
637 
638   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
639 
640   // If the amount is divergent, we have to do a wave reduction to get the
641   // maximum value, so this is expanded during RegBankSelect.
642   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
643     .legalFor({{PrivatePtr, S32}});
644 
645   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
646     .customIf(typeIsNot(0, PrivatePtr));
647 
648   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
649 
650   auto &FPOpActions = getActionDefinitionsBuilder(
651     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
652     .legalFor({S32, S64});
653   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
654     .customFor({S32, S64});
655   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
656     .customFor({S32, S64});
657 
658   if (ST.has16BitInsts()) {
659     if (ST.hasVOP3PInsts())
660       FPOpActions.legalFor({S16, V2S16});
661     else
662       FPOpActions.legalFor({S16});
663 
664     TrigActions.customFor({S16});
665     FDIVActions.customFor({S16});
666   }
667 
668   auto &MinNumMaxNum = getActionDefinitionsBuilder({
669       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
670 
671   if (ST.hasVOP3PInsts()) {
672     MinNumMaxNum.customFor(FPTypesPK16)
673       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
674       .clampMaxNumElements(0, S16, 2)
675       .clampScalar(0, S16, S64)
676       .scalarize(0);
677   } else if (ST.has16BitInsts()) {
678     MinNumMaxNum.customFor(FPTypes16)
679       .clampScalar(0, S16, S64)
680       .scalarize(0);
681   } else {
682     MinNumMaxNum.customFor(FPTypesBase)
683       .clampScalar(0, S32, S64)
684       .scalarize(0);
685   }
686 
687   if (ST.hasVOP3PInsts())
688     FPOpActions.clampMaxNumElements(0, S16, 2);
689 
690   FPOpActions
691     .scalarize(0)
692     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
693 
694   TrigActions
695     .scalarize(0)
696     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
697 
698   FDIVActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   getActionDefinitionsBuilder({G_FNEG, G_FABS})
703     .legalFor(FPTypesPK16)
704     .clampMaxNumElements(0, S16, 2)
705     .scalarize(0)
706     .clampScalar(0, S16, S64);
707 
708   if (ST.has16BitInsts()) {
709     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
710       .legalFor({S32, S64, S16})
711       .scalarize(0)
712       .clampScalar(0, S16, S64);
713   } else {
714     getActionDefinitionsBuilder(G_FSQRT)
715       .legalFor({S32, S64})
716       .scalarize(0)
717       .clampScalar(0, S32, S64);
718 
719     if (ST.hasFractBug()) {
720       getActionDefinitionsBuilder(G_FFLOOR)
721         .customFor({S64})
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     } else {
726       getActionDefinitionsBuilder(G_FFLOOR)
727         .legalFor({S32, S64})
728         .scalarize(0)
729         .clampScalar(0, S32, S64);
730     }
731   }
732 
733   getActionDefinitionsBuilder(G_FPTRUNC)
734     .legalFor({{S32, S64}, {S16, S32}})
735     .scalarize(0)
736     .lower();
737 
738   getActionDefinitionsBuilder(G_FPEXT)
739     .legalFor({{S64, S32}, {S32, S16}})
740     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
741     .scalarize(0);
742 
743   getActionDefinitionsBuilder(G_FSUB)
744       // Use actual fsub instruction
745       .legalFor({S32})
746       // Must use fadd + fneg
747       .lowerFor({S64, S16, V2S16})
748       .scalarize(0)
749       .clampScalar(0, S32, S64);
750 
751   // Whether this is legal depends on the floating point mode for the function.
752   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
753   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
754     FMad.customFor({S32, S16});
755   else if (ST.hasMadMacF32Insts())
756     FMad.customFor({S32});
757   else if (ST.hasMadF16())
758     FMad.customFor({S16});
759   FMad.scalarize(0)
760       .lower();
761 
762   auto &FRem = getActionDefinitionsBuilder(G_FREM);
763   if (ST.has16BitInsts()) {
764     FRem.customFor({S16, S32, S64});
765   } else {
766     FRem.minScalar(0, S32)
767         .customFor({S32, S64});
768   }
769   FRem.scalarize(0);
770 
771   // TODO: Do we need to clamp maximum bitwidth?
772   getActionDefinitionsBuilder(G_TRUNC)
773     .legalIf(isScalar(0))
774     .legalFor({{V2S16, V2S32}})
775     .clampMaxNumElements(0, S16, 2)
776     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
777     // situations (like an invalid implicit use), we don't want to infinite loop
778     // in the legalizer.
779     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
780     .alwaysLegal();
781 
782   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
783     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
784                {S32, S1}, {S64, S1}, {S16, S1}})
785     .scalarize(0)
786     .clampScalar(0, S32, S64)
787     .widenScalarToNextPow2(1, 32);
788 
789   // TODO: Split s1->s64 during regbankselect for VALU.
790   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
791     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
792     .lowerFor({{S32, S64}})
793     .lowerIf(typeIs(1, S1))
794     .customFor({{S64, S64}});
795   if (ST.has16BitInsts())
796     IToFP.legalFor({{S16, S16}});
797   IToFP.clampScalar(1, S32, S64)
798        .minScalar(0, S32)
799        .scalarize(0)
800        .widenScalarToNextPow2(1);
801 
802   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
803     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
804     .customFor({{S64, S64}})
805     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
806   if (ST.has16BitInsts())
807     FPToI.legalFor({{S16, S16}});
808   else
809     FPToI.minScalar(1, S32);
810 
811   FPToI.minScalar(0, S32)
812        .scalarize(0)
813        .lower();
814 
815   // Lower roundeven into G_FRINT
816   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
817     .scalarize(0)
818     .lower();
819 
820   if (ST.has16BitInsts()) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S16, S32, S64})
823       .clampScalar(0, S16, S64)
824       .scalarize(0);
825   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32, S64})
828       .clampScalar(0, S32, S64)
829       .scalarize(0);
830   } else {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S32})
833       .customFor({S64})
834       .clampScalar(0, S32, S64)
835       .scalarize(0);
836   }
837 
838   getActionDefinitionsBuilder(G_PTR_ADD)
839     .legalIf(all(isPointer(0), sameSize(0, 1)))
840     .scalarize(0)
841     .scalarSameSizeAs(1, 0);
842 
843   getActionDefinitionsBuilder(G_PTRMASK)
844     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
845     .scalarSameSizeAs(1, 0)
846     .scalarize(0);
847 
848   auto &CmpBuilder =
849     getActionDefinitionsBuilder(G_ICMP)
850     // The compare output type differs based on the register bank of the output,
851     // so make both s1 and s32 legal.
852     //
853     // Scalar compares producing output in scc will be promoted to s32, as that
854     // is the allocatable register type that will be needed for the copy from
855     // scc. This will be promoted during RegBankSelect, and we assume something
856     // before that won't try to use s32 result types.
857     //
858     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
859     // bank.
860     .legalForCartesianProduct(
861       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
862     .legalForCartesianProduct(
863       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
864   if (ST.has16BitInsts()) {
865     CmpBuilder.legalFor({{S1, S16}});
866   }
867 
868   CmpBuilder
869     .widenScalarToNextPow2(1)
870     .clampScalar(1, S32, S64)
871     .scalarize(0)
872     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
873 
874   getActionDefinitionsBuilder(G_FCMP)
875     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
876     .widenScalarToNextPow2(1)
877     .clampScalar(1, S32, S64)
878     .scalarize(0);
879 
880   // FIXME: fpow has a selection pattern that should move to custom lowering.
881   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
882   if (ST.has16BitInsts())
883     Exp2Ops.legalFor({S32, S16});
884   else
885     Exp2Ops.legalFor({S32});
886   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
887   Exp2Ops.scalarize(0);
888 
889   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
890   if (ST.has16BitInsts())
891     ExpOps.customFor({{S32}, {S16}});
892   else
893     ExpOps.customFor({S32});
894   ExpOps.clampScalar(0, MinScalarFPTy, S32)
895         .scalarize(0);
896 
897   getActionDefinitionsBuilder(G_FPOWI)
898     .clampScalar(0, MinScalarFPTy, S32)
899     .lower();
900 
901   // The 64-bit versions produce 32-bit results, but only on the SALU.
902   getActionDefinitionsBuilder(G_CTPOP)
903     .legalFor({{S32, S32}, {S32, S64}})
904     .clampScalar(0, S32, S32)
905     .clampScalar(1, S32, S64)
906     .scalarize(0)
907     .widenScalarToNextPow2(0, 32)
908     .widenScalarToNextPow2(1, 32);
909 
910   // The hardware instructions return a different result on 0 than the generic
911   // instructions expect. The hardware produces -1, but these produce the
912   // bitwidth.
913   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
914     .scalarize(0)
915     .clampScalar(0, S32, S32)
916     .clampScalar(1, S32, S64)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   getActionDefinitionsBuilder(G_BITREVERSE)
931     .legalFor({S32})
932     .clampScalar(0, S32, S32)
933     .scalarize(0);
934 
935   if (ST.has16BitInsts()) {
936     getActionDefinitionsBuilder(G_BSWAP)
937       .legalFor({S16, S32, V2S16})
938       .clampMaxNumElements(0, S16, 2)
939       // FIXME: Fixing non-power-of-2 before clamp is workaround for
940       // narrowScalar limitation.
941       .widenScalarToNextPow2(0)
942       .clampScalar(0, S16, S32)
943       .scalarize(0);
944 
945     if (ST.hasVOP3PInsts()) {
946       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
947         .legalFor({S32, S16, V2S16})
948         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
949         .clampMaxNumElements(0, S16, 2)
950         .minScalar(0, S16)
951         .widenScalarToNextPow2(0)
952         .scalarize(0)
953         .lower();
954     } else {
955       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
956         .legalFor({S32, S16})
957         .widenScalarToNextPow2(0)
958         .minScalar(0, S16)
959         .scalarize(0)
960         .lower();
961     }
962   } else {
963     // TODO: Should have same legality without v_perm_b32
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S32})
966       .lowerIf(scalarNarrowerThan(0, 32))
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .maxScalar(0, S32)
971       .scalarize(0)
972       .lower();
973 
974     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
975       .legalFor({S32})
976       .minScalar(0, S32)
977       .widenScalarToNextPow2(0)
978       .scalarize(0)
979       .lower();
980   }
981 
982   getActionDefinitionsBuilder(G_INTTOPTR)
983     // List the common cases
984     .legalForCartesianProduct(AddrSpaces64, {S64})
985     .legalForCartesianProduct(AddrSpaces32, {S32})
986     .scalarize(0)
987     // Accept any address space as long as the size matches
988     .legalIf(sameSize(0, 1))
989     .widenScalarIf(smallerThan(1, 0),
990       [](const LegalityQuery &Query) {
991         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
992       })
993     .narrowScalarIf(largerThan(1, 0),
994       [](const LegalityQuery &Query) {
995         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
996       });
997 
998   getActionDefinitionsBuilder(G_PTRTOINT)
999     // List the common cases
1000     .legalForCartesianProduct(AddrSpaces64, {S64})
1001     .legalForCartesianProduct(AddrSpaces32, {S32})
1002     .scalarize(0)
1003     // Accept any address space as long as the size matches
1004     .legalIf(sameSize(0, 1))
1005     .widenScalarIf(smallerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       })
1009     .narrowScalarIf(
1010       largerThan(0, 1),
1011       [](const LegalityQuery &Query) {
1012         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1013       });
1014 
1015   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1016     .scalarize(0)
1017     .custom();
1018 
1019   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1020                                     bool IsLoad) -> bool {
1021     const LLT DstTy = Query.Types[0];
1022 
1023     // Split vector extloads.
1024     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1025     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1026 
1027     if (MemSize < DstTy.getSizeInBits())
1028       MemSize = std::max(MemSize, AlignBits);
1029 
1030     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1031       return true;
1032 
1033     const LLT PtrTy = Query.Types[1];
1034     unsigned AS = PtrTy.getAddressSpace();
1035     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1036       return true;
1037 
1038     // Catch weird sized loads that don't evenly divide into the access sizes
1039     // TODO: May be able to widen depending on alignment etc.
1040     unsigned NumRegs = (MemSize + 31) / 32;
1041     if (NumRegs == 3) {
1042       if (!ST.hasDwordx3LoadStores())
1043         return true;
1044     } else {
1045       // If the alignment allows, these should have been widened.
1046       if (!isPowerOf2_32(NumRegs))
1047         return true;
1048     }
1049 
1050     if (AlignBits < MemSize) {
1051       const SITargetLowering *TLI = ST.getTargetLowering();
1052       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1053                                                       Align(AlignBits / 8));
1054     }
1055 
1056     return false;
1057   };
1058 
1059   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1060   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1061   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1062 
1063   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1064   // LDS
1065   // TODO: Unsupported flat for SI.
1066 
1067   for (unsigned Op : {G_LOAD, G_STORE}) {
1068     const bool IsStore = Op == G_STORE;
1069 
1070     auto &Actions = getActionDefinitionsBuilder(Op);
1071     // Explicitly list some common cases.
1072     // TODO: Does this help compile time at all?
1073     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1074                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1075                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1076                                       {S64, GlobalPtr, 64, GlobalAlign32},
1077                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1078                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1079                                       {S32, GlobalPtr, 8, GlobalAlign8},
1080                                       {S32, GlobalPtr, 16, GlobalAlign16},
1081 
1082                                       {S32, LocalPtr, 32, 32},
1083                                       {S64, LocalPtr, 64, 32},
1084                                       {V2S32, LocalPtr, 64, 32},
1085                                       {S32, LocalPtr, 8, 8},
1086                                       {S32, LocalPtr, 16, 16},
1087                                       {V2S16, LocalPtr, 32, 32},
1088 
1089                                       {S32, PrivatePtr, 32, 32},
1090                                       {S32, PrivatePtr, 8, 8},
1091                                       {S32, PrivatePtr, 16, 16},
1092                                       {V2S16, PrivatePtr, 32, 32},
1093 
1094                                       {S32, ConstantPtr, 32, GlobalAlign32},
1095                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1096                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1097                                       {S64, ConstantPtr, 64, GlobalAlign32},
1098                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1099     Actions.legalIf(
1100       [=](const LegalityQuery &Query) -> bool {
1101         return isLoadStoreLegal(ST, Query, Op);
1102       });
1103 
1104     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1105     // 64-bits.
1106     //
1107     // TODO: Should generalize bitcast action into coerce, which will also cover
1108     // inserting addrspacecasts.
1109     Actions.customIf(typeIs(1, Constant32Ptr));
1110 
1111     // Turn any illegal element vectors into something easier to deal
1112     // with. These will ultimately produce 32-bit scalar shifts to extract the
1113     // parts anyway.
1114     //
1115     // For odd 16-bit element vectors, prefer to split those into pieces with
1116     // 16-bit vector parts.
1117     Actions.bitcastIf(
1118       [=](const LegalityQuery &Query) -> bool {
1119         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1120                                           Query.MMODescrs[0].SizeInBits);
1121       }, bitcastToRegisterType(0));
1122 
1123     if (!IsStore) {
1124       // Widen suitably aligned loads by loading extra bytes. The standard
1125       // legalization actions can't properly express widening memory operands.
1126       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1127         return shouldWidenLoad(ST, Query, G_LOAD);
1128       });
1129     }
1130 
1131     // FIXME: load/store narrowing should be moved to lower action
1132     Actions
1133         .narrowScalarIf(
1134             [=](const LegalityQuery &Query) -> bool {
1135               return !Query.Types[0].isVector() &&
1136                      needToSplitMemOp(Query, Op == G_LOAD);
1137             },
1138             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139               const LLT DstTy = Query.Types[0];
1140               const LLT PtrTy = Query.Types[1];
1141 
1142               const unsigned DstSize = DstTy.getSizeInBits();
1143               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1144 
1145               // Split extloads.
1146               if (DstSize > MemSize)
1147                 return std::make_pair(0, LLT::scalar(MemSize));
1148 
1149               if (!isPowerOf2_32(DstSize)) {
1150                 // We're probably decomposing an odd sized store. Try to split
1151                 // to the widest type. TODO: Account for alignment. As-is it
1152                 // should be OK, since the new parts will be further legalized.
1153                 unsigned FloorSize = PowerOf2Floor(DstSize);
1154                 return std::make_pair(0, LLT::scalar(FloorSize));
1155               }
1156 
1157               if (DstSize > 32 && (DstSize % 32 != 0)) {
1158                 // FIXME: Need a way to specify non-extload of larger size if
1159                 // suitably aligned.
1160                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1161               }
1162 
1163               unsigned MaxSize = maxSizeForAddrSpace(ST,
1164                                                      PtrTy.getAddressSpace(),
1165                                                      Op == G_LOAD);
1166               if (MemSize > MaxSize)
1167                 return std::make_pair(0, LLT::scalar(MaxSize));
1168 
1169               unsigned Align = Query.MMODescrs[0].AlignInBits;
1170               return std::make_pair(0, LLT::scalar(Align));
1171             })
1172         .fewerElementsIf(
1173             [=](const LegalityQuery &Query) -> bool {
1174               return Query.Types[0].isVector() &&
1175                      needToSplitMemOp(Query, Op == G_LOAD);
1176             },
1177             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1178               const LLT DstTy = Query.Types[0];
1179               const LLT PtrTy = Query.Types[1];
1180 
1181               LLT EltTy = DstTy.getElementType();
1182               unsigned MaxSize = maxSizeForAddrSpace(ST,
1183                                                      PtrTy.getAddressSpace(),
1184                                                      Op == G_LOAD);
1185 
1186               // FIXME: Handle widened to power of 2 results better. This ends
1187               // up scalarizing.
1188               // FIXME: 3 element stores scalarized on SI
1189 
1190               // Split if it's too large for the address space.
1191               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1192                 unsigned NumElts = DstTy.getNumElements();
1193                 unsigned EltSize = EltTy.getSizeInBits();
1194 
1195                 if (MaxSize % EltSize == 0) {
1196                   return std::make_pair(
1197                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1198                 }
1199 
1200                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1201 
1202                 // FIXME: Refine when odd breakdowns handled
1203                 // The scalars will need to be re-legalized.
1204                 if (NumPieces == 1 || NumPieces >= NumElts ||
1205                     NumElts % NumPieces != 0)
1206                   return std::make_pair(0, EltTy);
1207 
1208                 return std::make_pair(0,
1209                                       LLT::vector(NumElts / NumPieces, EltTy));
1210               }
1211 
1212               // FIXME: We could probably handle weird extending loads better.
1213               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1214               if (DstTy.getSizeInBits() > MemSize)
1215                 return std::make_pair(0, EltTy);
1216 
1217               unsigned EltSize = EltTy.getSizeInBits();
1218               unsigned DstSize = DstTy.getSizeInBits();
1219               if (!isPowerOf2_32(DstSize)) {
1220                 // We're probably decomposing an odd sized store. Try to split
1221                 // to the widest type. TODO: Account for alignment. As-is it
1222                 // should be OK, since the new parts will be further legalized.
1223                 unsigned FloorSize = PowerOf2Floor(DstSize);
1224                 return std::make_pair(
1225                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1226               }
1227 
1228               // Need to split because of alignment.
1229               unsigned Align = Query.MMODescrs[0].AlignInBits;
1230               if (EltSize > Align &&
1231                   (EltSize / Align < DstTy.getNumElements())) {
1232                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1233               }
1234 
1235               // May need relegalization for the scalars.
1236               return std::make_pair(0, EltTy);
1237             })
1238     .lowerIfMemSizeNotPow2()
1239     .minScalar(0, S32);
1240 
1241     if (IsStore)
1242       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1243 
1244     Actions
1245         .widenScalarToNextPow2(0)
1246         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1247         .lower();
1248   }
1249 
1250   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1251                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1252                                                   {S32, GlobalPtr, 16, 2 * 8},
1253                                                   {S32, LocalPtr, 8, 8},
1254                                                   {S32, LocalPtr, 16, 16},
1255                                                   {S32, PrivatePtr, 8, 8},
1256                                                   {S32, PrivatePtr, 16, 16},
1257                                                   {S32, ConstantPtr, 8, 8},
1258                                                   {S32, ConstantPtr, 16, 2 * 8}});
1259   if (ST.hasFlatAddressSpace()) {
1260     ExtLoads.legalForTypesWithMemDesc(
1261         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1262   }
1263 
1264   ExtLoads.clampScalar(0, S32, S32)
1265           .widenScalarToNextPow2(0)
1266           .unsupportedIfMemSizeNotPow2()
1267           .lower();
1268 
1269   auto &Atomics = getActionDefinitionsBuilder(
1270     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1271      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1272      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1273      G_ATOMICRMW_UMIN})
1274     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1275                {S64, GlobalPtr}, {S64, LocalPtr},
1276                {S32, RegionPtr}, {S64, RegionPtr}});
1277   if (ST.hasFlatAddressSpace()) {
1278     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1279   }
1280 
1281   if (ST.hasLDSFPAtomics()) {
1282     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1283       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1284   }
1285 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1288   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1289     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1290                 {S32, FlatPtr}, {S64, FlatPtr}})
1291     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1292                {S32, RegionPtr}, {S64, RegionPtr}});
1293   // TODO: Pointer types, any 32-bit or 64-bit vector
1294 
1295   // Condition should be s32 for scalar, s1 for vector.
1296   getActionDefinitionsBuilder(G_SELECT)
1297     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1298           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1299           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1300     .clampScalar(0, S16, S64)
1301     .scalarize(1)
1302     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1303     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1304     .clampMaxNumElements(0, S32, 2)
1305     .clampMaxNumElements(0, LocalPtr, 2)
1306     .clampMaxNumElements(0, PrivatePtr, 2)
1307     .scalarize(0)
1308     .widenScalarToNextPow2(0)
1309     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1310 
1311   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1312   // be more flexible with the shift amount type.
1313   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1314     .legalFor({{S32, S32}, {S64, S32}});
1315   if (ST.has16BitInsts()) {
1316     if (ST.hasVOP3PInsts()) {
1317       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1318             .clampMaxNumElements(0, S16, 2);
1319     } else
1320       Shifts.legalFor({{S16, S16}});
1321 
1322     // TODO: Support 16-bit shift amounts for all types
1323     Shifts.widenScalarIf(
1324       [=](const LegalityQuery &Query) {
1325         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1326         // 32-bit amount.
1327         const LLT ValTy = Query.Types[0];
1328         const LLT AmountTy = Query.Types[1];
1329         return ValTy.getSizeInBits() <= 16 &&
1330                AmountTy.getSizeInBits() < 16;
1331       }, changeTo(1, S16));
1332     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1333     Shifts.clampScalar(1, S32, S32);
1334     Shifts.clampScalar(0, S16, S64);
1335     Shifts.widenScalarToNextPow2(0, 16);
1336 
1337     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1338       .minScalar(0, S16)
1339       .scalarize(0)
1340       .lower();
1341   } else {
1342     // Make sure we legalize the shift amount type first, as the general
1343     // expansion for the shifted type will produce much worse code if it hasn't
1344     // been truncated already.
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S32, S64);
1347     Shifts.widenScalarToNextPow2(0, 32);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S32)
1351       .scalarize(0)
1352       .lower();
1353   }
1354   Shifts.scalarize(0);
1355 
1356   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1357     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1358     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1359     unsigned IdxTypeIdx = 2;
1360 
1361     getActionDefinitionsBuilder(Op)
1362       .customIf([=](const LegalityQuery &Query) {
1363           const LLT EltTy = Query.Types[EltTypeIdx];
1364           const LLT VecTy = Query.Types[VecTypeIdx];
1365           const LLT IdxTy = Query.Types[IdxTypeIdx];
1366           const unsigned EltSize = EltTy.getSizeInBits();
1367           return (EltSize == 32 || EltSize == 64) &&
1368                   VecTy.getSizeInBits() % 32 == 0 &&
1369                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1370                   IdxTy.getSizeInBits() == 32;
1371         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1373                  bitcastToVectorElement32(VecTypeIdx))
1374       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1375       .bitcastIf(
1376         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1377         [=](const LegalityQuery &Query) {
1378           // For > 64-bit element types, try to turn this into a 64-bit
1379           // element vector since we may be able to do better indexing
1380           // if this is scalar. If not, fall back to 32.
1381           const LLT EltTy = Query.Types[EltTypeIdx];
1382           const LLT VecTy = Query.Types[VecTypeIdx];
1383           const unsigned DstEltSize = EltTy.getSizeInBits();
1384           const unsigned VecSize = VecTy.getSizeInBits();
1385 
1386           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1387           return std::make_pair(
1388             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1389         })
1390       .clampScalar(EltTypeIdx, S32, S64)
1391       .clampScalar(VecTypeIdx, S32, S64)
1392       .clampScalar(IdxTypeIdx, S32, S32)
1393       .clampMaxNumElements(VecTypeIdx, S32, 32)
1394       // TODO: Clamp elements for 64-bit vectors?
1395       // It should only be necessary with variable indexes.
1396       // As a last resort, lower to the stack
1397       .lower();
1398   }
1399 
1400   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1401     .unsupportedIf([=](const LegalityQuery &Query) {
1402         const LLT &EltTy = Query.Types[1].getElementType();
1403         return Query.Types[0] != EltTy;
1404       });
1405 
1406   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1407     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1408     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1409 
1410     // FIXME: Doesn't handle extract of illegal sizes.
1411     getActionDefinitionsBuilder(Op)
1412       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1413       // FIXME: Multiples of 16 should not be legal.
1414       .legalIf([=](const LegalityQuery &Query) {
1415           const LLT BigTy = Query.Types[BigTyIdx];
1416           const LLT LitTy = Query.Types[LitTyIdx];
1417           return (BigTy.getSizeInBits() % 32 == 0) &&
1418                  (LitTy.getSizeInBits() % 16 == 0);
1419         })
1420       .widenScalarIf(
1421         [=](const LegalityQuery &Query) {
1422           const LLT BigTy = Query.Types[BigTyIdx];
1423           return (BigTy.getScalarSizeInBits() < 16);
1424         },
1425         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1426       .widenScalarIf(
1427         [=](const LegalityQuery &Query) {
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (LitTy.getScalarSizeInBits() < 16);
1430         },
1431         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1432       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1433       .widenScalarToNextPow2(BigTyIdx, 32);
1434 
1435   }
1436 
1437   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1438     .legalForCartesianProduct(AllS32Vectors, {S32})
1439     .legalForCartesianProduct(AllS64Vectors, {S64})
1440     .clampNumElements(0, V16S32, V32S32)
1441     .clampNumElements(0, V2S64, V16S64)
1442     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1443 
1444   if (ST.hasScalarPackInsts()) {
1445     BuildVector
1446       // FIXME: Should probably widen s1 vectors straight to s32
1447       .minScalarOrElt(0, S16)
1448       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1449       .minScalar(1, S32);
1450 
1451     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1452       .legalFor({V2S16, S32})
1453       .lower();
1454     BuildVector.minScalarOrElt(0, S32);
1455   } else {
1456     BuildVector.customFor({V2S16, S16});
1457     BuildVector.minScalarOrElt(0, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .customFor({V2S16, S32})
1461       .lower();
1462   }
1463 
1464   BuildVector.legalIf(isRegisterType(0));
1465 
1466   // FIXME: Clamp maximum size
1467   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1468     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1469     .clampMaxNumElements(0, S32, 32)
1470     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1471     .clampMaxNumElements(0, S16, 64);
1472 
1473   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1474   // pre-legalize.
1475   if (ST.hasVOP3PInsts()) {
1476     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1477       .customFor({V2S16, V2S16})
1478       .lower();
1479   } else
1480     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1481 
1482   // Merge/Unmerge
1483   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1484     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1485     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1486 
1487     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1488       const LLT Ty = Query.Types[TypeIdx];
1489       if (Ty.isVector()) {
1490         const LLT &EltTy = Ty.getElementType();
1491         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1492           return true;
1493         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1494           return true;
1495       }
1496       return false;
1497     };
1498 
1499     auto &Builder = getActionDefinitionsBuilder(Op)
1500       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1501       .lowerFor({{S16, V2S16}})
1502       .lowerIf([=](const LegalityQuery &Query) {
1503           const LLT BigTy = Query.Types[BigTyIdx];
1504           return BigTy.getSizeInBits() == 32;
1505         })
1506       // Try to widen to s16 first for small types.
1507       // TODO: Only do this on targets with legal s16 shifts
1508       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1509       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1510       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1511       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1512                            elementTypeIs(1, S16)),
1513                        changeTo(1, V2S16))
1514       // Clamp the little scalar to s32-s512 and make it a power of 2. It's
1515       // not worth considering the multiples of 64 since 2*192 and 2*384 are
1516       // not valid.
1517       .clampScalar(LitTyIdx, S32, S512)
1518       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1519       // Break up vectors with weird elements into scalars
1520       .fewerElementsIf(
1521         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1522         scalarize(0))
1523       .fewerElementsIf(
1524         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1525         scalarize(1))
1526       .clampScalar(BigTyIdx, S32, MaxScalar);
1527 
1528     if (Op == G_MERGE_VALUES) {
1529       Builder.widenScalarIf(
1530         // TODO: Use 16-bit shifts if legal for 8-bit values?
1531         [=](const LegalityQuery &Query) {
1532           const LLT Ty = Query.Types[LitTyIdx];
1533           return Ty.getSizeInBits() < 32;
1534         },
1535         changeTo(LitTyIdx, S32));
1536     }
1537 
1538     Builder.widenScalarIf(
1539       [=](const LegalityQuery &Query) {
1540         const LLT Ty = Query.Types[BigTyIdx];
1541         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1542           Ty.getSizeInBits() % 16 != 0;
1543       },
1544       [=](const LegalityQuery &Query) {
1545         // Pick the next power of 2, or a multiple of 64 over 128,
1546         // whichever is smaller.
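             // For example (purely illustrative): an s300 big type is not a
             // power of 2 and not a multiple of 16, so it would widen to s320
             // (alignTo<64>(301)) rather than to the next power of 2, s512.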
1547         const LLT &Ty = Query.Types[BigTyIdx];
1548         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1549         if (NewSizeInBits >= 256) {
1550           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1551           if (RoundedTo < NewSizeInBits)
1552             NewSizeInBits = RoundedTo;
1553         }
1554         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1555       })
1556       // Any vectors left are the wrong size. Scalarize them.
1557       .scalarize(0)
1558       .scalarize(1);
1559   }
1560 
1561   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1562   // RegBankSelect.
1563   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1564     .legalFor({{S32}, {S64}});
1565 
1566   if (ST.hasVOP3PInsts()) {
1567     SextInReg.lowerFor({{V2S16}})
1568       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1569       // get more vector shift opportunities, since we'll get those when
1570       // expanded.
1571       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1572   } else if (ST.has16BitInsts()) {
1573     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1574   } else {
1575     // Prefer to promote to s32 before lowering if we don't have 16-bit
1576     // shifts. This avoids a lot of intermediate truncate and extend
         // operations.
1577     SextInReg.lowerFor({{S32}, {S64}});
1578   }
1579 
1580   SextInReg
1581     .scalarize(0)
1582     .clampScalar(0, S32, S64)
1583     .lower();
1584 
1585   getActionDefinitionsBuilder(G_FSHR)
1586     .legalFor({{S32, S32}})
1587     .scalarize(0)
1588     .lower();
1589 
1590   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1591     .legalFor({S64});
1592 
1593   getActionDefinitionsBuilder(G_FENCE)
1594     .alwaysLegal();
1595 
1596   getActionDefinitionsBuilder({
1597       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1598       G_FCOPYSIGN,
1599 
1600       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1601       G_ATOMICRMW_NAND,
1602       G_ATOMICRMW_FSUB,
1603       G_READ_REGISTER,
1604       G_WRITE_REGISTER,
1605 
1606       G_SADDO, G_SSUBO,
1607 
1608       // TODO: Implement
1609       G_FMINIMUM, G_FMAXIMUM,
1610       G_FSHL
1611     }).lower();
1612 
1613   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1614         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1615         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1616     .unsupported();
1617 
1618   computeTables();
1619   verify(*ST.getInstrInfo());
1620 }
1621 
1622 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1623                                          MachineInstr &MI) const {
1624   MachineIRBuilder &B = Helper.MIRBuilder;
1625   MachineRegisterInfo &MRI = *B.getMRI();
1626 
1627   switch (MI.getOpcode()) {
1628   case TargetOpcode::G_ADDRSPACE_CAST:
1629     return legalizeAddrSpaceCast(MI, MRI, B);
1630   case TargetOpcode::G_FRINT:
1631     return legalizeFrint(MI, MRI, B);
1632   case TargetOpcode::G_FCEIL:
1633     return legalizeFceil(MI, MRI, B);
1634   case TargetOpcode::G_FREM:
1635     return legalizeFrem(MI, MRI, B);
1636   case TargetOpcode::G_INTRINSIC_TRUNC:
1637     return legalizeIntrinsicTrunc(MI, MRI, B);
1638   case TargetOpcode::G_SITOFP:
1639     return legalizeITOFP(MI, MRI, B, true);
1640   case TargetOpcode::G_UITOFP:
1641     return legalizeITOFP(MI, MRI, B, false);
1642   case TargetOpcode::G_FPTOSI:
1643     return legalizeFPTOI(MI, MRI, B, true);
1644   case TargetOpcode::G_FPTOUI:
1645     return legalizeFPTOI(MI, MRI, B, false);
1646   case TargetOpcode::G_FMINNUM:
1647   case TargetOpcode::G_FMAXNUM:
1648   case TargetOpcode::G_FMINNUM_IEEE:
1649   case TargetOpcode::G_FMAXNUM_IEEE:
1650     return legalizeMinNumMaxNum(Helper, MI);
1651   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1652     return legalizeExtractVectorElt(MI, MRI, B);
1653   case TargetOpcode::G_INSERT_VECTOR_ELT:
1654     return legalizeInsertVectorElt(MI, MRI, B);
1655   case TargetOpcode::G_SHUFFLE_VECTOR:
1656     return legalizeShuffleVector(MI, MRI, B);
1657   case TargetOpcode::G_FSIN:
1658   case TargetOpcode::G_FCOS:
1659     return legalizeSinCos(MI, MRI, B);
1660   case TargetOpcode::G_GLOBAL_VALUE:
1661     return legalizeGlobalValue(MI, MRI, B);
1662   case TargetOpcode::G_LOAD:
1663     return legalizeLoad(Helper, MI);
1664   case TargetOpcode::G_FMAD:
1665     return legalizeFMad(MI, MRI, B);
1666   case TargetOpcode::G_FDIV:
1667     return legalizeFDIV(MI, MRI, B);
1668   case TargetOpcode::G_UDIV:
1669   case TargetOpcode::G_UREM:
1670     return legalizeUDIV_UREM(MI, MRI, B);
1671   case TargetOpcode::G_SDIV:
1672   case TargetOpcode::G_SREM:
1673     return legalizeSDIV_SREM(MI, MRI, B);
1674   case TargetOpcode::G_ATOMIC_CMPXCHG:
1675     return legalizeAtomicCmpXChg(MI, MRI, B);
1676   case TargetOpcode::G_FLOG:
1677     return legalizeFlog(MI, B, numbers::ln2f);
1678   case TargetOpcode::G_FLOG10:
1679     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1680   case TargetOpcode::G_FEXP:
1681     return legalizeFExp(MI, B);
1682   case TargetOpcode::G_FPOW:
1683     return legalizeFPow(MI, B);
1684   case TargetOpcode::G_FFLOOR:
1685     return legalizeFFloor(MI, MRI, B);
1686   case TargetOpcode::G_BUILD_VECTOR:
1687     return legalizeBuildVector(MI, MRI, B);
1688   default:
1689     return false;
1690   }
1691 
1692   llvm_unreachable("expected switch to return");
1693 }
1694 
1695 Register AMDGPULegalizerInfo::getSegmentAperture(
1696   unsigned AS,
1697   MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   MachineFunction &MF = B.getMF();
1700   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1701   const LLT S32 = LLT::scalar(32);
1702 
1703   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1704 
1705   if (ST.hasApertureRegs()) {
1706     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1707     // getreg.
1708     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1709         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1710         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1711     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1712         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1713         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1714     unsigned Encoding =
1715         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1716         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1717         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1718 
1719     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720 
1721     B.buildInstr(AMDGPU::S_GETREG_B32)
1722       .addDef(GetReg)
1723       .addImm(Encoding);
1724     MRI.setType(GetReg, S32);
1725 
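         // The aperture base comes back in a (WidthM1 + 1)-bit hwreg field, so
         // shifting it left by that width places it where it belongs in the
         // high 32 bits of the 64-bit segment address.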
1726     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1727     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1728   }
1729 
1730   Register QueuePtr = MRI.createGenericVirtualRegister(
1731     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1732 
1733   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1734     return Register();
1735 
1736   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1737   // private_segment_aperture_base_hi.
1738   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1739 
1740   // TODO: can we be smarter about machine pointer info?
1741   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1742   MachineMemOperand *MMO = MF.getMachineMemOperand(
1743       PtrInfo,
1744       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1745           MachineMemOperand::MOInvariant,
1746       4, commonAlignment(Align(64), StructOffset));
1747 
1748   Register LoadAddr;
1749 
1750   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1751   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1752 }
1753 
1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1755   MachineInstr &MI, MachineRegisterInfo &MRI,
1756   MachineIRBuilder &B) const {
1757   MachineFunction &MF = B.getMF();
1758 
1759   const LLT S32 = LLT::scalar(32);
1760   Register Dst = MI.getOperand(0).getReg();
1761   Register Src = MI.getOperand(1).getReg();
1762 
1763   LLT DstTy = MRI.getType(Dst);
1764   LLT SrcTy = MRI.getType(Src);
1765   unsigned DestAS = DstTy.getAddressSpace();
1766   unsigned SrcAS = SrcTy.getAddressSpace();
1767 
1768   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1769   // vector element.
1770   assert(!DstTy.isVector());
1771 
1772   const AMDGPUTargetMachine &TM
1773     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1774 
1775   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1776     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1777     return true;
1778   }
1779 
1780   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1781     // Truncate.
1782     B.buildExtract(Dst, Src, 0);
1783     MI.eraseFromParent();
1784     return true;
1785   }
1786 
1787   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1789     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1790 
1791     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1792     // another. Merge operands are required to be the same type, but creating an
1793     // extra ptrtoint would be kind of pointless.
1794     auto HighAddr = B.buildConstant(
1795       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1796     B.buildMerge(Dst, {Src, HighAddr});
1797     MI.eraseFromParent();
1798     return true;
1799   }
1800 
1801   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1802     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1803            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1804     unsigned NullVal = TM.getNullPointerValue(DestAS);
1805 
1806     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1807     auto FlatNull = B.buildConstant(SrcTy, 0);
1808 
1809     // Extract low 32-bits of the pointer.
1810     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1811 
1812     auto CmpRes =
1813         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1814     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1815 
1816     MI.eraseFromParent();
1817     return true;
1818   }
1819 
1820   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1821     return false;
1822 
1823   if (!ST.hasFlatAddressSpace())
1824     return false;
1825 
1826   auto SegmentNull =
1827       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1828   auto FlatNull =
1829       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1830 
1831   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1832   if (!ApertureReg.isValid())
1833     return false;
1834 
1835   auto CmpRes =
1836       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1837 
1838   // Coerce the type of the low half of the result so we can use merge_values.
1839   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1840 
1841   // TODO: Should we allow mismatched types but matching sizes in merges to
1842   // avoid the ptrtoint?
1843   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1844   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1845 
1846   MI.eraseFromParent();
1847   return true;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeFrint(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   Register Src = MI.getOperand(1).getReg();
1854   LLT Ty = MRI.getType(Src);
1855   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1856 
1857   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1858   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1859 
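       // Classic add/subtract-2^52 trick: adding a copysign'd 0x1.0p+52 and
       // subtracting it again rounds away the fraction bits of an f64. Inputs
       // with |x| > 0x1.fffffffffffffp+51 are already integral, so the select
       // below returns them unchanged.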
1860   auto C1 = B.buildFConstant(Ty, C1Val);
1861   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1862 
1863   // TODO: Should this propagate fast-math-flags?
1864   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1865   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1866 
1867   auto C2 = B.buildFConstant(Ty, C2Val);
1868   auto Fabs = B.buildFAbs(Ty, Src);
1869 
1870   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1871   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1872   MI.eraseFromParent();
1873   return true;
1874 }
1875 
1876 bool AMDGPULegalizerInfo::legalizeFceil(
1877   MachineInstr &MI, MachineRegisterInfo &MRI,
1878   MachineIRBuilder &B) const {
1879 
1880   const LLT S1 = LLT::scalar(1);
1881   const LLT S64 = LLT::scalar(64);
1882 
1883   Register Src = MI.getOperand(1).getReg();
1884   assert(MRI.getType(Src) == S64);
1885 
1886   // result = trunc(src)
1887   // if (src > 0.0 && src != result)
1888   //   result += 1.0
1889 
1890   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1891 
1892   const auto Zero = B.buildFConstant(S64, 0.0);
1893   const auto One = B.buildFConstant(S64, 1.0);
1894   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1895   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1896   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1897   auto Add = B.buildSelect(S64, And, One, Zero);
1898 
1899   // TODO: Should this propagate fast-math-flags?
1900   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::legalizeFrem(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B) const {
1907     Register DstReg = MI.getOperand(0).getReg();
1908     Register Src0Reg = MI.getOperand(1).getReg();
1909     Register Src1Reg = MI.getOperand(2).getReg();
1910     auto Flags = MI.getFlags();
1911     LLT Ty = MRI.getType(DstReg);
1912 
1913     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1914     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1915     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1916     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1917     MI.eraseFromParent();
1918     return true;
1919 }
1920 
1921 static MachineInstrBuilder extractF64Exponent(Register Hi,
1922                                               MachineIRBuilder &B) {
1923   const unsigned FractBits = 52;
1924   const unsigned ExpBits = 11;
1925   LLT S32 = LLT::scalar(32);
1926 
1927   auto Const0 = B.buildConstant(S32, FractBits - 32);
1928   auto Const1 = B.buildConstant(S32, ExpBits);
1929 
1930   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1931     .addUse(Hi)
1932     .addUse(Const0.getReg(0))
1933     .addUse(Const1.getReg(0));
1934 
1935   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1936 }
1937 
1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1939   MachineInstr &MI, MachineRegisterInfo &MRI,
1940   MachineIRBuilder &B) const {
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S32 = LLT::scalar(32);
1943   const LLT S64 = LLT::scalar(64);
1944 
1945   Register Src = MI.getOperand(1).getReg();
1946   assert(MRI.getType(Src) == S64);
1947 
1948   // TODO: Should this use extract since the low half is unused?
1949   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1950   Register Hi = Unmerge.getReg(1);
1951 
1952   // Extract the upper half, since this is where we will find the sign and
1953   // exponent.
1954   auto Exp = extractF64Exponent(Hi, B);
1955 
1956   const unsigned FractBits = 52;
1957 
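       // Outline of the expansion: for unbiased exponent E, mask off the low
       // (52 - E) fraction bits of the f64. If E < 0 the magnitude is below
       // 1.0 and only the sign survives (+/-0.0); if E > 51 there are no
       // fraction bits left and the source is returned unchanged.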
1958   // Extract the sign bit.
1959   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1960   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1961 
1962   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1963 
1964   const auto Zero32 = B.buildConstant(S32, 0);
1965 
1966   // Extend back to 64-bits.
1967   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1968 
1969   auto Shr = B.buildAShr(S64, FractMask, Exp);
1970   auto Not = B.buildNot(S64, Shr);
1971   auto Tmp0 = B.buildAnd(S64, Src, Not);
1972   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1973 
1974   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1975   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1976 
1977   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1978   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1979   MI.eraseFromParent();
1980   return true;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeITOFP(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B, bool Signed) const {
1986 
1987   Register Dst = MI.getOperand(0).getReg();
1988   Register Src = MI.getOperand(1).getReg();
1989 
1990   const LLT S64 = LLT::scalar(64);
1991   const LLT S32 = LLT::scalar(32);
1992 
1993   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1994 
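       // The two 32-bit halves are converted separately and recombined as
       // (fp)Hi * 2^32 + (fp)Lo; only the high half is converted signed for
       // G_SITOFP, and the 2^32 scale is applied exactly via amdgcn.ldexp.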
1995   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1996 
1997   auto CvtHi = Signed ?
1998     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1999     B.buildUITOFP(S64, Unmerge.getReg(1));
2000 
2001   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2002 
2003   auto ThirtyTwo = B.buildConstant(S32, 32);
2004   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2005     .addUse(CvtHi.getReg(0))
2006     .addUse(ThirtyTwo.getReg(0));
2007 
2008   // TODO: Should this propagate fast-math-flags?
2009   B.buildFAdd(Dst, LdExp, CvtLo);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 // TODO: Copied from DAG implementation. Verify logic and document how this
2015 // actually works.
2016 bool AMDGPULegalizerInfo::legalizeFPTOI(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   MachineIRBuilder &B, bool Signed) const {
2019 
2020   Register Dst = MI.getOperand(0).getReg();
2021   Register Src = MI.getOperand(1).getReg();
2022 
2023   const LLT S64 = LLT::scalar(64);
2024   const LLT S32 = LLT::scalar(32);
2025 
2026   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2027 
2028   unsigned Flags = MI.getFlags();
2029 
2030   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
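       // K0 is 2^-32 and K1 is -2^32 as f64 bit patterns: Hi comes from
       // floor(trunc(x) * 2^-32) and Lo from trunc(x) - Hi * 2^32 (the fma),
       // each then converted with a 32-bit fp-to-int.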
2031   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2032   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2033 
2034   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2035   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2036   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2037 
2038   auto Hi = Signed ?
2039     B.buildFPTOSI(S32, FloorMul) :
2040     B.buildFPTOUI(S32, FloorMul);
2041   auto Lo = B.buildFPTOUI(S32, Fma);
2042 
2043   B.buildMerge(Dst, { Lo, Hi });
2044   MI.eraseFromParent();
2045 
2046   return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2050                                                MachineInstr &MI) const {
2051   MachineFunction &MF = Helper.MIRBuilder.getMF();
2052   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2053 
2054   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2055                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2056 
2057   // With ieee_mode disabled, the instructions already have the correct
2058   // behavior for G_FMINNUM/G_FMAXNUM.
2059   if (!MFI->getMode().IEEE)
2060     return !IsIEEEOp;
2061 
2062   if (IsIEEEOp)
2063     return true;
2064 
2065   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2066 }
2067 
2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2069   MachineInstr &MI, MachineRegisterInfo &MRI,
2070   MachineIRBuilder &B) const {
2071   // TODO: Should move some of this into LegalizerHelper.
2072 
2073   // TODO: Promote dynamic indexing of s16 to s32
2074 
2075   // FIXME: Artifact combiner probably should have replaced the truncated
2076   // constant before this, so we shouldn't need
2077   // getConstantVRegValWithLookThrough.
2078   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2079     MI.getOperand(2).getReg(), MRI);
2080   if (!IdxVal) // Dynamic case will be selected to register indexing.
2081     return true;
2082 
2083   Register Dst = MI.getOperand(0).getReg();
2084   Register Vec = MI.getOperand(1).getReg();
2085 
2086   LLT VecTy = MRI.getType(Vec);
2087   LLT EltTy = VecTy.getElementType();
2088   assert(EltTy == MRI.getType(Dst));
2089 
2090   if (IdxVal->Value < VecTy.getNumElements())
2091     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2092   else
2093     B.buildUndef(Dst);
2094 
2095   MI.eraseFromParent();
2096   return true;
2097 }
2098 
2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2100   MachineInstr &MI, MachineRegisterInfo &MRI,
2101   MachineIRBuilder &B) const {
2102   // TODO: Should move some of this into LegalizerHelper.
2103 
2104   // TODO: Promote dynamic indexing of s16 to s32
2105 
2106   // FIXME: Artifact combiner probably should have replaced the truncated
2107   // constant before this, so we shouldn't need
2108   // getConstantVRegValWithLookThrough.
2109   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2110     MI.getOperand(3).getReg(), MRI);
2111   if (!IdxVal) // Dynamic case will be selected to register indexing.
2112     return true;
2113 
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Vec = MI.getOperand(1).getReg();
2116   Register Ins = MI.getOperand(2).getReg();
2117 
2118   LLT VecTy = MRI.getType(Vec);
2119   LLT EltTy = VecTy.getElementType();
2120   assert(EltTy == MRI.getType(Ins));
2121 
2122   if (IdxVal->Value < VecTy.getNumElements())
2123     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2124   else
2125     B.buildUndef(Dst);
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2132   MachineInstr &MI, MachineRegisterInfo &MRI,
2133   MachineIRBuilder &B) const {
2134   const LLT V2S16 = LLT::vector(2, 16);
2135 
2136   Register Dst = MI.getOperand(0).getReg();
2137   Register Src0 = MI.getOperand(1).getReg();
2138   LLT DstTy = MRI.getType(Dst);
2139   LLT SrcTy = MRI.getType(Src0);
2140 
2141   if (SrcTy == V2S16 && DstTy == V2S16 &&
2142       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2143     return true;
2144 
2145   MachineIRBuilder HelperBuilder(MI);
2146   GISelObserverWrapper DummyObserver;
2147   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2148   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeSinCos(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B) const {
2154 
2155   Register DstReg = MI.getOperand(0).getReg();
2156   Register SrcReg = MI.getOperand(1).getReg();
2157   LLT Ty = MRI.getType(DstReg);
2158   unsigned Flags = MI.getFlags();
2159 
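       // The hardware sin/cos take an input pre-scaled by 1/(2*pi). On
       // subtargets with a reduced valid input range, the scaled value is
       // first wrapped into [0, 1) with amdgcn.fract.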
2160   Register TrigVal;
2161   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2162   if (ST.hasTrigReducedRange()) {
2163     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2164     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2165       .addUse(MulVal.getReg(0))
2166       .setMIFlags(Flags).getReg(0);
2167   } else
2168     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2169 
2170   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2171     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2172   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2173     .addUse(TrigVal)
2174     .setMIFlags(Flags);
2175   MI.eraseFromParent();
2176   return true;
2177 }
2178 
2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2180                                                   MachineIRBuilder &B,
2181                                                   const GlobalValue *GV,
2182                                                   int64_t Offset,
2183                                                   unsigned GAFlags) const {
2184   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2185   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2186   // to the following code sequence:
2187   //
2188   // For constant address space:
2189   //   s_getpc_b64 s[0:1]
2190   //   s_add_u32 s0, s0, $symbol
2191   //   s_addc_u32 s1, s1, 0
2192   //
2193   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2194   //   a fixup or relocation is emitted to replace $symbol with a literal
2195   //   constant, which is a pc-relative offset from the encoding of the $symbol
2196   //   operand to the global variable.
2197   //
2198   // For global address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2201   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   fixups or relocations are emitted to replace $symbol@*@lo and
2205   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2206   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2207   //   operand to the global variable.
2208   //
2209   // What we want here is an offset from the value returned by s_getpc
2210   // (which is the address of the s_add_u32 instruction) to the global
2211   // variable, but since the encoding of $symbol starts 4 bytes after the start
2212   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2213   // small. This requires us to add 4 to the global variable offset in order to
2214   // compute the correct address.
2215 
2216   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2217 
2218   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2219     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2220 
2221   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2222     .addDef(PCReg);
2223 
2224   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2225   if (GAFlags == SIInstrInfo::MO_NONE)
2226     MIB.addImm(0);
2227   else
2228     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2229 
2230   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2231 
2232   if (PtrTy.getSizeInBits() == 32)
2233     B.buildExtract(DstReg, PCReg, 0);
2234   return true;
2235 }
2236 
2237 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2238   MachineInstr &MI, MachineRegisterInfo &MRI,
2239   MachineIRBuilder &B) const {
2240   Register DstReg = MI.getOperand(0).getReg();
2241   LLT Ty = MRI.getType(DstReg);
2242   unsigned AS = Ty.getAddressSpace();
2243 
2244   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2245   MachineFunction &MF = B.getMF();
2246   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2247 
2248   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2249     if (!MFI->isEntryFunction()) {
2250       const Function &Fn = MF.getFunction();
2251       DiagnosticInfoUnsupported BadLDSDecl(
2252         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2253         DS_Warning);
2254       Fn.getContext().diagnose(BadLDSDecl);
2255 
2256       // We currently don't have a way to correctly allocate LDS objects that
2257       // aren't directly associated with a kernel. We do force inlining of
2258       // functions that use local objects. However, if these dead functions are
2259       // not eliminated, we don't want a compile time error. Just emit a warning
2260       // and a trap, since there should be no callable path here.
2261       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2262       B.buildUndef(DstReg);
2263       MI.eraseFromParent();
2264       return true;
2265     }
2266 
2267     // TODO: We could emit code to handle the initialization somewhere.
2268     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2269       const SITargetLowering *TLI = ST.getTargetLowering();
2270       if (!TLI->shouldUseLDSConstAddress(GV)) {
2271         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2272         return true; // Leave in place.
2273       }
2274 
2275       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2276         Type *Ty = GV->getValueType();
2277         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2278         // zero-sized type in other languages to declare dynamic shared
2279         // memory whose size is not known at compile time. It will be
2280         // allocated by the runtime and placed directly after the statically
2281         // allocated ones, so all such declarations share the same offset.
2282         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2283           // Adjust alignment for that dynamic shared memory array.
2284           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2285           LLT S32 = LLT::scalar(32);
2286           auto Sz =
2287               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2288           B.buildIntToPtr(DstReg, Sz);
2289           MI.eraseFromParent();
2290           return true;
2291         }
2292       }
2293 
2294       B.buildConstant(
2295           DstReg,
2296           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2297       MI.eraseFromParent();
2298       return true;
2299     }
2300 
2301     const Function &Fn = MF.getFunction();
2302     DiagnosticInfoUnsupported BadInit(
2303       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2304     Fn.getContext().diagnose(BadInit);
2305     return true;
2306   }
2307 
2308   const SITargetLowering *TLI = ST.getTargetLowering();
2309 
2310   if (TLI->shouldEmitFixup(GV)) {
2311     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2312     MI.eraseFromParent();
2313     return true;
2314   }
2315 
2316   if (TLI->shouldEmitPCReloc(GV)) {
2317     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2318     MI.eraseFromParent();
2319     return true;
2320   }
2321 
2322   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2323   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2324 
2325   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2326       MachinePointerInfo::getGOT(MF),
2327       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2328           MachineMemOperand::MOInvariant,
2329       8 /*Size*/, Align(8));
2330 
2331   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2332 
2333   if (Ty.getSizeInBits() == 32) {
2334     // Truncate if this is a 32-bit constant address.
2335     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2336     B.buildExtract(DstReg, Load, 0);
2337   } else
2338     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2339 
2340   MI.eraseFromParent();
2341   return true;
2342 }
2343 
2344 static LLT widenToNextPowerOf2(LLT Ty) {
2345   if (Ty.isVector())
2346     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2347   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2348 }
2349 
2350 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2351                                        MachineInstr &MI) const {
2352   MachineIRBuilder &B = Helper.MIRBuilder;
2353   MachineRegisterInfo &MRI = *B.getMRI();
2354   GISelChangeObserver &Observer = Helper.Observer;
2355 
2356   Register PtrReg = MI.getOperand(1).getReg();
2357   LLT PtrTy = MRI.getType(PtrReg);
2358   unsigned AddrSpace = PtrTy.getAddressSpace();
2359 
2360   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2361     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2363     Observer.changingInstr(MI);
2364     MI.getOperand(1).setReg(Cast.getReg(0));
2365     Observer.changedInstr(MI);
2366     return true;
2367   }
2368 
2369   Register ValReg = MI.getOperand(0).getReg();
2370   LLT ValTy = MRI.getType(ValReg);
2371 
2372   MachineMemOperand *MMO = *MI.memoperands_begin();
2373   const unsigned ValSize = ValTy.getSizeInBits();
2374   const unsigned MemSize = 8 * MMO->getSize();
2375   const Align MemAlign = MMO->getAlign();
2376   const unsigned AlignInBits = 8 * MemAlign.value();
2377 
2378   // Widen non-power-of-2 loads to the alignment if needed
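       // For example, a sufficiently aligned s96 load may become an s128 load
       // followed by a G_TRUNC (or a G_EXTRACT / unmerge for vector results)
       // back to the original type.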
2379   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2380     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2381 
2382     // This was already the correct extending load result type, so just adjust
2383     // the memory type.
2384     if (WideMemSize == ValSize) {
2385       MachineFunction &MF = B.getMF();
2386 
2387       MachineMemOperand *WideMMO =
2388           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2389       Observer.changingInstr(MI);
2390       MI.setMemRefs(MF, {WideMMO});
2391       Observer.changedInstr(MI);
2392       return true;
2393     }
2394 
2395     // Don't bother handling an edge case that should probably never be
         // produced.
2396     if (ValSize > WideMemSize)
2397       return false;
2398 
2399     LLT WideTy = widenToNextPowerOf2(ValTy);
2400 
2401     Register WideLoad;
2402     if (!WideTy.isVector()) {
2403       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2404       B.buildTrunc(ValReg, WideLoad).getReg(0);
2405     } else {
2406       // Extract the subvector.
2407 
2408       if (isRegisterType(ValTy)) {
2409         // If this is a case where G_EXTRACT is legal, use it.
2410         // (e.g. <3 x s32> -> <4 x s32>)
2411         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2412         B.buildExtract(ValReg, WideLoad, 0);
2413       } else {
2414         // For cases where the widened type isn't a nice register value, unmerge
2415         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2416         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2417         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2418         B.setInsertPt(B.getMBB(), MI.getIterator());
2419         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2420       }
2421     }
2422 
2423     MI.eraseFromParent();
2424     return true;
2425   }
2426 
2427   return false;
2428 }
2429 
2430 bool AMDGPULegalizerInfo::legalizeFMad(
2431   MachineInstr &MI, MachineRegisterInfo &MRI,
2432   MachineIRBuilder &B) const {
2433   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2434   assert(Ty.isScalar());
2435 
2436   MachineFunction &MF = B.getMF();
2437   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2438 
2439   // TODO: Always legal with future ftz flag.
2440   // FIXME: Do we need just output?
2441   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2442     return true;
2443   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2444     return true;
2445 
2446   MachineIRBuilder HelperBuilder(MI);
2447   GISelObserverWrapper DummyObserver;
2448   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2449   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2450 }
2451 
2452 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2453   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2454   Register DstReg = MI.getOperand(0).getReg();
2455   Register PtrReg = MI.getOperand(1).getReg();
2456   Register CmpVal = MI.getOperand(2).getReg();
2457   Register NewVal = MI.getOperand(3).getReg();
2458 
2459   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2460          "this should not have been custom lowered");
2461 
2462   LLT ValTy = MRI.getType(CmpVal);
2463   LLT VecTy = LLT::vector(2, ValTy);
2464 
2465   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2466 
2467   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2468     .addDef(DstReg)
2469     .addUse(PtrReg)
2470     .addUse(PackedVal)
2471     .setMemRefs(MI.memoperands());
2472 
2473   MI.eraseFromParent();
2474   return true;
2475 }
2476 
2477 bool AMDGPULegalizerInfo::legalizeFlog(
2478   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2479   Register Dst = MI.getOperand(0).getReg();
2480   Register Src = MI.getOperand(1).getReg();
2481   LLT Ty = B.getMRI()->getType(Dst);
2482   unsigned Flags = MI.getFlags();
2483 
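       // log_b(x) = log2(x) * (1 / log2(b)); callers pass ln(2) for G_FLOG and
       // ln(2) / ln(10) for G_FLOG10 as Log2BaseInverted.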
2484   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2485   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2486 
2487   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2488   MI.eraseFromParent();
2489   return true;
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2493                                        MachineIRBuilder &B) const {
2494   Register Dst = MI.getOperand(0).getReg();
2495   Register Src = MI.getOperand(1).getReg();
2496   unsigned Flags = MI.getFlags();
2497   LLT Ty = B.getMRI()->getType(Dst);
2498 
2499   auto K = B.buildFConstant(Ty, numbers::log2e);
2500   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2501   B.buildFExp2(Dst, Mul, Flags);
2502   MI.eraseFromParent();
2503   return true;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2507                                        MachineIRBuilder &B) const {
2508   Register Dst = MI.getOperand(0).getReg();
2509   Register Src0 = MI.getOperand(1).getReg();
2510   Register Src1 = MI.getOperand(2).getReg();
2511   unsigned Flags = MI.getFlags();
2512   LLT Ty = B.getMRI()->getType(Dst);
2513   const LLT S16 = LLT::scalar(16);
2514   const LLT S32 = LLT::scalar(32);
2515 
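       // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
       // amdgcn.fmul.legacy, where 0 * anything == 0, so cases like y == 0
       // with log2(x) == +/-inf still give exp2(0) == 1. f16 has no legacy
       // multiply, so that step is extended to f32.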
2516   if (Ty == S32) {
2517     auto Log = B.buildFLog2(S32, Src0, Flags);
2518     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2519       .addUse(Log.getReg(0))
2520       .addUse(Src1)
2521       .setMIFlags(Flags);
2522     B.buildFExp2(Dst, Mul, Flags);
2523   } else if (Ty == S16) {
2524     // There's no f16 fmul_legacy, so we need to convert for it.
2525     auto Log = B.buildFLog2(S16, Src0, Flags);
2526     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2527     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2528     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2529       .addUse(Ext0.getReg(0))
2530       .addUse(Ext1.getReg(0))
2531       .setMIFlags(Flags);
2532 
2533     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2534   } else
2535     return false;
2536 
2537   MI.eraseFromParent();
2538   return true;
2539 }
2540 
2541 // Find a source register, ignoring any possible source modifiers.
2542 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2543   Register ModSrc = OrigSrc;
2544   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2545     ModSrc = SrcFNeg->getOperand(1).getReg();
2546     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2547       ModSrc = SrcFAbs->getOperand(1).getReg();
2548   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2549     ModSrc = SrcFAbs->getOperand(1).getReg();
2550   return ModSrc;
2551 }
2552 
2553 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2554                                          MachineRegisterInfo &MRI,
2555                                          MachineIRBuilder &B) const {
2556 
2557   const LLT S1 = LLT::scalar(1);
2558   const LLT S64 = LLT::scalar(64);
2559   Register Dst = MI.getOperand(0).getReg();
2560   Register OrigSrc = MI.getOperand(1).getReg();
2561   unsigned Flags = MI.getFlags();
2562   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2563          "this should not have been custom lowered");
2564 
2565   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2566   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2567   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2568   // V_FRACT bug is:
2569   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2570   //
2571   // Convert floor(x) to (x - fract(x))
2572 
2573   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2574     .addUse(OrigSrc)
2575     .setMIFlags(Flags);
2576 
2577   // Give source modifier matching some assistance before obscuring a foldable
2578   // pattern.
2579 
2580   // TODO: We can avoid the neg on the fract? The input sign to fract
2581   // shouldn't matter?
2582   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2583 
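       // 0x3fefffffffffffff is the largest f64 strictly less than 1.0, i.e.
       // the 0.99999999999999999 clamp from the workaround formula above.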
2584   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2585 
2586   Register Min = MRI.createGenericVirtualRegister(S64);
2587 
2588   // We don't need to concern ourselves with the snan handling difference, so
2589   // use the one which will directly select.
2590   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2591   if (MFI->getMode().IEEE)
2592     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2593   else
2594     B.buildFMinNum(Min, Fract, Const, Flags);
2595 
2596   Register CorrectedFract = Min;
2597   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2598     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2599     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2600   }
2601 
2602   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2603   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2604 
2605   MI.eraseFromParent();
2606   return true;
2607 }
2608 
2609 // Turn an illegal packed v2s16 build vector into bit operations.
2610 // TODO: This should probably be a bitcast action in LegalizerHelper.
2611 bool AMDGPULegalizerInfo::legalizeBuildVector(
2612   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2613   Register Dst = MI.getOperand(0).getReg();
2614   const LLT S32 = LLT::scalar(32);
2615   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2616 
2617   Register Src0 = MI.getOperand(1).getReg();
2618   Register Src1 = MI.getOperand(2).getReg();
2619   assert(MRI.getType(Src0) == LLT::scalar(16));
2620 
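       // Merge the two s16 halves into an s32 and bitcast to <2 x s16>; the
       // G_MERGE_VALUES is expected to be lowered later into the shift/or bit
       // operations mentioned above.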
2621   auto Merge = B.buildMerge(S32, {Src0, Src1});
2622   B.buildBitcast(Dst, Merge);
2623 
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 // Return the branch instruction that uses the condition output, or null if
     // the usage is invalid.
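     // The expected pattern is a single G_BRCOND user of the condition in the
     // same block, either followed directly by a G_BR (whose target becomes
     // UncondBrTarget) or falling through to the next block.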
2629 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2630                                        MachineRegisterInfo &MRI,
2631                                        MachineInstr *&Br,
2632                                        MachineBasicBlock *&UncondBrTarget) {
2633   Register CondDef = MI.getOperand(0).getReg();
2634   if (!MRI.hasOneNonDBGUse(CondDef))
2635     return nullptr;
2636 
2637   MachineBasicBlock *Parent = MI.getParent();
2638   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2639   if (UseMI.getParent() != Parent ||
2640       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2641     return nullptr;
2642 
2643   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2644   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2645   if (Next == Parent->end()) {
2646     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2647     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2648       return nullptr;
2649     UncondBrTarget = &*NextMBB;
2650   } else {
2651     if (Next->getOpcode() != AMDGPU::G_BR)
2652       return nullptr;
2653     Br = &*Next;
2654     UncondBrTarget = Br->getOperand(0).getMBB();
2655   }
2656 
2657   return &UseMI;
2658 }
2659 
2660 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2661                                          const ArgDescriptor *Arg,
2662                                          const TargetRegisterClass *ArgRC,
2663                                          LLT ArgTy) const {
2664   MCRegister SrcReg = Arg->getRegister();
2665   assert(SrcReg.isPhysical() && "Physical register expected");
2666   assert(DstReg.isVirtual() && "Virtual register expected");
2667 
2668   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2669                                              ArgTy);
2670   if (Arg->isMasked()) {
2671     // TODO: Should we try to emit this once in the entry block?
2672     const LLT S32 = LLT::scalar(32);
2673     const unsigned Mask = Arg->getMask();
2674     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2675 
2676     Register AndMaskSrc = LiveIn;
2677 
2678     if (Shift != 0) {
2679       auto ShiftAmt = B.buildConstant(S32, Shift);
2680       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2681     }
2682 
2683     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2684   } else {
2685     B.buildCopy(DstReg, LiveIn);
2686   }
2687 
2688   return true;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::loadInputValue(
2692     Register DstReg, MachineIRBuilder &B,
2693     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2694   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2695   const ArgDescriptor *Arg;
2696   const TargetRegisterClass *ArgRC;
2697   LLT ArgTy;
2698   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2699 
2700   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2701     return false; // TODO: Handle these
2702   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2703 }
2704 
2705 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2706     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2707     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2708   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2709     return false;
2710 
2711   MI.eraseFromParent();
2712   return true;
2713 }
2714 
2715 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2716                                        MachineRegisterInfo &MRI,
2717                                        MachineIRBuilder &B) const {
2718   Register Dst = MI.getOperand(0).getReg();
2719   LLT DstTy = MRI.getType(Dst);
2720   LLT S16 = LLT::scalar(16);
2721   LLT S32 = LLT::scalar(32);
2722   LLT S64 = LLT::scalar(64);
2723 
2724   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2725     return true;
2726 
2727   if (DstTy == S16)
2728     return legalizeFDIV16(MI, MRI, B);
2729   if (DstTy == S32)
2730     return legalizeFDIV32(MI, MRI, B);
2731   if (DstTy == S64)
2732     return legalizeFDIV64(MI, MRI, B);
2733 
2734   return false;
2735 }
2736 
2737 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2738                                                   Register DstReg,
2739                                                   Register X,
2740                                                   Register Y,
2741                                                   bool IsDiv) const {
2742   const LLT S1 = LLT::scalar(1);
2743   const LLT S32 = LLT::scalar(32);
2744 
2745   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2746   // algorithm used here.
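       // Rough outline: Z approximates 2^32 / Y via the f32 reciprocal, one
       // Newton-Raphson style step (Z += umulh(Z, -Y * Z)) refines it, and
       // Q = umulh(X, Z) slightly underestimates the quotient; the two
       // refinement steps below correct the quotient/remainder.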
2747 
2748   // Initial estimate of inv(y).
2749   auto FloatY = B.buildUITOFP(S32, Y);
2750   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2751   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2752   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2753   auto Z = B.buildFPTOUI(S32, ScaledY);
2754 
2755   // One round of UNR.
2756   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2757   auto NegYZ = B.buildMul(S32, NegY, Z);
2758   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2759 
2760   // Quotient/remainder estimate.
2761   auto Q = B.buildUMulH(S32, X, Z);
2762   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2763 
2764   // First quotient/remainder refinement.
2765   auto One = B.buildConstant(S32, 1);
2766   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2767   if (IsDiv)
2768     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2769   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2770 
2771   // Second quotient/remainder refinement.
2772   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2773   if (IsDiv)
2774     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2775   else
2776     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2777 }
2778 
2779 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2780                                               MachineRegisterInfo &MRI,
2781                                               MachineIRBuilder &B) const {
2782   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2783   Register DstReg = MI.getOperand(0).getReg();
2784   Register Num = MI.getOperand(1).getReg();
2785   Register Den = MI.getOperand(2).getReg();
2786   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2787   MI.eraseFromParent();
2788   return true;
2789 }
2790 
2791 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2792 //
2793 // Return lo, hi of result
2794 //
2795 // %cvt.lo = G_UITOFP Val.lo
2796 // %cvt.hi = G_UITOFP Val.hi
2797 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2798 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2799 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2800 // %mul2 = G_FMUL %mul1, 2**(-32)
2801 // %trunc = G_INTRINSIC_TRUNC %mul2
2802 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2803 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2804 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2805                                                        Register Val) {
2806   const LLT S32 = LLT::scalar(32);
2807   auto Unmerge = B.buildUnmerge(S32, Val);
2808 
2809   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2810   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2811 
2812   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2813                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2814 
2815   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2816   auto Mul1 =
2817       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2818 
2819   // 2**(-32)
2820   auto Mul2 =
2821       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2822   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2823 
2824   // -(2**32)
2825   auto Mad2 = B.buildFMAD(S32, Trunc,
2826                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2827 
2828   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2829   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2830 
2831   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2832 }
2833 
2834 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2835                                                   Register DstReg,
2836                                                   Register Numer,
2837                                                   Register Denom,
2838                                                   bool IsDiv) const {
2839   const LLT S32 = LLT::scalar(32);
2840   const LLT S64 = LLT::scalar(64);
2841   const LLT S1 = LLT::scalar(1);
2842   Register RcpLo, RcpHi;
2843 
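       // Rough outline: emitReciprocalU64 provides a fixed-point estimate of
       // 2^64 / Denom, two refinement rounds (Add1, Add2) are done on 32-bit
       // pieces, MulHi3 = umulh(Numer, Add2) is the quotient estimate, and the
       // compare/select chains below correct the quotient or remainder by at
       // most two steps.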
2844   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2845 
2846   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2847 
2848   auto Zero64 = B.buildConstant(S64, 0);
2849   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2850 
2851   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2852   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2853 
2854   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2855   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2856   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2857 
2858   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2859   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2860   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2861   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2862 
2863   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2864   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2865   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2866   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2867   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2868 
2869   auto Zero32 = B.buildConstant(S32, 0);
2870   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2871   auto Add2_HiC =
2872       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2873   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2874   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2875 
2876   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2877   Register NumerLo = UnmergeNumer.getReg(0);
2878   Register NumerHi = UnmergeNumer.getReg(1);
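  // The quotient estimate below is umulh(Numer, Add2); Sub1 is the matching
  // remainder estimate Numer - Denom * MulHi3, computed in 32-bit pieces so
  // the borrows are available to the correction steps that follow.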
2879 
2880   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2881   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2882   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2883   Register Mul3_Lo = UnmergeMul3.getReg(0);
2884   Register Mul3_Hi = UnmergeMul3.getReg(1);
2885   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2886   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2887   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2888   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2889 
2890   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2891   Register DenomLo = UnmergeDenom.getReg(0);
2892   Register DenomHi = UnmergeDenom.getReg(1);
2893 
2894   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2895   auto C1 = B.buildSExt(S32, CmpHi);
2896 
2897   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2898   auto C2 = B.buildSExt(S32, CmpLo);
2899 
2900   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2901   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2902 
2903   // TODO: Here and below, portions of the code could be enclosed in if/endif
2904   // blocks. Currently the control flow is unconditional and we have 4 selects
2905   // after the potential endif to substitute PHIs.
2906 
2907   // if C3 != 0 ...
2908   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2909   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2910   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2911   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2912 
2913   auto One64 = B.buildConstant(S64, 1);
2914   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2915 
2916   auto C4 =
2917       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2918   auto C5 =
2919       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2920   auto C6 = B.buildSelect(
2921       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2922 
2923   // if (C6 != 0)
2924   auto Add4 = B.buildAdd(S64, Add3, One64);
2925   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2926 
2927   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2928   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2929   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2930 
2931   // endif C6
2932   // endif C3
2933 
2934   if (IsDiv) {
2935     auto Sel1 = B.buildSelect(
2936         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2937     B.buildSelect(DstReg,
2938                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2939   } else {
2940     auto Sel2 = B.buildSelect(
2941         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2942     B.buildSelect(DstReg,
2943                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2944   }
2945 }
2946 
2947 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2948                                             MachineRegisterInfo &MRI,
2949                                             MachineIRBuilder &B) const {
2950   const LLT S64 = LLT::scalar(64);
2951   const LLT S32 = LLT::scalar(32);
2952   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2953   Register DstReg = MI.getOperand(0).getReg();
2954   Register Num = MI.getOperand(1).getReg();
2955   Register Den = MI.getOperand(2).getReg();
2956   LLT Ty = MRI.getType(DstReg);
2957 
2958   if (Ty == S32)
2959     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2960   else if (Ty == S64)
2961     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2962   else
2963     return false;
2964 
2965   MI.eraseFromParent();
2966   return true;
2968 }
2969 
2970 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2971                                             MachineRegisterInfo &MRI,
2972                                             MachineIRBuilder &B) const {
2973   const LLT S64 = LLT::scalar(64);
2974   const LLT S32 = LLT::scalar(32);
2975 
2976   Register DstReg = MI.getOperand(0).getReg();
2977   const LLT Ty = MRI.getType(DstReg);
2978   if (Ty != S32 && Ty != S64)
2979     return false;
2980 
2981   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2982 
2983   Register LHS = MI.getOperand(1).getReg();
2984   Register RHS = MI.getOperand(2).getReg();
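  // Take the absolute value of both operands with the sign-extension trick:
  // for sign = x >> (bits - 1), (x + sign) ^ sign yields |x| as an unsigned
  // value. The sign is reapplied to the unsigned result at the end using the
  // inverse (v ^ sign) - sign pattern.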
2985 
2986   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2987   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2988   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2989 
2990   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2991   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2992 
2993   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2994   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2995 
2996   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2997   if (Ty == S32)
2998     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2999   else
3000     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3001 
3002   Register Sign;
3003   if (IsDiv)
3004     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3005   else
3006     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3007 
3008   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3009   B.buildSub(DstReg, UDivRem, Sign);
3010 
3011   MI.eraseFromParent();
3012   return true;
3013 }
3014 
3015 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3016                                                  MachineRegisterInfo &MRI,
3017                                                  MachineIRBuilder &B) const {
3018   Register Res = MI.getOperand(0).getReg();
3019   Register LHS = MI.getOperand(1).getReg();
3020   Register RHS = MI.getOperand(2).getReg();
3021 
3022   uint16_t Flags = MI.getFlags();
3023 
3024   LLT ResTy = MRI.getType(Res);
3025   LLT S32 = LLT::scalar(32);
3026   LLT S64 = LLT::scalar(64);
3027 
3028   const MachineFunction &MF = B.getMF();
3029   bool Unsafe =
3030     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3031 
3032   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3033     return false;
3034 
3035   if (!Unsafe && ResTy == S32 &&
3036       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3037     return false;
3038 
3039   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3040     // 1 / x -> RCP(x)
3041     if (CLHS->isExactlyValue(1.0)) {
3042       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3043         .addUse(RHS)
3044         .setMIFlags(Flags);
3045 
3046       MI.eraseFromParent();
3047       return true;
3048     }
3049 
3050     // -1 / x -> RCP( FNEG(x) )
3051     if (CLHS->isExactlyValue(-1.0)) {
3052       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3053       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3054         .addUse(FNeg.getReg(0))
3055         .setMIFlags(Flags);
3056 
3057       MI.eraseFromParent();
3058       return true;
3059     }
3060   }
3061 
3062   // x / y -> x * (1.0 / y)
3063   if (Unsafe) {
3064     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3065       .addUse(RHS)
3066       .setMIFlags(Flags);
3067     B.buildFMul(Res, LHS, RCP, Flags);
3068 
3069     MI.eraseFromParent();
3070     return true;
3071   }
3072 
3073   return false;
3074 }
3075 
3076 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3077                                          MachineRegisterInfo &MRI,
3078                                          MachineIRBuilder &B) const {
3079   Register Res = MI.getOperand(0).getReg();
3080   Register LHS = MI.getOperand(1).getReg();
3081   Register RHS = MI.getOperand(2).getReg();
3082 
3083   uint16_t Flags = MI.getFlags();
3084 
3085   LLT S16 = LLT::scalar(16);
3086   LLT S32 = LLT::scalar(32);
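  // f16 division is expanded by promoting to f32: compute an f32 reciprocal of
  // the denominator, multiply, truncate back to f16, and let amdgcn.div.fixup
  // patch up the special-case numerics.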
3087 
3088   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3089   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3090 
3091   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3092     .addUse(RHSExt.getReg(0))
3093     .setMIFlags(Flags);
3094 
3095   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3096   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3097 
3098   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3099     .addUse(RDst.getReg(0))
3100     .addUse(RHS)
3101     .addUse(LHS)
3102     .setMIFlags(Flags);
3103 
3104   MI.eraseFromParent();
3105   return true;
3106 }
3107 
3108 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3109 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3110 static void toggleSPDenormMode(bool Enable,
3111                                MachineIRBuilder &B,
3112                                const GCNSubtarget &ST,
3113                                AMDGPU::SIModeRegisterDefaults Mode) {
3114   // Set SP denorm mode to this value.
3115   unsigned SPDenormMode =
3116     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3117 
3118   if (ST.hasDenormModeInst()) {
3119     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3120     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3121 
3122     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3123     B.buildInstr(AMDGPU::S_DENORM_MODE)
3124       .addImm(NewDenormModeValue);
3125 
3126   } else {
3127     // Select FP32 bit field in mode register.
3128     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3129                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3130                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
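    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm controls occupy
    // MODE bits [5:4], so the offset is 4 and WIDTH_M1 is 1 (a 2-bit field).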
3131 
3132     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3133       .addImm(SPDenormMode)
3134       .addImm(SPDenormModeBitField);
3135   }
3136 }
3137 
3138 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3139                                          MachineRegisterInfo &MRI,
3140                                          MachineIRBuilder &B) const {
3141   Register Res = MI.getOperand(0).getReg();
3142   Register LHS = MI.getOperand(1).getReg();
3143   Register RHS = MI.getOperand(2).getReg();
3144   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3145   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3146 
3147   uint16_t Flags = MI.getFlags();
3148 
3149   LLT S32 = LLT::scalar(32);
3150   LLT S1 = LLT::scalar(1);
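  // The f32 expansion follows the hardware division recipe: div.scale brings
  // the numerator and denominator into a safe range, the FMA chain refines the
  // rcp estimate of the scaled denominator, and div.fmas / div.fixup apply the
  // deferred scaling and patch up the special cases.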
3151 
3152   auto One = B.buildFConstant(S32, 1.0f);
3153 
3154   auto DenominatorScaled =
3155     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3156       .addUse(LHS)
3157       .addUse(RHS)
3158       .addImm(0)
3159       .setMIFlags(Flags);
3160   auto NumeratorScaled =
3161     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3162       .addUse(LHS)
3163       .addUse(RHS)
3164       .addImm(1)
3165       .setMIFlags(Flags);
3166 
3167   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3168     .addUse(DenominatorScaled.getReg(0))
3169     .setMIFlags(Flags);
3170   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3171 
3172   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3173   // aren't modeled as reading it.
3174   if (!Mode.allFP32Denormals())
3175     toggleSPDenormMode(true, B, ST, Mode);
3176 
3177   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3178   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3179   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3180   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3181   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3182   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3183 
3184   if (!Mode.allFP32Denormals())
3185     toggleSPDenormMode(false, B, ST, Mode);
3186 
3187   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3188     .addUse(Fma4.getReg(0))
3189     .addUse(Fma1.getReg(0))
3190     .addUse(Fma3.getReg(0))
3191     .addUse(NumeratorScaled.getReg(1))
3192     .setMIFlags(Flags);
3193 
3194   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3195     .addUse(Fmas.getReg(0))
3196     .addUse(RHS)
3197     .addUse(LHS)
3198     .setMIFlags(Flags);
3199 
3200   MI.eraseFromParent();
3201   return true;
3202 }
3203 
3204 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3205                                          MachineRegisterInfo &MRI,
3206                                          MachineIRBuilder &B) const {
3207   Register Res = MI.getOperand(0).getReg();
3208   Register LHS = MI.getOperand(1).getReg();
3209   Register RHS = MI.getOperand(2).getReg();
3210 
3211   uint16_t Flags = MI.getFlags();
3212 
3213   LLT S64 = LLT::scalar(64);
3214   LLT S1 = LLT::scalar(1);
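  // Same structure as the f32 expansion: scale with div.scale, refine the rcp
  // estimate with FMAs, then finish with div.fmas / div.fixup. The extra code
  // below works around targets where the div.scale condition output is
  // unusable.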
3215 
3216   auto One = B.buildFConstant(S64, 1.0);
3217 
3218   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3219     .addUse(LHS)
3220     .addUse(RHS)
3221     .addImm(0)
3222     .setMIFlags(Flags);
3223 
3224   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3225 
3226   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3227     .addUse(DivScale0.getReg(0))
3228     .setMIFlags(Flags);
3229 
3230   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3231   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3232   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3233 
3234   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3235     .addUse(LHS)
3236     .addUse(RHS)
3237     .addImm(1)
3238     .setMIFlags(Flags);
3239 
3240   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3241   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3242   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3243 
3244   Register Scale;
3245   if (!ST.hasUsableDivScaleConditionOutput()) {
3246     // Workaround a hardware bug on SI where the condition output from div_scale
3247     // is not usable.
3248 
3249     LLT S32 = LLT::scalar(32);
3250 
3251     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3252     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3253     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3254     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3255 
3256     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3257                               Scale1Unmerge.getReg(1));
3258     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3259                               Scale0Unmerge.getReg(1));
3260     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3261   } else {
3262     Scale = DivScale1.getReg(1);
3263   }
3264 
3265   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3266     .addUse(Fma4.getReg(0))
3267     .addUse(Fma3.getReg(0))
3268     .addUse(Mul.getReg(0))
3269     .addUse(Scale)
3270     .setMIFlags(Flags);
3271 
3272   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3273     .addUse(Fmas.getReg(0))
3274     .addUse(RHS)
3275     .addUse(LHS)
3276     .setMIFlags(Flags);
3277 
3278   MI.eraseFromParent();
3279   return true;
3280 }
3281 
3282 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3283                                                  MachineRegisterInfo &MRI,
3284                                                  MachineIRBuilder &B) const {
3285   Register Res = MI.getOperand(0).getReg();
3286   Register LHS = MI.getOperand(2).getReg();
3287   Register RHS = MI.getOperand(3).getReg();
3288   uint16_t Flags = MI.getFlags();
3289 
3290   LLT S32 = LLT::scalar(32);
3291   LLT S1 = LLT::scalar(1);
3292 
3293   auto Abs = B.buildFAbs(S32, RHS, Flags);
3294   const APFloat C0Val(1.0f);
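  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| exceeds 2^96, the
  // denominator is pre-scaled by 2^-32 so the reciprocal stays in range, and
  // the same scale factor is multiplied back into the final result.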
3295 
3296   auto C0 = B.buildConstant(S32, 0x6f800000);
3297   auto C1 = B.buildConstant(S32, 0x2f800000);
3298   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3299 
3300   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3301   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3302 
3303   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3304 
3305   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3306     .addUse(Mul0.getReg(0))
3307     .setMIFlags(Flags);
3308 
3309   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3310 
3311   B.buildFMul(Res, Sel, Mul1, Flags);
3312 
3313   MI.eraseFromParent();
3314   return true;
3315 }
3316 
3317 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3318 // FIXME: Why do we handle this one but not other removed instructions?
3319 //
3320 // Reciprocal square root.  The clamp prevents infinite results, clamping
3321 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3322 // +-max_float.
3323 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3324                                                     MachineRegisterInfo &MRI,
3325                                                     MachineIRBuilder &B) const {
3326   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3327     return true;
3328 
3329   Register Dst = MI.getOperand(0).getReg();
3330   Register Src = MI.getOperand(2).getReg();
3331   auto Flags = MI.getFlags();
3332 
3333   LLT Ty = MRI.getType(Dst);
3334 
3335   const fltSemantics *FltSemantics;
3336   if (Ty == LLT::scalar(32))
3337     FltSemantics = &APFloat::IEEEsingle();
3338   else if (Ty == LLT::scalar(64))
3339     FltSemantics = &APFloat::IEEEdouble();
3340   else
3341     return false;
3342 
3343   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3344     .addUse(Src)
3345     .setMIFlags(Flags);
3346 
3347   // We don't need to concern ourselves with the snan handling difference, since
3348   // the rsq already quieted (or not); just use the variant which will select directly.
3349   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3350   const bool UseIEEE = MFI->getMode().IEEE;
3351 
3352   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3353   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3354                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3355 
3356   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3357 
3358   if (UseIEEE)
3359     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3360   else
3361     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3362   MI.eraseFromParent();
3363   return true;
3364 }
3365 
3366 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3367   switch (IID) {
3368   case Intrinsic::amdgcn_ds_fadd:
3369     return AMDGPU::G_ATOMICRMW_FADD;
3370   case Intrinsic::amdgcn_ds_fmin:
3371     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3372   case Intrinsic::amdgcn_ds_fmax:
3373     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3374   default:
3375     llvm_unreachable("not a DS FP intrinsic");
3376   }
3377 }
3378 
3379 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3380                                                       MachineInstr &MI,
3381                                                       Intrinsic::ID IID) const {
3382   GISelChangeObserver &Observer = Helper.Observer;
3383   Observer.changingInstr(MI);
3384 
3385   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3386 
3387   // The remaining operands were used to set fields in the MemOperand on
3388   // construction.
3389   for (int I = 6; I > 3; --I)
3390     MI.RemoveOperand(I);
3391 
3392   MI.RemoveOperand(1); // Remove the intrinsic ID.
3393   Observer.changedInstr(MI);
3394   return true;
3395 }
3396 
3397 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3398                                             MachineRegisterInfo &MRI,
3399                                             MachineIRBuilder &B) const {
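  // The implicit argument pointer is the kernarg segment pointer advanced past
  // the explicit kernel arguments, so load the kernarg pointer and offset it
  // by the implicit argument offset.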
3400   uint64_t Offset =
3401     ST.getTargetLowering()->getImplicitParameterOffset(
3402       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3403   LLT DstTy = MRI.getType(DstReg);
3404   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3405 
3406   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3407   if (!loadInputValue(KernargPtrReg, B,
3408                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3409     return false;
3410 
3411   // FIXME: This should be nuw
3412   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3413   return true;
3414 }
3415 
3416 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3417                                                  MachineRegisterInfo &MRI,
3418                                                  MachineIRBuilder &B) const {
3419   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3420   if (!MFI->isEntryFunction()) {
3421     return legalizePreloadedArgIntrin(MI, MRI, B,
3422                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3423   }
3424 
3425   Register DstReg = MI.getOperand(0).getReg();
3426   if (!getImplicitArgPtr(DstReg, MRI, B))
3427     return false;
3428 
3429   MI.eraseFromParent();
3430   return true;
3431 }
3432 
3433 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3434                                               MachineRegisterInfo &MRI,
3435                                               MachineIRBuilder &B,
3436                                               unsigned AddrSpace) const {
3437   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3438   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3439   Register Hi32 = Unmerge.getReg(1);
3440 
3441   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3442   MI.eraseFromParent();
3443   return true;
3444 }
3445 
3446 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3447 // offset (the offset that is included in bounds checking and swizzling, to be
3448 // split between the instruction's voffset and immoffset fields) and soffset
3449 // (the offset that is excluded from bounds checking and swizzling, to go in
3450 // the instruction's soffset field).  This function takes the first kind of
3451 // offset and figures out how to split it between voffset and immoffset.
3452 std::tuple<Register, unsigned, unsigned>
3453 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3454                                         Register OrigOffset) const {
3455   const unsigned MaxImm = 4095;
3456   Register BaseReg;
3457   unsigned TotalConstOffset;
3458   MachineInstr *OffsetDef;
3459   const LLT S32 = LLT::scalar(32);
3460 
3461   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3462     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3463 
3464   unsigned ImmOffset = TotalConstOffset;
3465 
3466   // If the immediate value is too big for the immoffset field, only keep the
3467   // low 12 bits (offset & 4095) in the immoffset field, so that the value that
3468   // is copied/added for the voffset field is a multiple of 4096 and stands more
3469   // chance of being CSEd with the copy/add for another similar load/store.
3470   // However, do not do that rounding down to a multiple of 4096 if that is a
3471   // negative number, as it appears to be illegal to have a negative offset
3472   // in the vgpr, even if adding the immediate offset makes it positive.
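  // For example, a constant offset of 4660 (0x1234) is split below into
  // ImmOffset = 564 (0x234), with Overflow = 4096 (0x1000) added into the base
  // register.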
3473   unsigned Overflow = ImmOffset & ~MaxImm;
3474   ImmOffset -= Overflow;
3475   if ((int32_t)Overflow < 0) {
3476     Overflow += ImmOffset;
3477     ImmOffset = 0;
3478   }
3479 
3480   if (Overflow != 0) {
3481     if (!BaseReg) {
3482       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3483     } else {
3484       auto OverflowVal = B.buildConstant(S32, Overflow);
3485       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3486     }
3487   }
3488 
3489   if (!BaseReg)
3490     BaseReg = B.buildConstant(S32, 0).getReg(0);
3491 
3492   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3493 }
3494 
3495 /// Handle register layout difference for f16 images for some subtargets.
3496 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3497                                              MachineRegisterInfo &MRI,
3498                                              Register Reg) const {
3499   if (!ST.hasUnpackedD16VMem())
3500     return Reg;
3501 
3502   const LLT S16 = LLT::scalar(16);
3503   const LLT S32 = LLT::scalar(32);
3504   LLT StoreVT = MRI.getType(Reg);
3505   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3506 
3507   auto Unmerge = B.buildUnmerge(S16, Reg);
3508 
3509   SmallVector<Register, 4> WideRegs;
3510   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3511     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3512 
3513   int NumElts = StoreVT.getNumElements();
3514 
3515   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3516 }
3517 
3518 Register AMDGPULegalizerInfo::fixStoreSourceType(
3519   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3520   MachineRegisterInfo *MRI = B.getMRI();
3521   LLT Ty = MRI->getType(VData);
3522 
3523   const LLT S16 = LLT::scalar(16);
3524 
3525   // Fixup illegal register types for i8 stores.
3526   if (Ty == LLT::scalar(8) || Ty == S16) {
3527     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3528     return AnyExt;
3529   }
3530 
3531   if (Ty.isVector()) {
3532     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3533       if (IsFormat)
3534         return handleD16VData(B, *MRI, VData);
3535     }
3536   }
3537 
3538   return VData;
3539 }
3540 
3541 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3542                                               MachineRegisterInfo &MRI,
3543                                               MachineIRBuilder &B,
3544                                               bool IsTyped,
3545                                               bool IsFormat) const {
3546   Register VData = MI.getOperand(1).getReg();
3547   LLT Ty = MRI.getType(VData);
3548   LLT EltTy = Ty.getScalarType();
3549   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3550   const LLT S32 = LLT::scalar(32);
3551 
3552   VData = fixStoreSourceType(B, VData, IsFormat);
3553   Register RSrc = MI.getOperand(2).getReg();
3554 
3555   MachineMemOperand *MMO = *MI.memoperands_begin();
3556   const int MemSize = MMO->getSize();
3557 
3558   unsigned ImmOffset;
3559   unsigned TotalOffset;
3560 
3561   // The typed intrinsics add an immediate after the registers.
3562   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3563 
3564   // The struct intrinsic variants add one additional operand over raw.
3565   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3566   Register VIndex;
3567   int OpOffset = 0;
3568   if (HasVIndex) {
3569     VIndex = MI.getOperand(3).getReg();
3570     OpOffset = 1;
3571   }
3572 
3573   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3574   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3575 
3576   unsigned Format = 0;
3577   if (IsTyped) {
3578     Format = MI.getOperand(5 + OpOffset).getImm();
3579     ++OpOffset;
3580   }
3581 
3582   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3583 
3584   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3585   if (TotalOffset != 0)
3586     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3587 
3588   unsigned Opc;
3589   if (IsTyped) {
3590     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3591                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3592   } else if (IsFormat) {
3593     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3594                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3595   } else {
3596     switch (MemSize) {
3597     case 1:
3598       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3599       break;
3600     case 2:
3601       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3602       break;
3603     default:
3604       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3605       break;
3606     }
3607   }
3608 
3609   if (!VIndex)
3610     VIndex = B.buildConstant(S32, 0).getReg(0);
3611 
3612   auto MIB = B.buildInstr(Opc)
3613     .addUse(VData)              // vdata
3614     .addUse(RSrc)               // rsrc
3615     .addUse(VIndex)             // vindex
3616     .addUse(VOffset)            // voffset
3617     .addUse(SOffset)            // soffset
3618     .addImm(ImmOffset);         // offset(imm)
3619 
3620   if (IsTyped)
3621     MIB.addImm(Format);
3622 
3623   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3624      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3625      .addMemOperand(MMO);
3626 
3627   MI.eraseFromParent();
3628   return true;
3629 }
3630 
3631 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3632                                              MachineRegisterInfo &MRI,
3633                                              MachineIRBuilder &B,
3634                                              bool IsFormat,
3635                                              bool IsTyped) const {
3636   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3637   MachineMemOperand *MMO = *MI.memoperands_begin();
3638   const int MemSize = MMO->getSize();
3639   const LLT S32 = LLT::scalar(32);
3640 
3641   Register Dst = MI.getOperand(0).getReg();
3642   Register RSrc = MI.getOperand(2).getReg();
3643 
3644   // The typed intrinsics add an immediate after the registers.
3645   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3646 
3647   // The struct intrinsic variants add one additional operand over raw.
3648   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3649   Register VIndex;
3650   int OpOffset = 0;
3651   if (HasVIndex) {
3652     VIndex = MI.getOperand(3).getReg();
3653     OpOffset = 1;
3654   }
3655 
3656   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3657   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3658 
3659   unsigned Format = 0;
3660   if (IsTyped) {
3661     Format = MI.getOperand(5 + OpOffset).getImm();
3662     ++OpOffset;
3663   }
3664 
3665   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3666   unsigned ImmOffset;
3667   unsigned TotalOffset;
3668 
3669   LLT Ty = MRI.getType(Dst);
3670   LLT EltTy = Ty.getScalarType();
3671   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3672   const bool Unpacked = ST.hasUnpackedD16VMem();
3673 
3674   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3675   if (TotalOffset != 0)
3676     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3677 
3678   unsigned Opc;
3679 
3680   if (IsTyped) {
3681     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3682                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3683   } else if (IsFormat) {
3684     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3685                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3686   } else {
3687     switch (MemSize) {
3688     case 1:
3689       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3690       break;
3691     case 2:
3692       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3693       break;
3694     default:
3695       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3696       break;
3697     }
3698   }
3699 
3700   Register LoadDstReg;
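  // Sub-dword scalar loads (and scalar d16 loads) are widened to a 32-bit
  // result and truncated afterwards; on subtargets with unpacked d16, d16
  // vector results are loaded as one s32 per element and repacked below.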
3701 
3702   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3703   LLT UnpackedTy = Ty.changeElementSize(32);
3704 
3705   if (IsExtLoad)
3706     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3707   else if (Unpacked && IsD16 && Ty.isVector())
3708     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3709   else
3710     LoadDstReg = Dst;
3711 
3712   if (!VIndex)
3713     VIndex = B.buildConstant(S32, 0).getReg(0);
3714 
3715   auto MIB = B.buildInstr(Opc)
3716     .addDef(LoadDstReg)         // vdata
3717     .addUse(RSrc)               // rsrc
3718     .addUse(VIndex)             // vindex
3719     .addUse(VOffset)            // voffset
3720     .addUse(SOffset)            // soffset
3721     .addImm(ImmOffset);         // offset(imm)
3722 
3723   if (IsTyped)
3724     MIB.addImm(Format);
3725 
3726   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3727      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3728      .addMemOperand(MMO);
3729 
3730   if (LoadDstReg != Dst) {
3731     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3732 
3733     // The result for an extending load was widened to s32; truncate back down.
3734     if (IsExtLoad)
3735       B.buildTrunc(Dst, LoadDstReg);
3736     else {
3737       // Repack to original 16-bit vector result
3738       // FIXME: G_TRUNC should work, but legalization currently fails
3739       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3740       SmallVector<Register, 4> Repack;
3741       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3742         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3743       B.buildMerge(Dst, Repack);
3744     }
3745   }
3746 
3747   MI.eraseFromParent();
3748   return true;
3749 }
3750 
3751 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3752                                                MachineIRBuilder &B,
3753                                                bool IsInc) const {
3754   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3755                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3756   B.buildInstr(Opc)
3757     .addDef(MI.getOperand(0).getReg())
3758     .addUse(MI.getOperand(2).getReg())
3759     .addUse(MI.getOperand(3).getReg())
3760     .cloneMemRefs(MI);
3761   MI.eraseFromParent();
3762   return true;
3763 }
3764 
3765 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3766   switch (IntrID) {
3767   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3768   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3769     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3770   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3771   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3772     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3773   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3774   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3775     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3776   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3777   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3778     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3779   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3780   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3781     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3782   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3783   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3784     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3785   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3786   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3787     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3788   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3789   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3790     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3791   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3792   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3793     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3794   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3795   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3796     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3797   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3798   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3799     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3800   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3801   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3802     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3803   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3804   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3805     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3806   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3807   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3808     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3809   default:
3810     llvm_unreachable("unhandled atomic opcode");
3811   }
3812 }
3813 
3814 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3815                                                MachineIRBuilder &B,
3816                                                Intrinsic::ID IID) const {
3817   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3818                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3819   const bool HasReturn = MI.getNumExplicitDefs() != 0;
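  // When the atomic has no return value, every subsequent operand index shifts
  // down by one; OpOffset accounts for that below.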
3820 
3821   Register Dst;
3822 
3823   int OpOffset = 0;
3824   if (HasReturn) {
3825     // A few FP atomics do not support return values.
3826     Dst = MI.getOperand(0).getReg();
3827   } else {
3828     OpOffset = -1;
3829   }
3830 
3831   Register VData = MI.getOperand(2 + OpOffset).getReg();
3832   Register CmpVal;
3833 
3834   if (IsCmpSwap) {
3835     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3836     ++OpOffset;
3837   }
3838 
3839   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3840   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3841 
3842   // The struct intrinsic variants add one additional operand over raw.
3843   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3844   Register VIndex;
3845   if (HasVIndex) {
3846     VIndex = MI.getOperand(4 + OpOffset).getReg();
3847     ++OpOffset;
3848   }
3849 
3850   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3851   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3852   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3853 
3854   MachineMemOperand *MMO = *MI.memoperands_begin();
3855 
3856   unsigned ImmOffset;
3857   unsigned TotalOffset;
3858   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3859   if (TotalOffset != 0)
3860     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3861 
3862   if (!VIndex)
3863     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3864 
3865   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3866 
3867   if (HasReturn)
3868     MIB.addDef(Dst);
3869 
3870   MIB.addUse(VData); // vdata
3871 
3872   if (IsCmpSwap)
3873     MIB.addReg(CmpVal);
3874 
3875   MIB.addUse(RSrc)               // rsrc
3876      .addUse(VIndex)             // vindex
3877      .addUse(VOffset)            // voffset
3878      .addUse(SOffset)            // soffset
3879      .addImm(ImmOffset)          // offset(imm)
3880      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3881      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3882      .addMemOperand(MMO);
3883 
3884   MI.eraseFromParent();
3885   return true;
3886 }
3887 
3888 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3889 /// vectors, appending the packed registers to \p PackedAddrs.
3890 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3891                                         SmallVectorImpl<Register> &PackedAddrs,
3892                                         int AddrIdx, int DimIdx, int EndIdx,
3893                                         int NumGradients) {
3894   const LLT S16 = LLT::scalar(16);
3895   const LLT V2S16 = LLT::vector(2, 16);
3896 
3897   for (int I = AddrIdx; I < EndIdx; ++I) {
3898     MachineOperand &SrcOp = MI.getOperand(I);
3899     if (!SrcOp.isReg())
3900       continue; // _L to _LZ may have eliminated this.
3901 
3902     Register AddrReg = SrcOp.getReg();
3903 
3904     if (I < DimIdx) {
3905       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3906       PackedAddrs.push_back(AddrReg);
3907     } else {
3908       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3909       // derivatives dx/dh and dx/dv are packed with undef.
3910       if (((I + 1) >= EndIdx) ||
3911           ((NumGradients / 2) % 2 == 1 &&
3912            (I == DimIdx + (NumGradients / 2) - 1 ||
3913             I == DimIdx + NumGradients - 1)) ||
3914           // Check for _L to _LZ optimization
3915           !MI.getOperand(I + 1).isReg()) {
3916         PackedAddrs.push_back(
3917             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3918                 .getReg(0));
3919       } else {
3920         PackedAddrs.push_back(
3921             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3922                 .getReg(0));
3923         ++I;
3924       }
3925     }
3926   }
3927 }
3928 
3929 /// Convert from separate vaddr components to a single vector address register,
3930 /// and replace the remaining operands with $noreg.
3931 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3932                                      int DimIdx, int NumVAddrs) {
3933   const LLT S32 = LLT::scalar(32);
3934 
3935   SmallVector<Register, 8> AddrRegs;
3936   for (int I = 0; I != NumVAddrs; ++I) {
3937     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3938     if (SrcOp.isReg()) {
3939       AddrRegs.push_back(SrcOp.getReg());
3940       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3941     }
3942   }
3943 
3944   int NumAddrRegs = AddrRegs.size();
3945   if (NumAddrRegs != 1) {
3946     // Round up to 8 elements for v5-v7
3947     // FIXME: Missing intermediate sized register classes and instructions.
3948     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3949       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3950       auto Undef = B.buildUndef(S32);
3951       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3952       NumAddrRegs = RoundedNumRegs;
3953     }
3954 
3955     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3956     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3957   }
3958 
3959   for (int I = 1; I != NumVAddrs; ++I) {
3960     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3961     if (SrcOp.isReg())
3962       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3963   }
3964 }
3965 
3966 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3967 ///
3968 /// Depending on the subtarget, load/store with 16-bit element data need to be
3969 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3970 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3971 /// registers.
3972 ///
3973 /// We don't want to directly select image instructions just yet, but also want
3974 /// to expose all register repacking to the legalizer/combiners. We also don't
3975 /// want a selected instruction entering RegBankSelect. In order to avoid
3976 /// defining a multitude of intermediate image instructions, directly hack on
3977 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3978 /// the now unnecessary arguments with $noreg.
3979 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3980     MachineInstr &MI, MachineIRBuilder &B,
3981     GISelChangeObserver &Observer,
3982     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3983 
3984   const int NumDefs = MI.getNumExplicitDefs();
3985   bool IsTFE = NumDefs == 2;
3986   // We are only processing the operands of d16 image operations on subtargets
3987   // that use the unpacked register layout, or need to repack the TFE result.
3988 
3989   // TODO: Do we need to guard against already legalized intrinsics?
3990   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3991     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3992 
3993   MachineRegisterInfo *MRI = B.getMRI();
3994   const LLT S32 = LLT::scalar(32);
3995   const LLT S16 = LLT::scalar(16);
3996   const LLT V2S16 = LLT::vector(2, 16);
3997 
3998   // Index of first address argument
3999   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4000 
4001   int NumVAddrs, NumGradients;
4002   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4003   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4004     getDMaskIdx(BaseOpcode, NumDefs);
4005   unsigned DMask = 0;
4006 
4007   // Check for 16 bit addresses and pack if true.
4008   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4009   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4010   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
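  // The gradient operands (if any) start at DimIdx and the coordinate operands
  // follow them, which is why the types are sampled at DimIdx and
  // DimIdx + NumGradients respectively.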
4011   const bool IsG16 = GradTy == S16;
4012   const bool IsA16 = AddrTy == S16;
4013 
4014   int DMaskLanes = 0;
4015   if (!BaseOpcode->Atomic) {
4016     DMask = MI.getOperand(DMaskIdx).getImm();
4017     if (BaseOpcode->Gather4) {
4018       DMaskLanes = 4;
4019     } else if (DMask != 0) {
4020       DMaskLanes = countPopulation(DMask);
4021     } else if (!IsTFE && !BaseOpcode->Store) {
4022       // If dmask is 0, this is a no-op load. This can be eliminated.
4023       B.buildUndef(MI.getOperand(0));
4024       MI.eraseFromParent();
4025       return true;
4026     }
4027   }
4028 
4029   Observer.changingInstr(MI);
4030   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4031 
4032   unsigned NewOpcode = NumDefs == 0 ?
4033     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4034 
4035   // Track that we legalized this
4036   MI.setDesc(B.getTII().get(NewOpcode));
4037 
4038   // We expect to get an error flag since TFC is on and dmask is 0. Force
4039   // dmask to be at least 1, otherwise the instruction will fail.
4040   if (IsTFE && DMask == 0) {
4041     DMask = 0x1;
4042     DMaskLanes = 1;
4043     MI.getOperand(DMaskIdx).setImm(DMask);
4044   }
4045 
4046   if (BaseOpcode->Atomic) {
4047     Register VData0 = MI.getOperand(2).getReg();
4048     LLT Ty = MRI->getType(VData0);
4049 
4050     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4051     if (Ty.isVector())
4052       return false;
4053 
4054     if (BaseOpcode->AtomicX2) {
4055       Register VData1 = MI.getOperand(3).getReg();
4056       // The two values are packed in one register.
4057       LLT PackedTy = LLT::vector(2, Ty);
4058       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4059       MI.getOperand(2).setReg(Concat.getReg(0));
4060       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4061     }
4062   }
4063 
4064   int CorrectedNumVAddrs = NumVAddrs;
4065 
4066   // Optimize _L to _LZ when _L is zero
4067   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4068         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4069     const ConstantFP *ConstantLod;
4070     const int LodIdx = AddrIdx + NumVAddrs - 1;
4071 
4072     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4073       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4074         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4075         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4076           LZMappingInfo->LZ, ImageDimIntr->Dim);
4077 
4078         // The starting indexes should remain in the same place.
4079         --NumVAddrs;
4080         --CorrectedNumVAddrs;
4081 
4082         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4083           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4084         MI.RemoveOperand(LodIdx);
4085       }
4086     }
4087   }
4088 
4089   // Optimize _mip away, when 'lod' is zero
4090   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4091     int64_t ConstantLod;
4092     const int LodIdx = AddrIdx + NumVAddrs - 1;
4093 
4094     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4095       if (ConstantLod == 0) {
4096         // TODO: Change the intrinsic opcode and remove the operand instead of
4097         // replacing it with 0, as the _L to _LZ handling is done above.
4098         MI.getOperand(LodIdx).ChangeToImmediate(0);
4099         --CorrectedNumVAddrs;
4100       }
4101     }
4102   }
4103 
4104   // Rewrite the addressing register layout before doing anything else.
4105   if (IsA16 || IsG16) {
4106     if (IsA16) {
4107       // Target must support the feature and gradients need to be 16 bit too
4108       if (!ST.hasA16() || !IsG16)
4109         return false;
4110     } else if (!ST.hasG16())
4111       return false;
4112 
4113     if (NumVAddrs > 1) {
4114       SmallVector<Register, 4> PackedRegs;
4115       // Don't compress addresses for G16
4116       const int PackEndIdx =
4117           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4118       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4119                                   PackEndIdx, NumGradients);
4120 
4121       if (!IsA16) {
4122         // Add uncompressed address
4123         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4124           Register AddrReg = MI.getOperand(I).getReg();
4125           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4126           PackedRegs.push_back(AddrReg);
4127         }
4128       }
4129 
4130       // See also below in the non-a16 branch
4131       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4132 
4133       if (!UseNSA && PackedRegs.size() > 1) {
4134         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4135         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4136         PackedRegs[0] = Concat.getReg(0);
4137         PackedRegs.resize(1);
4138       }
4139 
4140       const int NumPacked = PackedRegs.size();
4141       for (int I = 0; I != NumVAddrs; ++I) {
4142         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4143         if (!SrcOp.isReg()) {
4144           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4145           continue;
4146         }
4147 
4148         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4149 
4150         if (I < NumPacked)
4151           SrcOp.setReg(PackedRegs[I]);
4152         else
4153           SrcOp.setReg(AMDGPU::NoRegister);
4154       }
4155     }
4156   } else {
4157     // If the register allocator cannot place the address registers contiguously
4158     // without introducing moves, then using the non-sequential address encoding
4159     // is always preferable, since it saves VALU instructions and is usually a
4160     // wash in terms of code size or even better.
4161     //
4162     // However, we currently have no way of hinting to the register allocator
4163     // that MIMG addresses should be placed contiguously when it is possible to
4164     // do so, so force non-NSA for the common 2-address case as a heuristic.
4165     //
4166     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4167     // allocation when possible.
4168     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4169 
4170     if (!UseNSA && NumVAddrs > 1)
4171       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4172   }
4173 
4174   int Flags = 0;
4175   if (IsA16)
4176     Flags |= 1;
4177   if (IsG16)
4178     Flags |= 2;
4179   MI.addOperand(MachineOperand::CreateImm(Flags));
4180 
4181   if (BaseOpcode->Store) { // No TFE for stores?
4182     // TODO: Handle dmask trim
4183     Register VData = MI.getOperand(1).getReg();
4184     LLT Ty = MRI->getType(VData);
4185     if (!Ty.isVector() || Ty.getElementType() != S16)
4186       return true;
4187 
4188     Register RepackedReg = handleD16VData(B, *MRI, VData);
4189     if (RepackedReg != VData) {
4190       MI.getOperand(1).setReg(RepackedReg);
4191     }
4192 
4193     return true;
4194   }
4195 
4196   Register DstReg = MI.getOperand(0).getReg();
4197   LLT Ty = MRI->getType(DstReg);
4198   const LLT EltTy = Ty.getScalarType();
4199   const bool IsD16 = Ty.getScalarType() == S16;
4200   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4201 
4202   // Confirm that the return type is large enough for the dmask specified
4203   if (NumElts < DMaskLanes)
4204     return false;
4205 
4206   if (NumElts > 4 || DMaskLanes > 4)
4207     return false;
4208 
4209   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4210   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4211 
4212   // The raw dword aligned data component of the load. The only legal cases
4213   // where this matters should be when using the packed D16 format, for
4214   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4215   LLT RoundedTy;
4216 
4217   // S32 vector to cover all data, plus TFE result element.
4218   LLT TFETy;
4219 
4220   // Register type to use for each loaded component. Will be S32 or V2S16.
4221   LLT RegTy;
4222 
4223   if (IsD16 && ST.hasUnpackedD16VMem()) {
4224     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4225     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4226     RegTy = S32;
4227   } else {
4228     unsigned EltSize = EltTy.getSizeInBits();
4229     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4230     unsigned RoundedSize = 32 * RoundedElts;
4231     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4232     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4233     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4234   }
4235 
4236   // The return type does not need adjustment.
4237   // TODO: Should we change s16 case to s32 or <2 x s16>?
4238   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4239     return true;
4240 
4241   Register Dst1Reg;
4242 
4243   // Insert after the instruction.
4244   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4245 
4246   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4247   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4248   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4249   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4250 
4251   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4252 
4253   MI.getOperand(0).setReg(NewResultReg);
4254 
4255   // In the IR, TFE is supposed to be used with a 2 element struct return
4256   // type. The instruction really returns these two values in one contiguous
4257   // register, with one additional dword beyond the loaded data. Rewrite the
4258   // return type to use a single register result.
4259 
4260   if (IsTFE) {
4261     Dst1Reg = MI.getOperand(1).getReg();
4262     if (MRI->getType(Dst1Reg) != S32)
4263       return false;
4264 
4265     // TODO: Make sure the TFE operand bit is set.
4266     MI.RemoveOperand(1);
4267 
4268     // Handle the easy case that requires no repack instructions.
4269     if (Ty == S32) {
4270       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4271       return true;
4272     }
4273   }
4274 
4275   // Now figure out how to copy the new result register back into the old
4276   // result.
4277   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4278 
4279   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4280 
4281   if (ResultNumRegs == 1) {
4282     assert(!IsTFE);
4283     ResultRegs[0] = NewResultReg;
4284   } else {
4285     // We have to repack into a new vector of some kind.
4286     for (int I = 0; I != NumDataRegs; ++I)
4287       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4288     B.buildUnmerge(ResultRegs, NewResultReg);
4289 
4290     // Drop the final TFE element to get the data part. The TFE result is
4291     // directly written to the right place already.
4292     if (IsTFE)
4293       ResultRegs.resize(NumDataRegs);
4294   }
4295 
4296   // For an s16 scalar result, we form an s32 result with a truncate regardless
4297   // of packed vs. unpacked.
4298   if (IsD16 && !Ty.isVector()) {
4299     B.buildTrunc(DstReg, ResultRegs[0]);
4300     return true;
4301   }
4302 
4303   // Avoid a build/concat_vector of 1 entry.
4304   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4305     B.buildBitcast(DstReg, ResultRegs[0]);
4306     return true;
4307   }
4308 
4309   assert(Ty.isVector());
4310 
4311   if (IsD16) {
4312     // For packed D16 results with TFE enabled, all the data components are
4313     // S32. Cast back to the expected type.
4314     //
4315     // TODO: We don't really need to load s32 elements. We would only need one
4316     // cast for the TFE result if a multiple of v2s16 was used.
4317     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4318       for (Register &Reg : ResultRegs)
4319         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4320     } else if (ST.hasUnpackedD16VMem()) {
4321       for (Register &Reg : ResultRegs)
4322         Reg = B.buildTrunc(S16, Reg).getReg(0);
4323     }
4324   }
4325 
4326   auto padWithUndef = [&](LLT Ty, int NumElts) {
4327     if (NumElts == 0)
4328       return;
4329     Register Undef = B.buildUndef(Ty).getReg(0);
4330     for (int I = 0; I != NumElts; ++I)
4331       ResultRegs.push_back(Undef);
4332   };
4333 
4334   // Pad out any elements eliminated due to the dmask.
4335   LLT ResTy = MRI->getType(ResultRegs[0]);
4336   if (!ResTy.isVector()) {
4337     padWithUndef(ResTy, NumElts - ResultRegs.size());
4338     B.buildBuildVector(DstReg, ResultRegs);
4339     return true;
4340   }
4341 
4342   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4343   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4344 
4345   // Deal with the one annoying legal case.
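  // Pad with undef <2 x s16> pieces up to three (i.e. <6 x s16> in total),
  // concat them, then unmerge into the <3 x s16> result plus a dead
  // <3 x s16> remainder.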
4346   const LLT V3S16 = LLT::vector(3, 16);
4347   if (Ty == V3S16) {
4348     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4349     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4350     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4351     return true;
4352   }
4353 
4354   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4355   B.buildConcatVectors(DstReg, ResultRegs);
4356   return true;
4357 }
4358 
4359 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4360   LegalizerHelper &Helper, MachineInstr &MI) const {
4361   MachineIRBuilder &B = Helper.MIRBuilder;
4362   GISelChangeObserver &Observer = Helper.Observer;
4363 
4364   Register Dst = MI.getOperand(0).getReg();
4365   LLT Ty = B.getMRI()->getType(Dst);
4366   unsigned Size = Ty.getSizeInBits();
4367   MachineFunction &MF = B.getMF();
4368 
4369   Observer.changingInstr(MI);
4370 
4371   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4372     Ty = getBitcastRegisterType(Ty);
4373     Helper.bitcastDst(MI, Ty, 0);
4374     Dst = MI.getOperand(0).getReg();
4375     B.setInsertPt(B.getMBB(), MI);
4376   }
4377 
4378   // FIXME: We don't really need this intermediate instruction. The intrinsic
4379   // should be fixed to have a memory operand. Since it's readnone, we're not
4380   // allowed to add one.
4381   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4382   MI.RemoveOperand(1); // Remove intrinsic ID
4383 
4384   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4385   // TODO: Should this use datalayout alignment?
4386   const unsigned MemSize = (Size + 7) / 8;
4387   const Align MemAlign(4);
4388   MachineMemOperand *MMO = MF.getMachineMemOperand(
4389       MachinePointerInfo(),
4390       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4391           MachineMemOperand::MOInvariant,
4392       MemSize, MemAlign);
4393   MI.addMemOperand(MF, MMO);
4394 
4395   // There are no 96-bit result scalar loads, but widening to 128-bit should
4396   // always be legal. We may need to restore this to a 96-bit result if it turns
4397   // out this needs to be converted to a vector load during RegBankSelect.
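  // e.g. a <3 x s32> result is widened to <4 x s32>, and an s96 result to
  // s128.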
4398   if (!isPowerOf2_32(Size)) {
4399     if (Ty.isVector())
4400       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4401     else
4402       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4403   }
4404 
4405   Observer.changedInstr(MI);
4406   return true;
4407 }
4408 
4409 // TODO: Move to selection
4410 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4411                                                 MachineRegisterInfo &MRI,
4412                                                 MachineIRBuilder &B) const {
4413   // On a non-HSA path, or if the trap handler is disabled, insert s_endpgm.
4414   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4415       !ST.isTrapHandlerEnabled()) {
4416     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4417   } else {
4418     // Pass queue pointer to trap handler as input, and insert trap instruction
4419     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4420     MachineRegisterInfo &MRI = *B.getMRI();
4421 
4422     Register LiveIn =
4423       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4424     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4425       return false;
4426 
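    // The trap handler ABI referenced above expects the queue pointer in
    // SGPR0/SGPR1.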
4427     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4428     B.buildCopy(SGPR01, LiveIn);
4429     B.buildInstr(AMDGPU::S_TRAP)
4430         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4431         .addReg(SGPR01, RegState::Implicit);
4432   }
4433 
4434   MI.eraseFromParent();
4435   return true;
4436 }
4437 
4438 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4439     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4440   // On a non-HSA path, or if the trap handler is disabled, report a warning
4441   // accordingly.
4442   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4443       !ST.isTrapHandlerEnabled()) {
4444     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4445                                      "debugtrap handler not supported",
4446                                      MI.getDebugLoc(), DS_Warning);
4447     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4448     Ctx.diagnose(NoTrap);
4449   } else {
4450     // Insert debug-trap instruction
4451     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4452   }
4453 
4454   MI.eraseFromParent();
4455   return true;
4456 }
4457 
4458 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4459                                             MachineInstr &MI) const {
4460   MachineIRBuilder &B = Helper.MIRBuilder;
4461   MachineRegisterInfo &MRI = *B.getMRI();
4462 
4463   // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
4464   auto IntrID = MI.getIntrinsicID();
4465   switch (IntrID) {
4466   case Intrinsic::amdgcn_if:
4467   case Intrinsic::amdgcn_else: {
4468     MachineInstr *Br = nullptr;
4469     MachineBasicBlock *UncondBrTarget = nullptr;
4470     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4471       const SIRegisterInfo *TRI
4472         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4473 
4474       Register Def = MI.getOperand(1).getReg();
4475       Register Use = MI.getOperand(3).getReg();
4476 
4477       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4478       B.setInsertPt(B.getMBB(), BrCond->getIterator());
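      // SI_IF/SI_ELSE update the exec mask for the lanes taking this branch
      // and jump to UncondBrTarget once no lanes remain active; the original
      // conditional target stays reachable via the branch rebuilt below.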
4479       if (IntrID == Intrinsic::amdgcn_if) {
4480         B.buildInstr(AMDGPU::SI_IF)
4481           .addDef(Def)
4482           .addUse(Use)
4483           .addMBB(UncondBrTarget);
4484       } else {
4485         B.buildInstr(AMDGPU::SI_ELSE)
4486           .addDef(Def)
4487           .addUse(Use)
4488           .addMBB(UncondBrTarget)
4489           .addImm(0);
4490       }
4491 
4492       if (Br) {
4493         Br->getOperand(0).setMBB(CondBrTarget);
4494       } else {
4495         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4496         // since we're swapping branch targets it needs to be reinserted.
4497         // FIXME: IRTranslator should probably not do this
4498         B.buildBr(*CondBrTarget);
4499       }
4500 
4501       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4502       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4503       MI.eraseFromParent();
4504       BrCond->eraseFromParent();
4505       return true;
4506     }
4507 
4508     return false;
4509   }
4510   case Intrinsic::amdgcn_loop: {
4511     MachineInstr *Br = nullptr;
4512     MachineBasicBlock *UncondBrTarget = nullptr;
4513     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4514       const SIRegisterInfo *TRI
4515         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4516 
4517       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4518       Register Reg = MI.getOperand(2).getReg();
4519 
4520       B.setInsertPt(B.getMBB(), BrCond->getIterator());
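      // SI_LOOP clears the finished lanes from exec and branches back to
      // UncondBrTarget (the loop header) while any lanes remain active.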
4521       B.buildInstr(AMDGPU::SI_LOOP)
4522         .addUse(Reg)
4523         .addMBB(UncondBrTarget);
4524 
4525       if (Br)
4526         Br->getOperand(0).setMBB(CondBrTarget);
4527       else
4528         B.buildBr(*CondBrTarget);
4529 
4530       MI.eraseFromParent();
4531       BrCond->eraseFromParent();
4532       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4533       return true;
4534     }
4535 
4536     return false;
4537   }
4538   case Intrinsic::amdgcn_kernarg_segment_ptr:
4539     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4540       // This only makes sense to call in a kernel, so just lower to null.
4541       B.buildConstant(MI.getOperand(0).getReg(), 0);
4542       MI.eraseFromParent();
4543       return true;
4544     }
4545 
4546     return legalizePreloadedArgIntrin(
4547       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4548   case Intrinsic::amdgcn_implicitarg_ptr:
4549     return legalizeImplicitArgPtr(MI, MRI, B);
4550   case Intrinsic::amdgcn_workitem_id_x:
4551     return legalizePreloadedArgIntrin(MI, MRI, B,
4552                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4553   case Intrinsic::amdgcn_workitem_id_y:
4554     return legalizePreloadedArgIntrin(MI, MRI, B,
4555                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4556   case Intrinsic::amdgcn_workitem_id_z:
4557     return legalizePreloadedArgIntrin(MI, MRI, B,
4558                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4559   case Intrinsic::amdgcn_workgroup_id_x:
4560     return legalizePreloadedArgIntrin(MI, MRI, B,
4561                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4562   case Intrinsic::amdgcn_workgroup_id_y:
4563     return legalizePreloadedArgIntrin(MI, MRI, B,
4564                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4565   case Intrinsic::amdgcn_workgroup_id_z:
4566     return legalizePreloadedArgIntrin(MI, MRI, B,
4567                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4568   case Intrinsic::amdgcn_dispatch_ptr:
4569     return legalizePreloadedArgIntrin(MI, MRI, B,
4570                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4571   case Intrinsic::amdgcn_queue_ptr:
4572     return legalizePreloadedArgIntrin(MI, MRI, B,
4573                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4574   case Intrinsic::amdgcn_implicit_buffer_ptr:
4575     return legalizePreloadedArgIntrin(
4576       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4577   case Intrinsic::amdgcn_dispatch_id:
4578     return legalizePreloadedArgIntrin(MI, MRI, B,
4579                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4580   case Intrinsic::amdgcn_fdiv_fast:
4581     return legalizeFDIVFastIntrin(MI, MRI, B);
4582   case Intrinsic::amdgcn_is_shared:
4583     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4584   case Intrinsic::amdgcn_is_private:
4585     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4586   case Intrinsic::amdgcn_wavefrontsize: {
4587     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4588     MI.eraseFromParent();
4589     return true;
4590   }
4591   case Intrinsic::amdgcn_s_buffer_load:
4592     return legalizeSBufferLoad(Helper, MI);
4593   case Intrinsic::amdgcn_raw_buffer_store:
4594   case Intrinsic::amdgcn_struct_buffer_store:
4595     return legalizeBufferStore(MI, MRI, B, false, false);
4596   case Intrinsic::amdgcn_raw_buffer_store_format:
4597   case Intrinsic::amdgcn_struct_buffer_store_format:
4598     return legalizeBufferStore(MI, MRI, B, false, true);
4599   case Intrinsic::amdgcn_raw_tbuffer_store:
4600   case Intrinsic::amdgcn_struct_tbuffer_store:
4601     return legalizeBufferStore(MI, MRI, B, true, true);
4602   case Intrinsic::amdgcn_raw_buffer_load:
4603   case Intrinsic::amdgcn_struct_buffer_load:
4604     return legalizeBufferLoad(MI, MRI, B, false, false);
4605   case Intrinsic::amdgcn_raw_buffer_load_format:
4606   case Intrinsic::amdgcn_struct_buffer_load_format:
4607     return legalizeBufferLoad(MI, MRI, B, true, false);
4608   case Intrinsic::amdgcn_raw_tbuffer_load:
4609   case Intrinsic::amdgcn_struct_tbuffer_load:
4610     return legalizeBufferLoad(MI, MRI, B, true, true);
4611   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4612   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4613   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4614   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4615   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4616   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4617   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4618   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4619   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4620   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4621   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4622   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4623   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4624   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4625   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4626   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4627   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4628   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4629   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4630   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4631   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4632   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4633   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4634   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4635   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4636   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4637   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4638   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4639     return legalizeBufferAtomic(MI, B, IntrID);
4640   case Intrinsic::amdgcn_atomic_inc:
4641     return legalizeAtomicIncDec(MI, B, true);
4642   case Intrinsic::amdgcn_atomic_dec:
4643     return legalizeAtomicIncDec(MI, B, false);
4644   case Intrinsic::trap:
4645     return legalizeTrapIntrinsic(MI, MRI, B);
4646   case Intrinsic::debugtrap:
4647     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4648   case Intrinsic::amdgcn_rsq_clamp:
4649     return legalizeRsqClampIntrinsic(MI, MRI, B);
4650   case Intrinsic::amdgcn_ds_fadd:
4651   case Intrinsic::amdgcn_ds_fmin:
4652   case Intrinsic::amdgcn_ds_fmax:
4653     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4654   default: {
4655     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4656             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4657       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4658     return true;
4659   }
4660   }
4661 
4662   return true;
4663 }
4664