1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than trying to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
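// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.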
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the number of bits up to the next power of two.
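// e.g. s17 becomes s32 and s65 becomes s128.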
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
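// A 16-bit element vector with more than two elements (wider than v2s16).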
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
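// Reduce the number of vector elements to form roughly 64-bit pieces,
// e.g. <4 x s32> (128 bits) is split into <2 x s32> pieces.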
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
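// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).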
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
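  // <8 x s8>  -> <2 x s32>
  // <6 x s16> -> <3 x s32>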
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
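// Whether the vector's element size and count allow it to be built entirely
// from whole 32-bit registers; 16-bit elements must come in even counts so
// they pack into full registers.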
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
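// The vector's element type is either s16 or at least 32 bits wide; false for
// non-vector types.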
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
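// A scalar truncating store: the value register is wider than 32 bits and
// wider than the number of bits actually written to memory.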
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338 
339   LLT EltTy = Ty.getElementType();
340   if (EltTy.isPointer())
341     return true;
342 
343   unsigned EltSize = EltTy.getSizeInBits();
344   return EltSize != 32 && EltSize != 64;
345 }
346 
347 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
348                              unsigned Opcode) {
349   const LLT Ty = Query.Types[0];
350   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
351          !loadStoreBitcastWorkaround(Ty);
352 }
353 
354 /// Return true if a load or store of the type should be lowered with a bitcast
355 /// to a different type.
356 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
357                                        const unsigned MemSizeInBits) {
358   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
361 
362   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
363     return true;
364   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
365          !isRegisterVectorElementType(Ty.getElementType());
366 }
367 
368 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
371 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
372                             unsigned AlignInBits, unsigned AddrSpace,
373                             unsigned Opcode) {
374   // We don't want to widen cases that are naturally legal.
375   if (isPowerOf2_32(SizeInBits))
376     return false;
377 
378   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
379   // end up widening these for a scalar load during RegBankSelect, since there
380   // aren't 96-bit scalar loads.
381   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
382     return false;
383 
384   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
385     return false;
386 
387   // A load is known dereferenceable up to the alignment, so it's legal to widen
388   // to it.
389   //
390   // TODO: Could check dereferenceable for less aligned cases.
391   unsigned RoundedSize = NextPowerOf2(SizeInBits);
392   if (AlignInBits < RoundedSize)
393     return false;
394 
395   // Do not widen if it would introduce a slow unaligned load.
396   const SITargetLowering *TLI = ST.getTargetLowering();
397   bool Fast = false;
398   return TLI->allowsMisalignedMemoryAccessesImpl(
399              RoundedSize, AddrSpace, Align(AlignInBits / 8),
400              MachineMemOperand::MOLoad, &Fast) &&
401          Fast;
402 }
403 
404 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
405                             unsigned Opcode) {
406   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
407     return false;
408 
409   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
410                          Query.MMODescrs[0].AlignInBits,
411                          Query.Types[1].getAddressSpace(), Opcode);
412 }
413 
414 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
415                                          const GCNTargetMachine &TM)
  : ST(ST_) {
417   using namespace TargetOpcode;
418 
419   auto GetAddrSpacePtr = [&TM](unsigned AS) {
420     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
421   };
422 
423   const LLT S1 = LLT::scalar(1);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S16 = LLT::vector(2, 16);
433   const LLT V4S16 = LLT::vector(4, 16);
434 
435   const LLT V2S32 = LLT::vector(2, 32);
436   const LLT V3S32 = LLT::vector(3, 32);
437   const LLT V4S32 = LLT::vector(4, 32);
438   const LLT V5S32 = LLT::vector(5, 32);
439   const LLT V6S32 = LLT::vector(6, 32);
440   const LLT V7S32 = LLT::vector(7, 32);
441   const LLT V8S32 = LLT::vector(8, 32);
442   const LLT V9S32 = LLT::vector(9, 32);
443   const LLT V10S32 = LLT::vector(10, 32);
444   const LLT V11S32 = LLT::vector(11, 32);
445   const LLT V12S32 = LLT::vector(12, 32);
446   const LLT V13S32 = LLT::vector(13, 32);
447   const LLT V14S32 = LLT::vector(14, 32);
448   const LLT V15S32 = LLT::vector(15, 32);
449   const LLT V16S32 = LLT::vector(16, 32);
450   const LLT V32S32 = LLT::vector(32, 32);
451 
452   const LLT V2S64 = LLT::vector(2, 64);
453   const LLT V3S64 = LLT::vector(3, 64);
454   const LLT V4S64 = LLT::vector(4, 64);
455   const LLT V5S64 = LLT::vector(5, 64);
456   const LLT V6S64 = LLT::vector(6, 64);
457   const LLT V7S64 = LLT::vector(7, 64);
458   const LLT V8S64 = LLT::vector(8, 64);
459   const LLT V16S64 = LLT::vector(16, 64);
460 
461   std::initializer_list<LLT> AllS32Vectors =
462     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
463      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
464   std::initializer_list<LLT> AllS64Vectors =
465     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
466 
467   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
468   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
469   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
470   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
471   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
472   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
473   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
474 
475   const LLT CodePtr = FlatPtr;
476 
477   const std::initializer_list<LLT> AddrSpaces64 = {
478     GlobalPtr, ConstantPtr, FlatPtr
479   };
480 
481   const std::initializer_list<LLT> AddrSpaces32 = {
482     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
483   };
484 
485   const std::initializer_list<LLT> FPTypesBase = {
486     S32, S64
487   };
488 
489   const std::initializer_list<LLT> FPTypes16 = {
490     S32, S64, S16
491   };
492 
493   const std::initializer_list<LLT> FPTypesPK16 = {
494     S32, S64, S16, V2S16
495   };
496 
497   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
498 
499   setAction({G_BRCOND, S1}, Legal); // VCC branches
500   setAction({G_BRCOND, S32}, Legal); // SCC branches
501 
502   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
503   // elements for v3s16
504   getActionDefinitionsBuilder(G_PHI)
505     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
506     .legalFor(AllS32Vectors)
507     .legalFor(AllS64Vectors)
508     .legalFor(AddrSpaces64)
509     .legalFor(AddrSpaces32)
510     .legalIf(isPointer(0))
511     .clampScalar(0, S16, S256)
512     .widenScalarToNextPow2(0, 32)
513     .clampMaxNumElements(0, S32, 16)
514     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515     .scalarize(0);
516 
517   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
518     // Full set of gfx9 features.
519     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
520       .legalFor({S32, S16, V2S16})
521       .clampScalar(0, S16, S32)
522       .clampMaxNumElements(0, S16, 2)
523       .scalarize(0)
524       .widenScalarToNextPow2(0, 32);
525 
526     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
527       .legalFor({S32, S16, V2S16}) // Clamp modifier
528       .minScalarOrElt(0, S16)
529       .clampMaxNumElements(0, S16, 2)
530       .scalarize(0)
531       .widenScalarToNextPow2(0, 32)
532       .lower();
533   } else if (ST.has16BitInsts()) {
534     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
535       .legalFor({S32, S16})
536       .clampScalar(0, S16, S32)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
539 
540     // Technically the saturating operations require clamp bit support, but this
541     // was introduced at the same time as 16-bit operations.
542     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
543       .legalFor({S32, S16}) // Clamp modifier
544       .minScalar(0, S16)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 16)
547       .lower();
548 
549     // We're just lowering this, but it helps get a better result to try to
550     // coerce to the desired type first.
551     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
552       .minScalar(0, S16)
553       .scalarize(0)
554       .lower();
555   } else {
556     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
557       .legalFor({S32})
558       .clampScalar(0, S32, S32)
559       .scalarize(0);
560 
561     if (ST.hasIntClamp()) {
562       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
563         .legalFor({S32}) // Clamp modifier.
564         .scalarize(0)
565         .minScalarOrElt(0, S32)
566         .lower();
567     } else {
568       // Clamp bit support was added in VI, along with 16-bit operations.
569       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
570         .minScalar(0, S32)
571         .scalarize(0)
572         .lower();
573     }
574 
575     // FIXME: DAG expansion gets better results. The widening uses the smaller
576     // range values and goes for the min/max lowering directly.
577     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
578       .minScalar(0, S32)
579       .scalarize(0)
580       .lower();
581   }
582 
583   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
584     .customFor({S32, S64})
585     .clampScalar(0, S32, S64)
586     .widenScalarToNextPow2(0, 32)
587     .scalarize(0);
588 
589   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
590     .legalFor({S32})
591     .clampScalar(0, S32, S32)
592     .scalarize(0);
593 
594   // Report legal for any types we can handle anywhere. For the cases only legal
595   // on the SALU, RegBankSelect will be able to re-legalize.
596   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
597     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
598     .clampScalar(0, S32, S64)
599     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
600     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
601     .widenScalarToNextPow2(0)
602     .scalarize(0);
603 
604   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
605                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
606     .legalFor({{S32, S1}, {S32, S32}})
607     .minScalar(0, S32)
608     // TODO: .scalarize(0)
609     .lower();
610 
611   getActionDefinitionsBuilder(G_BITCAST)
612     // Don't worry about the size constraint.
613     .legalIf(all(isRegisterType(0), isRegisterType(1)))
614     .lower();
615 
617   getActionDefinitionsBuilder(G_CONSTANT)
618     .legalFor({S1, S32, S64, S16, GlobalPtr,
619                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
620     .legalIf(isPointer(0))
621     .clampScalar(0, S32, S64)
622     .widenScalarToNextPow2(0);
623 
624   getActionDefinitionsBuilder(G_FCONSTANT)
625     .legalFor({S32, S64, S16})
626     .clampScalar(0, S16, S64);
627 
628   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
629       .legalIf(isRegisterType(0))
630       // s1 and s16 are special cases because they have legal operations on
631       // them, but don't really occupy registers in the normal way.
632       .legalFor({S1, S16})
633       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
634       .clampScalarOrElt(0, S32, MaxScalar)
635       .widenScalarToNextPow2(0, 32)
636       .clampMaxNumElements(0, S32, 16);
637 
638   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
639 
640   // If the amount is divergent, we have to do a wave reduction to get the
641   // maximum value, so this is expanded during RegBankSelect.
642   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
643     .legalFor({{PrivatePtr, S32}});
644 
645   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
646     .customIf(typeIsNot(0, PrivatePtr));
647 
648   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
649 
650   auto &FPOpActions = getActionDefinitionsBuilder(
651     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
652     .legalFor({S32, S64});
653   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
654     .customFor({S32, S64});
655   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
656     .customFor({S32, S64});
657 
658   if (ST.has16BitInsts()) {
659     if (ST.hasVOP3PInsts())
660       FPOpActions.legalFor({S16, V2S16});
661     else
662       FPOpActions.legalFor({S16});
663 
664     TrigActions.customFor({S16});
665     FDIVActions.customFor({S16});
666   }
667 
668   auto &MinNumMaxNum = getActionDefinitionsBuilder({
669       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
670 
671   if (ST.hasVOP3PInsts()) {
672     MinNumMaxNum.customFor(FPTypesPK16)
673       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
674       .clampMaxNumElements(0, S16, 2)
675       .clampScalar(0, S16, S64)
676       .scalarize(0);
677   } else if (ST.has16BitInsts()) {
678     MinNumMaxNum.customFor(FPTypes16)
679       .clampScalar(0, S16, S64)
680       .scalarize(0);
681   } else {
682     MinNumMaxNum.customFor(FPTypesBase)
683       .clampScalar(0, S32, S64)
684       .scalarize(0);
685   }
686 
687   if (ST.hasVOP3PInsts())
688     FPOpActions.clampMaxNumElements(0, S16, 2);
689 
690   FPOpActions
691     .scalarize(0)
692     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
693 
694   TrigActions
695     .scalarize(0)
696     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
697 
698   FDIVActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   getActionDefinitionsBuilder({G_FNEG, G_FABS})
703     .legalFor(FPTypesPK16)
704     .clampMaxNumElements(0, S16, 2)
705     .scalarize(0)
706     .clampScalar(0, S16, S64);
707 
708   if (ST.has16BitInsts()) {
709     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
710       .legalFor({S32, S64, S16})
711       .scalarize(0)
712       .clampScalar(0, S16, S64);
713   } else {
714     getActionDefinitionsBuilder(G_FSQRT)
715       .legalFor({S32, S64})
716       .scalarize(0)
717       .clampScalar(0, S32, S64);
718 
719     if (ST.hasFractBug()) {
720       getActionDefinitionsBuilder(G_FFLOOR)
721         .customFor({S64})
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     } else {
726       getActionDefinitionsBuilder(G_FFLOOR)
727         .legalFor({S32, S64})
728         .scalarize(0)
729         .clampScalar(0, S32, S64);
730     }
731   }
732 
733   getActionDefinitionsBuilder(G_FPTRUNC)
734     .legalFor({{S32, S64}, {S16, S32}})
735     .scalarize(0)
736     .lower();
737 
738   getActionDefinitionsBuilder(G_FPEXT)
739     .legalFor({{S64, S32}, {S32, S16}})
740     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
741     .scalarize(0);
742 
743   getActionDefinitionsBuilder(G_FSUB)
744       // Use actual fsub instruction
745       .legalFor({S32})
746       // Must use fadd + fneg
747       .lowerFor({S64, S16, V2S16})
748       .scalarize(0)
749       .clampScalar(0, S32, S64);
750 
751   // Whether this is legal depends on the floating point mode for the function.
752   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
753   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
754     FMad.customFor({S32, S16});
755   else if (ST.hasMadMacF32Insts())
756     FMad.customFor({S32});
757   else if (ST.hasMadF16())
758     FMad.customFor({S16});
759   FMad.scalarize(0)
760       .lower();
761 
762   auto &FRem = getActionDefinitionsBuilder(G_FREM);
763   if (ST.has16BitInsts()) {
764     FRem.customFor({S16, S32, S64});
765   } else {
766     FRem.minScalar(0, S32)
767         .customFor({S32, S64});
768   }
769   FRem.scalarize(0);
770 
771   // TODO: Do we need to clamp maximum bitwidth?
772   getActionDefinitionsBuilder(G_TRUNC)
773     .legalIf(isScalar(0))
774     .legalFor({{V2S16, V2S32}})
775     .clampMaxNumElements(0, S16, 2)
776     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
777     // situations (like an invalid implicit use), we don't want to infinite loop
778     // in the legalizer.
779     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
780     .alwaysLegal();
781 
782   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
783     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
784                {S32, S1}, {S64, S1}, {S16, S1}})
785     .scalarize(0)
786     .clampScalar(0, S32, S64)
787     .widenScalarToNextPow2(1, 32);
788 
789   // TODO: Split s1->s64 during regbankselect for VALU.
790   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
791     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
792     .lowerFor({{S32, S64}})
793     .lowerIf(typeIs(1, S1))
794     .customFor({{S64, S64}});
795   if (ST.has16BitInsts())
796     IToFP.legalFor({{S16, S16}});
797   IToFP.clampScalar(1, S32, S64)
798        .minScalar(0, S32)
799        .scalarize(0)
800        .widenScalarToNextPow2(1);
801 
802   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
803     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
804     .customFor({{S64, S64}})
805     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
806   if (ST.has16BitInsts())
807     FPToI.legalFor({{S16, S16}});
808   else
809     FPToI.minScalar(1, S32);
810 
811   FPToI.minScalar(0, S32)
812        .scalarize(0)
813        .lower();
814 
815   // Lower roundeven into G_FRINT
816   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
817     .scalarize(0)
818     .lower();
819 
820   if (ST.has16BitInsts()) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S16, S32, S64})
823       .clampScalar(0, S16, S64)
824       .scalarize(0);
825   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32, S64})
828       .clampScalar(0, S32, S64)
829       .scalarize(0);
830   } else {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S32})
833       .customFor({S64})
834       .clampScalar(0, S32, S64)
835       .scalarize(0);
836   }
837 
838   getActionDefinitionsBuilder(G_PTR_ADD)
839     .legalIf(all(isPointer(0), sameSize(0, 1)))
840     .scalarize(0)
841     .scalarSameSizeAs(1, 0);
842 
843   getActionDefinitionsBuilder(G_PTRMASK)
844     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
845     .scalarSameSizeAs(1, 0)
846     .scalarize(0);
847 
848   auto &CmpBuilder =
849     getActionDefinitionsBuilder(G_ICMP)
850     // The compare output type differs based on the register bank of the output,
851     // so make both s1 and s32 legal.
852     //
853     // Scalar compares producing output in scc will be promoted to s32, as that
854     // is the allocatable register type that will be needed for the copy from
855     // scc. This will be promoted during RegBankSelect, and we assume something
856     // before that won't try to use s32 result types.
857     //
858     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
859     // bank.
860     .legalForCartesianProduct(
861       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
862     .legalForCartesianProduct(
863       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
864   if (ST.has16BitInsts()) {
865     CmpBuilder.legalFor({{S1, S16}});
866   }
867 
868   CmpBuilder
869     .widenScalarToNextPow2(1)
870     .clampScalar(1, S32, S64)
871     .scalarize(0)
872     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
873 
874   getActionDefinitionsBuilder(G_FCMP)
875     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
876     .widenScalarToNextPow2(1)
877     .clampScalar(1, S32, S64)
878     .scalarize(0);
879 
880   // FIXME: fpow has a selection pattern that should move to custom lowering.
881   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
882   if (ST.has16BitInsts())
883     Exp2Ops.legalFor({S32, S16});
884   else
885     Exp2Ops.legalFor({S32});
886   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
887   Exp2Ops.scalarize(0);
888 
889   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
890   if (ST.has16BitInsts())
891     ExpOps.customFor({{S32}, {S16}});
892   else
893     ExpOps.customFor({S32});
894   ExpOps.clampScalar(0, MinScalarFPTy, S32)
895         .scalarize(0);
896 
897   getActionDefinitionsBuilder(G_FPOWI)
898     .clampScalar(0, MinScalarFPTy, S32)
899     .lower();
900 
901   // The 64-bit versions produce 32-bit results, but only on the SALU.
902   getActionDefinitionsBuilder(G_CTPOP)
903     .legalFor({{S32, S32}, {S32, S64}})
904     .clampScalar(0, S32, S32)
905     .clampScalar(1, S32, S64)
906     .scalarize(0)
907     .widenScalarToNextPow2(0, 32)
908     .widenScalarToNextPow2(1, 32);
909 
910   // The hardware instructions return a different result on 0 than the generic
911   // instructions expect. The hardware produces -1, but these produce the
912   // bitwidth.
913   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
914     .scalarize(0)
915     .clampScalar(0, S32, S32)
916     .clampScalar(1, S32, S64)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   getActionDefinitionsBuilder(G_BITREVERSE)
931     .legalFor({S32})
932     .clampScalar(0, S32, S32)
933     .scalarize(0);
934 
935   if (ST.has16BitInsts()) {
936     getActionDefinitionsBuilder(G_BSWAP)
937       .legalFor({S16, S32, V2S16})
938       .clampMaxNumElements(0, S16, 2)
939       // FIXME: Fixing non-power-of-2 before clamp is workaround for
940       // narrowScalar limitation.
941       .widenScalarToNextPow2(0)
942       .clampScalar(0, S16, S32)
943       .scalarize(0);
944 
945     if (ST.hasVOP3PInsts()) {
946       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
947         .legalFor({S32, S16, V2S16})
948         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
949         .clampMaxNumElements(0, S16, 2)
950         .minScalar(0, S16)
951         .widenScalarToNextPow2(0)
952         .scalarize(0)
953         .lower();
954     } else {
955       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
956         .legalFor({S32, S16})
957         .widenScalarToNextPow2(0)
958         .minScalar(0, S16)
959         .scalarize(0)
960         .lower();
961     }
962   } else {
963     // TODO: Should have same legality without v_perm_b32
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S32})
966       .lowerIf(scalarNarrowerThan(0, 32))
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .maxScalar(0, S32)
971       .scalarize(0)
972       .lower();
973 
974     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
975       .legalFor({S32})
976       .minScalar(0, S32)
977       .widenScalarToNextPow2(0)
978       .scalarize(0)
979       .lower();
980   }
981 
982   getActionDefinitionsBuilder(G_INTTOPTR)
983     // List the common cases
984     .legalForCartesianProduct(AddrSpaces64, {S64})
985     .legalForCartesianProduct(AddrSpaces32, {S32})
986     .scalarize(0)
987     // Accept any address space as long as the size matches
988     .legalIf(sameSize(0, 1))
989     .widenScalarIf(smallerThan(1, 0),
990       [](const LegalityQuery &Query) {
991         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
992       })
993     .narrowScalarIf(largerThan(1, 0),
994       [](const LegalityQuery &Query) {
995         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
996       });
997 
998   getActionDefinitionsBuilder(G_PTRTOINT)
999     // List the common cases
1000     .legalForCartesianProduct(AddrSpaces64, {S64})
1001     .legalForCartesianProduct(AddrSpaces32, {S32})
1002     .scalarize(0)
1003     // Accept any address space as long as the size matches
1004     .legalIf(sameSize(0, 1))
1005     .widenScalarIf(smallerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       })
1009     .narrowScalarIf(
1010       largerThan(0, 1),
1011       [](const LegalityQuery &Query) {
1012         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1013       });
1014 
1015   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1016     .scalarize(0)
1017     .custom();
1018 
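  // Whether a load/store must be split into multiple memory operations:
  // vector extloads, accesses wider than the address space limit, register
  // counts the hardware can't service directly (e.g. 3 dwords without
  // dwordx3 support), or accesses too misaligned for the subtarget.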
1019   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1020                                     bool IsLoad) -> bool {
1021     const LLT DstTy = Query.Types[0];
1022 
1023     // Split vector extloads.
1024     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1025     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1026 
1027     if (MemSize < DstTy.getSizeInBits())
1028       MemSize = std::max(MemSize, AlignBits);
1029 
1030     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1031       return true;
1032 
1033     const LLT PtrTy = Query.Types[1];
1034     unsigned AS = PtrTy.getAddressSpace();
1035     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1036       return true;
1037 
1038     // Catch weird sized loads that don't evenly divide into the access sizes
1039     // TODO: May be able to widen depending on alignment etc.
1040     unsigned NumRegs = (MemSize + 31) / 32;
1041     if (NumRegs == 3) {
1042       if (!ST.hasDwordx3LoadStores())
1043         return true;
1044     } else {
1045       // If the alignment allows, these should have been widened.
1046       if (!isPowerOf2_32(NumRegs))
1047         return true;
1048     }
1049 
1050     if (AlignBits < MemSize) {
1051       const SITargetLowering *TLI = ST.getTargetLowering();
1052       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1053                                                       Align(AlignBits / 8));
1054     }
1055 
1056     return false;
1057   };
1058 
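  // If the subtarget supports unaligned buffer access, don't impose an
  // alignment requirement (0) on global/constant accesses; otherwise require
  // natural alignment.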
1059   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1060   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1061   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1062 
1063   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1064   // LDS
1065   // TODO: Unsupported flat for SI.
1066 
1067   for (unsigned Op : {G_LOAD, G_STORE}) {
1068     const bool IsStore = Op == G_STORE;
1069 
1070     auto &Actions = getActionDefinitionsBuilder(Op);
1071     // Explicitly list some common cases.
1072     // TODO: Does this help compile time at all?
1073     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1074                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1075                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1076                                       {S64, GlobalPtr, 64, GlobalAlign32},
1077                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1078                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1079                                       {S32, GlobalPtr, 8, GlobalAlign8},
1080                                       {S32, GlobalPtr, 16, GlobalAlign16},
1081 
1082                                       {S32, LocalPtr, 32, 32},
1083                                       {S64, LocalPtr, 64, 32},
1084                                       {V2S32, LocalPtr, 64, 32},
1085                                       {S32, LocalPtr, 8, 8},
1086                                       {S32, LocalPtr, 16, 16},
1087                                       {V2S16, LocalPtr, 32, 32},
1088 
1089                                       {S32, PrivatePtr, 32, 32},
1090                                       {S32, PrivatePtr, 8, 8},
1091                                       {S32, PrivatePtr, 16, 16},
1092                                       {V2S16, PrivatePtr, 32, 32},
1093 
1094                                       {S32, ConstantPtr, 32, GlobalAlign32},
1095                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1096                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1097                                       {S64, ConstantPtr, 64, GlobalAlign32},
1098                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1099     Actions.legalIf(
1100       [=](const LegalityQuery &Query) -> bool {
1101         return isLoadStoreLegal(ST, Query, Op);
1102       });
1103 
1104     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1105     // 64-bits.
1106     //
1107     // TODO: Should generalize bitcast action into coerce, which will also cover
1108     // inserting addrspacecasts.
1109     Actions.customIf(typeIs(1, Constant32Ptr));
1110 
1111     // Turn any illegal element vectors into something easier to deal
1112     // with. These will ultimately produce 32-bit scalar shifts to extract the
1113     // parts anyway.
1114     //
1115     // For odd 16-bit element vectors, prefer to split those into pieces with
1116     // 16-bit vector parts.
1117     Actions.bitcastIf(
1118       [=](const LegalityQuery &Query) -> bool {
1119         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1120                                           Query.MMODescrs[0].SizeInBits);
1121       }, bitcastToRegisterType(0));
1122 
1123     if (!IsStore) {
1124       // Widen suitably aligned loads by loading extra bytes. The standard
1125       // legalization actions can't properly express widening memory operands.
1126       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1127         return shouldWidenLoad(ST, Query, G_LOAD);
1128       });
1129     }
1130 
1131     // FIXME: load/store narrowing should be moved to lower action
1132     Actions
1133         .narrowScalarIf(
1134             [=](const LegalityQuery &Query) -> bool {
1135               return !Query.Types[0].isVector() &&
1136                      needToSplitMemOp(Query, Op == G_LOAD);
1137             },
1138             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139               const LLT DstTy = Query.Types[0];
1140               const LLT PtrTy = Query.Types[1];
1141 
1142               const unsigned DstSize = DstTy.getSizeInBits();
1143               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1144 
1145               // Split extloads.
1146               if (DstSize > MemSize)
1147                 return std::make_pair(0, LLT::scalar(MemSize));
1148 
1149               if (!isPowerOf2_32(DstSize)) {
1150                 // We're probably decomposing an odd sized store. Try to split
1151                 // to the widest type. TODO: Account for alignment. As-is it
1152                 // should be OK, since the new parts will be further legalized.
1153                 unsigned FloorSize = PowerOf2Floor(DstSize);
1154                 return std::make_pair(0, LLT::scalar(FloorSize));
1155               }
1156 
1157               if (DstSize > 32 && (DstSize % 32 != 0)) {
1158                 // FIXME: Need a way to specify non-extload of larger size if
1159                 // suitably aligned.
1160                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1161               }
1162 
1163               unsigned MaxSize = maxSizeForAddrSpace(ST,
1164                                                      PtrTy.getAddressSpace(),
1165                                                      Op == G_LOAD);
1166               if (MemSize > MaxSize)
1167                 return std::make_pair(0, LLT::scalar(MaxSize));
1168 
1169               unsigned Align = Query.MMODescrs[0].AlignInBits;
1170               return std::make_pair(0, LLT::scalar(Align));
1171             })
1172         .fewerElementsIf(
1173             [=](const LegalityQuery &Query) -> bool {
1174               return Query.Types[0].isVector() &&
1175                      needToSplitMemOp(Query, Op == G_LOAD);
1176             },
1177             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1178               const LLT DstTy = Query.Types[0];
1179               const LLT PtrTy = Query.Types[1];
1180 
1181               LLT EltTy = DstTy.getElementType();
1182               unsigned MaxSize = maxSizeForAddrSpace(ST,
1183                                                      PtrTy.getAddressSpace(),
1184                                                      Op == G_LOAD);
1185 
1186               // FIXME: Handle widened to power of 2 results better. This ends
1187               // up scalarizing.
1188               // FIXME: 3 element stores scalarized on SI
1189 
1190               // Split if it's too large for the address space.
1191               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1192                 unsigned NumElts = DstTy.getNumElements();
1193                 unsigned EltSize = EltTy.getSizeInBits();
1194 
1195                 if (MaxSize % EltSize == 0) {
1196                   return std::make_pair(
1197                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1198                 }
1199 
1200                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1201 
1202                 // FIXME: Refine when odd breakdowns handled
1203                 // The scalars will need to be re-legalized.
1204                 if (NumPieces == 1 || NumPieces >= NumElts ||
1205                     NumElts % NumPieces != 0)
1206                   return std::make_pair(0, EltTy);
1207 
1208                 return std::make_pair(0,
1209                                       LLT::vector(NumElts / NumPieces, EltTy));
1210               }
1211 
1212               // FIXME: We could probably handle weird extending loads better.
1213               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1214               if (DstTy.getSizeInBits() > MemSize)
1215                 return std::make_pair(0, EltTy);
1216 
1217               unsigned EltSize = EltTy.getSizeInBits();
1218               unsigned DstSize = DstTy.getSizeInBits();
1219               if (!isPowerOf2_32(DstSize)) {
1220                 // We're probably decomposing an odd sized store. Try to split
1221                 // to the widest type. TODO: Account for alignment. As-is it
1222                 // should be OK, since the new parts will be further legalized.
1223                 unsigned FloorSize = PowerOf2Floor(DstSize);
1224                 return std::make_pair(
1225                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1226               }
1227 
1228               // Need to split because of alignment.
1229               unsigned Align = Query.MMODescrs[0].AlignInBits;
1230               if (EltSize > Align &&
1231                   (EltSize / Align < DstTy.getNumElements())) {
1232                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1233               }
1234 
1235               // May need relegalization for the scalars.
1236               return std::make_pair(0, EltTy);
1237             })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);
1240 
1241     if (IsStore)
1242       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1243 
1244     Actions
1245         .widenScalarToNextPow2(0)
1246         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1247         .lower();
1248   }
1249 
1250   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1251                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1252                                                   {S32, GlobalPtr, 16, 2 * 8},
1253                                                   {S32, LocalPtr, 8, 8},
1254                                                   {S32, LocalPtr, 16, 16},
1255                                                   {S32, PrivatePtr, 8, 8},
1256                                                   {S32, PrivatePtr, 16, 16},
1257                                                   {S32, ConstantPtr, 8, 8},
1258                                                   {S32, ConstantPtr, 16, 2 * 8}});
1259   if (ST.hasFlatAddressSpace()) {
1260     ExtLoads.legalForTypesWithMemDesc(
1261         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1262   }
1263 
1264   ExtLoads.clampScalar(0, S32, S32)
1265           .widenScalarToNextPow2(0)
1266           .unsupportedIfMemSizeNotPow2()
1267           .lower();
1268 
1269   auto &Atomics = getActionDefinitionsBuilder(
1270     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1271      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1272      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1273      G_ATOMICRMW_UMIN})
1274     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1275                {S64, GlobalPtr}, {S64, LocalPtr},
1276                {S32, RegionPtr}, {S64, RegionPtr}});
1277   if (ST.hasFlatAddressSpace()) {
1278     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1279   }
1280 
1281   if (ST.hasLDSFPAtomics()) {
1282     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1283       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1284   }
1285 
1286   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1287   // demarshalling
1288   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1289     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1290                 {S32, FlatPtr}, {S64, FlatPtr}})
1291     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1292                {S32, RegionPtr}, {S64, RegionPtr}});
1293   // TODO: Pointer types, any 32-bit or 64-bit vector
1294 
1295   // Condition should be s32 for scalar, s1 for vector.
1296   getActionDefinitionsBuilder(G_SELECT)
1297     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1298           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1299           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1300     .clampScalar(0, S16, S64)
1301     .scalarize(1)
1302     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1303     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1304     .clampMaxNumElements(0, S32, 2)
1305     .clampMaxNumElements(0, LocalPtr, 2)
1306     .clampMaxNumElements(0, PrivatePtr, 2)
1307     .scalarize(0)
1308     .widenScalarToNextPow2(0)
1309     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1310 
1311   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1312   // be more flexible with the shift amount type.
1313   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1314     .legalFor({{S32, S32}, {S64, S32}});
1315   if (ST.has16BitInsts()) {
1316     if (ST.hasVOP3PInsts()) {
1317       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1318             .clampMaxNumElements(0, S16, 2);
1319     } else
1320       Shifts.legalFor({{S16, S16}});
1321 
1322     // TODO: Support 16-bit shift amounts for all types
1323     Shifts.widenScalarIf(
1324       [=](const LegalityQuery &Query) {
1325         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1326         // 32-bit amount.
1327         const LLT ValTy = Query.Types[0];
1328         const LLT AmountTy = Query.Types[1];
1329         return ValTy.getSizeInBits() <= 16 &&
1330                AmountTy.getSizeInBits() < 16;
1331       }, changeTo(1, S16));
1332     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1333     Shifts.clampScalar(1, S32, S32);
1334     Shifts.clampScalar(0, S16, S64);
1335     Shifts.widenScalarToNextPow2(0, 16);
1336 
1337     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1338       .minScalar(0, S16)
1339       .scalarize(0)
1340       .lower();
1341   } else {
1342     // Make sure we legalize the shift amount type first, as the general
1343     // expansion for the shifted type will produce much worse code if it hasn't
1344     // been truncated already.
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S32, S64);
1347     Shifts.widenScalarToNextPow2(0, 32);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S32)
1351       .scalarize(0)
1352       .lower();
1353   }
1354   Shifts.scalarize(0);
1355 
1356   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1357     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1358     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1359     unsigned IdxTypeIdx = 2;
1360 
1361     getActionDefinitionsBuilder(Op)
1362       .customIf([=](const LegalityQuery &Query) {
1363           const LLT EltTy = Query.Types[EltTypeIdx];
1364           const LLT VecTy = Query.Types[VecTypeIdx];
1365           const LLT IdxTy = Query.Types[IdxTypeIdx];
1366           const unsigned EltSize = EltTy.getSizeInBits();
1367           return (EltSize == 32 || EltSize == 64) &&
1368                   VecTy.getSizeInBits() % 32 == 0 &&
1369                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1370                   IdxTy.getSizeInBits() == 32;
1371         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1374       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1375       .bitcastIf(
1376         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1377         [=](const LegalityQuery &Query) {
1378           // For > 64-bit element types, try to turn this into a 64-bit
1379           // element vector since we may be able to do better indexing
1380           // if this is scalar. If not, fall back to 32.
1381           const LLT EltTy = Query.Types[EltTypeIdx];
1382           const LLT VecTy = Query.Types[VecTypeIdx];
1383           const unsigned DstEltSize = EltTy.getSizeInBits();
1384           const unsigned VecSize = VecTy.getSizeInBits();
1385 
1386           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1387           return std::make_pair(
1388             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1389         })
1390       .clampScalar(EltTypeIdx, S32, S64)
1391       .clampScalar(VecTypeIdx, S32, S64)
1392       .clampScalar(IdxTypeIdx, S32, S32)
1393       .clampMaxNumElements(VecTypeIdx, S32, 32)
1394       // TODO: Clamp elements for 64-bit vectors?
1395       // It should only be necessary with variable indexes.
1396       // As a last resort, lower to the stack
1397       .lower();
1398   }
1399 
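  // A plain extract must produce exactly the vector's element type; any other
  // result type is unsupported.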
1400   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1401     .unsupportedIf([=](const LegalityQuery &Query) {
1402         const LLT &EltTy = Query.Types[1].getElementType();
1403         return Query.Types[0] != EltTy;
1404       });
1405 
1406   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1407     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1408     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1409 
1410     // FIXME: Doesn't handle extract of illegal sizes.
1411     getActionDefinitionsBuilder(Op)
1412       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1413       // FIXME: Multiples of 16 should not be legal.
1414       .legalIf([=](const LegalityQuery &Query) {
1415           const LLT BigTy = Query.Types[BigTyIdx];
1416           const LLT LitTy = Query.Types[LitTyIdx];
1417           return (BigTy.getSizeInBits() % 32 == 0) &&
1418                  (LitTy.getSizeInBits() % 16 == 0);
1419         })
1420       .widenScalarIf(
1421         [=](const LegalityQuery &Query) {
1422           const LLT BigTy = Query.Types[BigTyIdx];
1423           return (BigTy.getScalarSizeInBits() < 16);
1424         },
1425         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1426       .widenScalarIf(
1427         [=](const LegalityQuery &Query) {
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (LitTy.getScalarSizeInBits() < 16);
1430         },
1431         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1432       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1433       .widenScalarToNextPow2(BigTyIdx, 32);
1434 
1435   }
1436 
1437   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1438     .legalForCartesianProduct(AllS32Vectors, {S32})
1439     .legalForCartesianProduct(AllS64Vectors, {S64})
1440     .clampNumElements(0, V16S32, V32S32)
1441     .clampNumElements(0, V2S64, V16S64)
1442     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1443 
1444   if (ST.hasScalarPackInsts()) {
1445     BuildVector
1446       // FIXME: Should probably widen s1 vectors straight to s32
1447       .minScalarOrElt(0, S16)
1448       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1449       .minScalar(1, S32);
1450 
1451     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1452       .legalFor({V2S16, S32})
1453       .lower();
1454     BuildVector.minScalarOrElt(0, S32);
1455   } else {
1456     BuildVector.customFor({V2S16, S16});
1457     BuildVector.minScalarOrElt(0, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .customFor({V2S16, S32})
1461       .lower();
1462   }
1463 
1464   BuildVector.legalIf(isRegisterType(0));
1465 
1466   // FIXME: Clamp maximum size
1467   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1468     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1469     .clampMaxNumElements(0, S32, 32)
1470     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1471     .clampMaxNumElements(0, S16, 64);
1472 
1473   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1474   // pre-legalization.
1475   if (ST.hasVOP3PInsts()) {
1476     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1477       .customFor({V2S16, V2S16})
1478       .lower();
1479   } else
1480     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1481 
1482   // Merge/Unmerge
1483   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1484     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1485     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1486 
1487     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1488       const LLT Ty = Query.Types[TypeIdx];
1489       if (Ty.isVector()) {
1490         const LLT &EltTy = Ty.getElementType();
1491         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1492           return true;
1493         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1494           return true;
1495       }
1496       return false;
1497     };
1498 
1499     auto &Builder = getActionDefinitionsBuilder(Op)
1500       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1501       .lowerFor({{S16, V2S16}})
1502       .lowerIf([=](const LegalityQuery &Query) {
1503           const LLT BigTy = Query.Types[BigTyIdx];
1504           return BigTy.getSizeInBits() == 32;
1505         })
1506       // Try to widen to s16 first for small types.
1507       // TODO: Only do this on targets with legal s16 shifts
1508       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1509       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1510       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1511       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1512                            elementTypeIs(1, S16)),
1513                        changeTo(1, V2S16))
1514       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1515       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1516       // valid.
1517       .clampScalar(LitTyIdx, S32, S512)
1518       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1519       // Break up vectors with weird elements into scalars
1520       .fewerElementsIf(
1521         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1522         scalarize(0))
1523       .fewerElementsIf(
1524         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1525         scalarize(1))
1526       .clampScalar(BigTyIdx, S32, MaxScalar);
1527 
1528     if (Op == G_MERGE_VALUES) {
1529       Builder.widenScalarIf(
1530         // TODO: Use 16-bit shifts if legal for 8-bit values?
1531         [=](const LegalityQuery &Query) {
1532           const LLT Ty = Query.Types[LitTyIdx];
1533           return Ty.getSizeInBits() < 32;
1534         },
1535         changeTo(LitTyIdx, S32));
1536     }
1537 
1538     Builder.widenScalarIf(
1539       [=](const LegalityQuery &Query) {
1540         const LLT Ty = Query.Types[BigTyIdx];
1541         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1542           Ty.getSizeInBits() % 16 != 0;
1543       },
1544       [=](const LegalityQuery &Query) {
1545         // Pick the next power of 2, or a multiple of 64 over 128.
1546         // Whichever is smaller.
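        // e.g. an s65 big type is widened to s128, while s257 is widened to
        // s320 rather than s512.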
1547         const LLT &Ty = Query.Types[BigTyIdx];
1548         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1549         if (NewSizeInBits >= 256) {
1550           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1551           if (RoundedTo < NewSizeInBits)
1552             NewSizeInBits = RoundedTo;
1553         }
1554         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1555       })
1556       // Any vectors left are the wrong size. Scalarize them.
1557       .scalarize(0)
1558       .scalarize(1);
1559   }
1560 
1561   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1562   // RegBankSelect.
1563   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1564     .legalFor({{S32}, {S64}});
1565 
1566   if (ST.hasVOP3PInsts()) {
1567     SextInReg.lowerFor({{V2S16}})
1568       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1569       // get more vector shift opportunities, since we'll get those when
1570       // expanded.
1571       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1572   } else if (ST.has16BitInsts()) {
1573     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1574   } else {
1575     // Prefer to promote to s32 before lowering if we don't have 16-bit
1576     // shifts. This avoids a lot of intermediate truncate and extend operations.
1577     SextInReg.lowerFor({{S32}, {S64}});
1578   }
1579 
1580   SextInReg
1581     .scalarize(0)
1582     .clampScalar(0, S32, S64)
1583     .lower();
1584 
1585   getActionDefinitionsBuilder(G_FSHR)
1586     .legalFor({{S32, S32}})
1587     .scalarize(0)
1588     .lower();
1589 
1590   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1591     .legalFor({S64});
1592 
1593   getActionDefinitionsBuilder(G_FENCE)
1594     .alwaysLegal();
1595 
1596   getActionDefinitionsBuilder({
1597       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1598       G_FCOPYSIGN,
1599 
1600       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1601       G_ATOMICRMW_NAND,
1602       G_ATOMICRMW_FSUB,
1603       G_READ_REGISTER,
1604       G_WRITE_REGISTER,
1605 
1606       G_SADDO, G_SSUBO,
1607 
1608       // TODO: Implement
1609       G_FMINIMUM, G_FMAXIMUM,
1610       G_FSHL
1611     }).lower();
1612 
1613   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1614         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1615         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1616     .unsupported();
1617 
1618   computeTables();
1619   verify(*ST.getInstrInfo());
1620 }
1621 
1622 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1623                                          MachineInstr &MI) const {
1624   MachineIRBuilder &B = Helper.MIRBuilder;
1625   MachineRegisterInfo &MRI = *B.getMRI();
1626 
1627   switch (MI.getOpcode()) {
1628   case TargetOpcode::G_ADDRSPACE_CAST:
1629     return legalizeAddrSpaceCast(MI, MRI, B);
1630   case TargetOpcode::G_FRINT:
1631     return legalizeFrint(MI, MRI, B);
1632   case TargetOpcode::G_FCEIL:
1633     return legalizeFceil(MI, MRI, B);
1634   case TargetOpcode::G_FREM:
1635     return legalizeFrem(MI, MRI, B);
1636   case TargetOpcode::G_INTRINSIC_TRUNC:
1637     return legalizeIntrinsicTrunc(MI, MRI, B);
1638   case TargetOpcode::G_SITOFP:
1639     return legalizeITOFP(MI, MRI, B, true);
1640   case TargetOpcode::G_UITOFP:
1641     return legalizeITOFP(MI, MRI, B, false);
1642   case TargetOpcode::G_FPTOSI:
1643     return legalizeFPTOI(MI, MRI, B, true);
1644   case TargetOpcode::G_FPTOUI:
1645     return legalizeFPTOI(MI, MRI, B, false);
1646   case TargetOpcode::G_FMINNUM:
1647   case TargetOpcode::G_FMAXNUM:
1648   case TargetOpcode::G_FMINNUM_IEEE:
1649   case TargetOpcode::G_FMAXNUM_IEEE:
1650     return legalizeMinNumMaxNum(Helper, MI);
1651   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1652     return legalizeExtractVectorElt(MI, MRI, B);
1653   case TargetOpcode::G_INSERT_VECTOR_ELT:
1654     return legalizeInsertVectorElt(MI, MRI, B);
1655   case TargetOpcode::G_SHUFFLE_VECTOR:
1656     return legalizeShuffleVector(MI, MRI, B);
1657   case TargetOpcode::G_FSIN:
1658   case TargetOpcode::G_FCOS:
1659     return legalizeSinCos(MI, MRI, B);
1660   case TargetOpcode::G_GLOBAL_VALUE:
1661     return legalizeGlobalValue(MI, MRI, B);
1662   case TargetOpcode::G_LOAD:
1663     return legalizeLoad(Helper, MI);
1664   case TargetOpcode::G_FMAD:
1665     return legalizeFMad(MI, MRI, B);
1666   case TargetOpcode::G_FDIV:
1667     return legalizeFDIV(MI, MRI, B);
1668   case TargetOpcode::G_UDIV:
1669   case TargetOpcode::G_UREM:
1670     return legalizeUDIV_UREM(MI, MRI, B);
1671   case TargetOpcode::G_SDIV:
1672   case TargetOpcode::G_SREM:
1673     return legalizeSDIV_SREM(MI, MRI, B);
1674   case TargetOpcode::G_ATOMIC_CMPXCHG:
1675     return legalizeAtomicCmpXChg(MI, MRI, B);
1676   case TargetOpcode::G_FLOG:
1677     return legalizeFlog(MI, B, numbers::ln2f);
1678   case TargetOpcode::G_FLOG10:
1679     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1680   case TargetOpcode::G_FEXP:
1681     return legalizeFExp(MI, B);
1682   case TargetOpcode::G_FPOW:
1683     return legalizeFPow(MI, B);
1684   case TargetOpcode::G_FFLOOR:
1685     return legalizeFFloor(MI, MRI, B);
1686   case TargetOpcode::G_BUILD_VECTOR:
1687     return legalizeBuildVector(MI, MRI, B);
1688   default:
1689     return false;
1690   }
1691 
1692   llvm_unreachable("expected switch to return");
1693 }
1694 
1695 Register AMDGPULegalizerInfo::getSegmentAperture(
1696   unsigned AS,
1697   MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   MachineFunction &MF = B.getMF();
1700   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1701   const LLT S32 = LLT::scalar(32);
1702 
1703   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1704 
1705   if (ST.hasApertureRegs()) {
1706     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1707     // getreg.
1708     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1709         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1710         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1711     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1712         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1713         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1714     unsigned Encoding =
1715         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1716         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1717         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1718 
1719     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720 
1721     B.buildInstr(AMDGPU::S_GETREG_B32)
1722       .addDef(GetReg)
1723       .addImm(Encoding);
1724     MRI.setType(GetReg, S32);
1725 
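    // The field read by s_getreg holds the aperture base shifted right by the
    // field width, so shift it back up to recover the high 32 bits of the
    // 64-bit aperture address.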
1726     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1727     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1728   }
1729 
1730   Register QueuePtr = MRI.createGenericVirtualRegister(
1731     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1732 
1733   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1734     return Register();
1735 
1736   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1737   // private_segment_aperture_base_hi.
1738   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1739 
1740   // TODO: can we be smarter about machine pointer info?
1741   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1742   MachineMemOperand *MMO = MF.getMachineMemOperand(
1743       PtrInfo,
1744       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1745           MachineMemOperand::MOInvariant,
1746       4, commonAlignment(Align(64), StructOffset));
1747 
1748   Register LoadAddr;
1749 
1750   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1751   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1752 }
1753 
1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1755   MachineInstr &MI, MachineRegisterInfo &MRI,
1756   MachineIRBuilder &B) const {
1757   MachineFunction &MF = B.getMF();
1758 
1759   const LLT S32 = LLT::scalar(32);
1760   Register Dst = MI.getOperand(0).getReg();
1761   Register Src = MI.getOperand(1).getReg();
1762 
1763   LLT DstTy = MRI.getType(Dst);
1764   LLT SrcTy = MRI.getType(Src);
1765   unsigned DestAS = DstTy.getAddressSpace();
1766   unsigned SrcAS = SrcTy.getAddressSpace();
1767 
1768   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1769   // vector element.
1770   assert(!DstTy.isVector());
1771 
1772   const AMDGPUTargetMachine &TM
1773     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1774 
1775   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1776     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1777     return true;
1778   }
1779 
1780   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1781     // Truncate.
1782     B.buildExtract(Dst, Src, 0);
1783     MI.eraseFromParent();
1784     return true;
1785   }
1786 
1787   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1789     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1790 
1791     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to form
1792     // another pointer. Merge operands are required to be the same type, but
1793     // creating an extra ptrtoint would be kind of pointless.
1794     auto HighAddr = B.buildConstant(
1795       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1796     B.buildMerge(Dst, {Src, HighAddr});
1797     MI.eraseFromParent();
1798     return true;
1799   }
1800 
1801   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1802     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1803            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1804     unsigned NullVal = TM.getNullPointerValue(DestAS);
1805 
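    // A flat pointer in these segments keeps its low 32 bits as the segment
    // offset, except that the flat null value must map to the segment null
    // value.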
1806     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1807     auto FlatNull = B.buildConstant(SrcTy, 0);
1808 
1809     // Extract low 32-bits of the pointer.
1810     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1811 
1812     auto CmpRes =
1813         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1814     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1815 
1816     MI.eraseFromParent();
1817     return true;
1818   }
1819 
1820   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1821     return false;
1822 
1823   if (!ST.hasFlatAddressSpace())
1824     return false;
1825 
1826   auto SegmentNull =
1827       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1828   auto FlatNull =
1829       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1830 
1831   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1832   if (!ApertureReg.isValid())
1833     return false;
1834 
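  // The flat address is formed by placing the segment aperture in the high 32
  // bits above the 32-bit segment offset, except that the segment null value
  // must map to the flat null value.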
1835   auto CmpRes =
1836       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1837 
1838   // Coerce the type of the low half of the result so we can use merge_values.
1839   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1840 
1841   // TODO: Should we allow mismatched types but matching sizes in merges to
1842   // avoid the ptrtoint?
1843   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1844   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1845 
1846   MI.eraseFromParent();
1847   return true;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeFrint(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   Register Src = MI.getOperand(1).getReg();
1854   LLT Ty = MRI.getType(Src);
1855   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1856 
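  // Adding and then subtracting 2^52 with the sign of the source forces a
  // round-to-integer in double precision; sources with a magnitude of at least
  // 2^52 have no fractional bits and are passed through by the final select.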
1857   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1858   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1859 
1860   auto C1 = B.buildFConstant(Ty, C1Val);
1861   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1862 
1863   // TODO: Should this propagate fast-math-flags?
1864   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1865   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1866 
1867   auto C2 = B.buildFConstant(Ty, C2Val);
1868   auto Fabs = B.buildFAbs(Ty, Src);
1869 
1870   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1871   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1872   MI.eraseFromParent();
1873   return true;
1874 }
1875 
1876 bool AMDGPULegalizerInfo::legalizeFceil(
1877   MachineInstr &MI, MachineRegisterInfo &MRI,
1878   MachineIRBuilder &B) const {
1879 
1880   const LLT S1 = LLT::scalar(1);
1881   const LLT S64 = LLT::scalar(64);
1882 
1883   Register Src = MI.getOperand(1).getReg();
1884   assert(MRI.getType(Src) == S64);
1885 
1886   // result = trunc(src)
1887   // if (src > 0.0 && src != result)
1888   //   result += 1.0
1889 
1890   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1891 
1892   const auto Zero = B.buildFConstant(S64, 0.0);
1893   const auto One = B.buildFConstant(S64, 1.0);
1894   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1895   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1896   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1897   auto Add = B.buildSelect(S64, And, One, Zero);
1898 
1899   // TODO: Should this propagate fast-math-flags?
1900   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::legalizeFrem(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B) const {
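  // frem(x, y) is expanded as x - trunc(x / y) * y, built here as
  // fma(-trunc(x / y), y, x).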
1907   Register DstReg = MI.getOperand(0).getReg();
1908   Register Src0Reg = MI.getOperand(1).getReg();
1909   Register Src1Reg = MI.getOperand(2).getReg();
1910   auto Flags = MI.getFlags();
1911   LLT Ty = MRI.getType(DstReg);
1912 
1913   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1914   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1915   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1916   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1917   MI.eraseFromParent();
1918   return true;
1919 }
1920 
1921 static MachineInstrBuilder extractF64Exponent(Register Hi,
1922                                               MachineIRBuilder &B) {
1923   const unsigned FractBits = 52;
1924   const unsigned ExpBits = 11;
1925   LLT S32 = LLT::scalar(32);
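  // The biased exponent occupies bits [62:52] of the double, i.e. an 11-bit
  // field starting at bit 20 of the high word; extract it with ubfe and
  // subtract the exponent bias of 1023.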
1926 
1927   auto Const0 = B.buildConstant(S32, FractBits - 32);
1928   auto Const1 = B.buildConstant(S32, ExpBits);
1929 
1930   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1931     .addUse(Hi)
1932     .addUse(Const0.getReg(0))
1933     .addUse(Const1.getReg(0));
1934 
1935   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1936 }
1937 
1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1939   MachineInstr &MI, MachineRegisterInfo &MRI,
1940   MachineIRBuilder &B) const {
1941   const LLT S1 = LLT::scalar(1);
1942   const LLT S32 = LLT::scalar(32);
1943   const LLT S64 = LLT::scalar(64);
1944 
1945   Register Src = MI.getOperand(1).getReg();
1946   assert(MRI.getType(Src) == S64);
1947 
1948   // TODO: Should this use extract since the low half is unused?
1949   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1950   Register Hi = Unmerge.getReg(1);
1951 
1952   // Extract the upper half, since this is where we will find the sign and
1953   // exponent.
1954   auto Exp = extractF64Exponent(Hi, B);
1955 
1956   const unsigned FractBits = 52;
1957 
1958   // Extract the sign bit.
1959   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1960   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1961 
1962   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1963 
1964   const auto Zero32 = B.buildConstant(S32, 0);
1965 
1966   // Extend back to 64-bits.
1967   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1968 
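  // Shifting the 52-bit fraction mask right by the unbiased exponent and
  // inverting it gives a mask of the bits that survive truncation. Exponents
  // below zero truncate to a signed zero, and exponents above 51 mean the
  // value is already an integer; both are handled by the selects below.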
1969   auto Shr = B.buildAShr(S64, FractMask, Exp);
1970   auto Not = B.buildNot(S64, Shr);
1971   auto Tmp0 = B.buildAnd(S64, Src, Not);
1972   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1973 
1974   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1975   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1976 
1977   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1978   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1979   MI.eraseFromParent();
1980   return true;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeITOFP(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B, bool Signed) const {
1986 
1987   Register Dst = MI.getOperand(0).getReg();
1988   Register Src = MI.getOperand(1).getReg();
1989 
1990   const LLT S64 = LLT::scalar(64);
1991   const LLT S32 = LLT::scalar(32);
1992 
1993   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1994 
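  // Convert the two 32-bit halves separately and recombine them as
  // fp(hi) * 2^32 + fp(lo); only the high half is converted as signed for
  // G_SITOFP.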
1995   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1996 
1997   auto CvtHi = Signed ?
1998     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1999     B.buildUITOFP(S64, Unmerge.getReg(1));
2000 
2001   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2002 
2003   auto ThirtyTwo = B.buildConstant(S32, 32);
2004   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2005     .addUse(CvtHi.getReg(0))
2006     .addUse(ThirtyTwo.getReg(0));
2007 
2008   // TODO: Should this propagate fast-math-flags?
2009   B.buildFAdd(Dst, LdExp, CvtLo);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 // TODO: Copied from DAG implementation. Verify logic and document how this
2015 // actually works.
2016 bool AMDGPULegalizerInfo::legalizeFPTOI(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   MachineIRBuilder &B, bool Signed) const {
2019 
2020   Register Dst = MI.getOperand(0).getReg();
2021   Register Src = MI.getOperand(1).getReg();
2022 
2023   const LLT S64 = LLT::scalar(64);
2024   const LLT S32 = LLT::scalar(32);
2025 
2026   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2027 
2028   unsigned Flags = MI.getFlags();
2029 
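  // K0 == 2^-32 and K1 == -(2^32): the high result word is
  // floor(trunc(x) * 2^-32) and the low word is the remainder
  // trunc(x) - hi * 2^32, each converted to an integer separately.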
2030   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2031   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2032   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2033 
2034   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2035   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2036   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2037 
2038   auto Hi = Signed ?
2039     B.buildFPTOSI(S32, FloorMul) :
2040     B.buildFPTOUI(S32, FloorMul);
2041   auto Lo = B.buildFPTOUI(S32, Fma);
2042 
2043   B.buildMerge(Dst, { Lo, Hi });
2044   MI.eraseFromParent();
2045 
2046   return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2050                                                MachineInstr &MI) const {
2051   MachineFunction &MF = Helper.MIRBuilder.getMF();
2052   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2053 
2054   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2055                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2056 
2057   // With ieee_mode disabled, the instructions have the correct behavior
2058   // already for G_FMINNUM/G_FMAXNUM
2059   if (!MFI->getMode().IEEE)
2060     return !IsIEEEOp;
2061 
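  // In IEEE mode the plain G_FMINNUM/G_FMAXNUM need the helper lowering,
  // which quiets signaling NaNs and emits the IEEE variants.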
2062   if (IsIEEEOp)
2063     return true;
2064 
2065   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2066 }
2067 
2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2069   MachineInstr &MI, MachineRegisterInfo &MRI,
2070   MachineIRBuilder &B) const {
2071   // TODO: Should move some of this into LegalizerHelper.
2072 
2073   // TODO: Promote dynamic indexing of s16 to s32
2074 
2075   // FIXME: Artifact combiner probably should have replaced the truncated
2076   // constant before this, so we shouldn't need
2077   // getConstantVRegValWithLookThrough.
2078   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2079     MI.getOperand(2).getReg(), MRI);
2080   if (!IdxVal) // Dynamic case will be selected to register indexing.
2081     return true;
2082 
2083   Register Dst = MI.getOperand(0).getReg();
2084   Register Vec = MI.getOperand(1).getReg();
2085 
2086   LLT VecTy = MRI.getType(Vec);
2087   LLT EltTy = VecTy.getElementType();
2088   assert(EltTy == MRI.getType(Dst));
2089 
2090   if (IdxVal->Value < VecTy.getNumElements())
2091     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2092   else
2093     B.buildUndef(Dst);
2094 
2095   MI.eraseFromParent();
2096   return true;
2097 }
2098 
2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2100   MachineInstr &MI, MachineRegisterInfo &MRI,
2101   MachineIRBuilder &B) const {
2102   // TODO: Should move some of this into LegalizerHelper.
2103 
2104   // TODO: Promote dynamic indexing of s16 to s32
2105 
2106   // FIXME: Artifact combiner probably should have replaced the truncated
2107   // constant before this, so we shouldn't need
2108   // getConstantVRegValWithLookThrough.
2109   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2110     MI.getOperand(3).getReg(), MRI);
2111   if (!IdxVal) // Dynamic case will be selected to register indexing.
2112     return true;
2113 
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Vec = MI.getOperand(1).getReg();
2116   Register Ins = MI.getOperand(2).getReg();
2117 
2118   LLT VecTy = MRI.getType(Vec);
2119   LLT EltTy = VecTy.getElementType();
2120   assert(EltTy == MRI.getType(Ins));
2121 
2122   if (IdxVal->Value < VecTy.getNumElements())
2123     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2124   else
2125     B.buildUndef(Dst);
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2132   MachineInstr &MI, MachineRegisterInfo &MRI,
2133   MachineIRBuilder &B) const {
2134   const LLT V2S16 = LLT::vector(2, 16);
2135 
2136   Register Dst = MI.getOperand(0).getReg();
2137   Register Src0 = MI.getOperand(1).getReg();
2138   LLT DstTy = MRI.getType(Dst);
2139   LLT SrcTy = MRI.getType(Src0);
2140 
2141   if (SrcTy == V2S16 && DstTy == V2S16 &&
2142       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2143     return true;
2144 
2145   MachineIRBuilder HelperBuilder(MI);
2146   GISelObserverWrapper DummyObserver;
2147   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2148   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeSinCos(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B) const {
2154 
2155   Register DstReg = MI.getOperand(0).getReg();
2156   Register SrcReg = MI.getOperand(1).getReg();
2157   LLT Ty = MRI.getType(DstReg);
2158   unsigned Flags = MI.getFlags();
2159 
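  // The hardware sin/cos take the input pre-scaled by 1/(2*pi); subtargets
  // with a reduced valid input range also need the fractional part of the
  // scaled value.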
2160   Register TrigVal;
2161   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2162   if (ST.hasTrigReducedRange()) {
2163     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2164     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2165       .addUse(MulVal.getReg(0))
2166       .setMIFlags(Flags).getReg(0);
2167   } else
2168     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2169 
2170   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2171     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2172   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2173     .addUse(TrigVal)
2174     .setMIFlags(Flags);
2175   MI.eraseFromParent();
2176   return true;
2177 }
2178 
2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2180                                                   MachineIRBuilder &B,
2181                                                   const GlobalValue *GV,
2182                                                   int64_t Offset,
2183                                                   unsigned GAFlags) const {
2184   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2185   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2186   // to the following code sequence:
2187   //
2188   // For constant address space:
2189   //   s_getpc_b64 s[0:1]
2190   //   s_add_u32 s0, s0, $symbol
2191   //   s_addc_u32 s1, s1, 0
2192   //
2193   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2194   //   a fixup or relocation is emitted to replace $symbol with a literal
2195   //   constant, which is a pc-relative offset from the encoding of the $symbol
2196   //   operand to the global variable.
2197   //
2198   // For global address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2201   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   fixups or relocations are emitted to replace $symbol@*@lo and
2205   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2206   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2207   //   operand to the global variable.
2208   //
2209   // What we want here is an offset from the value returned by s_getpc
2210   // (which is the address of the s_add_u32 instruction) to the global
2211   // variable, but since the encoding of $symbol starts 4 bytes after the start
2212   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2213   // small. This requires us to add 4 to the global variable offset in order to
2214   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2215   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2216   // instruction.
2217 
2218   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2219 
2220   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2221     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2222 
2223   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2224     .addDef(PCReg);
2225 
2226   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2227   if (GAFlags == SIInstrInfo::MO_NONE)
2228     MIB.addImm(0);
2229   else
2230     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2231 
2232   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2233 
2234   if (PtrTy.getSizeInBits() == 32)
2235     B.buildExtract(DstReg, PCReg, 0);
2236   return true;
2237 }
2238 
2239 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2240   MachineInstr &MI, MachineRegisterInfo &MRI,
2241   MachineIRBuilder &B) const {
2242   Register DstReg = MI.getOperand(0).getReg();
2243   LLT Ty = MRI.getType(DstReg);
2244   unsigned AS = Ty.getAddressSpace();
2245 
2246   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2247   MachineFunction &MF = B.getMF();
2248   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2249 
2250   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2251     if (!MFI->isEntryFunction()) {
2252       const Function &Fn = MF.getFunction();
2253       DiagnosticInfoUnsupported BadLDSDecl(
2254         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2255         DS_Warning);
2256       Fn.getContext().diagnose(BadLDSDecl);
2257 
2258       // We currently don't have a way to correctly allocate LDS objects that
2259       // aren't directly associated with a kernel. We do force inlining of
2260       // functions that use local objects. However, if these dead functions are
2261       // not eliminated, we don't want a compile time error. Just emit a warning
2262       // and a trap, since there should be no callable path here.
2263       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2264       B.buildUndef(DstReg);
2265       MI.eraseFromParent();
2266       return true;
2267     }
2268 
2269     // TODO: We could emit code to handle the initialization somewhere.
2270     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2271       const SITargetLowering *TLI = ST.getTargetLowering();
2272       if (!TLI->shouldUseLDSConstAddress(GV)) {
2273         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2274         return true; // Leave in place;
2275       }
2276 
2277       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2278         Type *Ty = GV->getValueType();
2279         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2280         // zero-sized type in other languages to declare dynamic shared
2281         // memory whose size is not known at compile time. These variables are
2282         // allocated by the runtime and placed directly after the statically
2283         // allocated ones, and they all share the same offset.
2284         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2285           // Adjust alignment for that dynamic shared memory array.
2286           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2287           LLT S32 = LLT::scalar(32);
2288           auto Sz =
2289               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2290           B.buildIntToPtr(DstReg, Sz);
2291           MI.eraseFromParent();
2292           return true;
2293         }
2294       }
2295 
2296       B.buildConstant(
2297           DstReg,
2298           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2299       MI.eraseFromParent();
2300       return true;
2301     }
2302 
2303     const Function &Fn = MF.getFunction();
2304     DiagnosticInfoUnsupported BadInit(
2305       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2306     Fn.getContext().diagnose(BadInit);
2307     return true;
2308   }
2309 
2310   const SITargetLowering *TLI = ST.getTargetLowering();
2311 
2312   if (TLI->shouldEmitFixup(GV)) {
2313     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2314     MI.eraseFromParent();
2315     return true;
2316   }
2317 
2318   if (TLI->shouldEmitPCReloc(GV)) {
2319     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2320     MI.eraseFromParent();
2321     return true;
2322   }
2323 
2324   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2325   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2326 
2327   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2328       MachinePointerInfo::getGOT(MF),
2329       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2330           MachineMemOperand::MOInvariant,
2331       8 /*Size*/, Align(8));
2332 
2333   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2334 
2335   if (Ty.getSizeInBits() == 32) {
2336     // Truncate if this is a 32-bit constant address.
2337     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2338     B.buildExtract(DstReg, Load, 0);
2339   } else
2340     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2341 
2342   MI.eraseFromParent();
2343   return true;
2344 }
2345 
2346 static LLT widenToNextPowerOf2(LLT Ty) {
2347   if (Ty.isVector())
2348     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2349   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2350 }
2351 
2352 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2353                                        MachineInstr &MI) const {
2354   MachineIRBuilder &B = Helper.MIRBuilder;
2355   MachineRegisterInfo &MRI = *B.getMRI();
2356   GISelChangeObserver &Observer = Helper.Observer;
2357 
2358   Register PtrReg = MI.getOperand(1).getReg();
2359   LLT PtrTy = MRI.getType(PtrReg);
2360   unsigned AddrSpace = PtrTy.getAddressSpace();
2361 
2362   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2363     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2364     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2365     Observer.changingInstr(MI);
2366     MI.getOperand(1).setReg(Cast.getReg(0));
2367     Observer.changedInstr(MI);
2368     return true;
2369   }
2370 
2371   Register ValReg = MI.getOperand(0).getReg();
2372   LLT ValTy = MRI.getType(ValReg);
2373 
2374   MachineMemOperand *MMO = *MI.memoperands_begin();
2375   const unsigned ValSize = ValTy.getSizeInBits();
2376   const unsigned MemSize = 8 * MMO->getSize();
2377   const Align MemAlign = MMO->getAlign();
2378   const unsigned AlignInBits = 8 * MemAlign.value();
2379 
2380   // Widen non-power-of-2 loads to the alignment if needed
2381   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2382     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2383 
2384     // This was already the correct extending load result type, so just adjust
2385     // the memory type.
2386     if (WideMemSize == ValSize) {
2387       MachineFunction &MF = B.getMF();
2388 
2389       MachineMemOperand *WideMMO =
2390           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2391       Observer.changingInstr(MI);
2392       MI.setMemRefs(MF, {WideMMO});
2393       Observer.changedInstr(MI);
2394       return true;
2395     }
2396 
2397     // Don't bother handling an edge case that should probably never be produced.
2398     if (ValSize > WideMemSize)
2399       return false;
2400 
2401     LLT WideTy = widenToNextPowerOf2(ValTy);
2402 
2403     Register WideLoad;
2404     if (!WideTy.isVector()) {
2405       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2406       B.buildTrunc(ValReg, WideLoad).getReg(0);
2407     } else {
2408       // Extract the subvector.
2409 
2410       if (isRegisterType(ValTy)) {
2411         // If this a case where G_EXTRACT is legal, use it.
2412         // (e.g. <3 x s32> -> <4 x s32>)
2413         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2414         B.buildExtract(ValReg, WideLoad, 0);
2415       } else {
2416         // For cases where the widened type isn't a nice register value, unmerge
2417         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2418         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2419         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2420         B.setInsertPt(B.getMBB(), MI.getIterator());
2421         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2422       }
2423     }
2424 
2425     MI.eraseFromParent();
2426     return true;
2427   }
2428 
2429   return false;
2430 }
2431 
2432 bool AMDGPULegalizerInfo::legalizeFMad(
2433   MachineInstr &MI, MachineRegisterInfo &MRI,
2434   MachineIRBuilder &B) const {
2435   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2436   assert(Ty.isScalar());
2437 
2438   MachineFunction &MF = B.getMF();
2439   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2440 
2441   // TODO: Always legal with future ftz flag.
2442   // FIXME: Do we only need to check the output denormal mode?
2443   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2444     return true;
2445   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2446     return true;
2447 
2448   MachineIRBuilder HelperBuilder(MI);
2449   GISelObserverWrapper DummyObserver;
2450   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2451   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2452 }
2453 
2454 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2455   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2456   Register DstReg = MI.getOperand(0).getReg();
2457   Register PtrReg = MI.getOperand(1).getReg();
2458   Register CmpVal = MI.getOperand(2).getReg();
2459   Register NewVal = MI.getOperand(3).getReg();
2460 
2461   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2462          "this should not have been custom lowered");
2463 
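  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // together into a two-element vector operand.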
2464   LLT ValTy = MRI.getType(CmpVal);
2465   LLT VecTy = LLT::vector(2, ValTy);
2466 
2467   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2468 
2469   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2470     .addDef(DstReg)
2471     .addUse(PtrReg)
2472     .addUse(PackedVal)
2473     .setMemRefs(MI.memoperands());
2474 
2475   MI.eraseFromParent();
2476   return true;
2477 }
2478 
2479 bool AMDGPULegalizerInfo::legalizeFlog(
2480   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2481   Register Dst = MI.getOperand(0).getReg();
2482   Register Src = MI.getOperand(1).getReg();
2483   LLT Ty = B.getMRI()->getType(Dst);
2484   unsigned Flags = MI.getFlags();
2485 
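  // log_b(x) == log2(x) * Log2BaseInverted; the callers pass ln(2) for G_FLOG
  // and ln(2)/ln(10) (== log10(2)) for G_FLOG10.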
2486   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2487   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2488 
2489   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2490   MI.eraseFromParent();
2491   return true;
2492 }
2493 
2494 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2495                                        MachineIRBuilder &B) const {
2496   Register Dst = MI.getOperand(0).getReg();
2497   Register Src = MI.getOperand(1).getReg();
2498   unsigned Flags = MI.getFlags();
2499   LLT Ty = B.getMRI()->getType(Dst);
2500 
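  // exp(x) == exp2(x * log2(e)).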
2501   auto K = B.buildFConstant(Ty, numbers::log2e);
2502   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2503   B.buildFExp2(Dst, Mul, Flags);
2504   MI.eraseFromParent();
2505   return true;
2506 }
2507 
2508 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2509                                        MachineIRBuilder &B) const {
2510   Register Dst = MI.getOperand(0).getReg();
2511   Register Src0 = MI.getOperand(1).getReg();
2512   Register Src1 = MI.getOperand(2).getReg();
2513   unsigned Flags = MI.getFlags();
2514   LLT Ty = B.getMRI()->getType(Dst);
2515   const LLT S16 = LLT::scalar(16);
2516   const LLT S32 = LLT::scalar(32);
2517 
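  // pow(x, y) is expanded as exp2(y * log2(x)), using the legacy multiply
  // which treats 0 * x as 0.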
2518   if (Ty == S32) {
2519     auto Log = B.buildFLog2(S32, Src0, Flags);
2520     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2521       .addUse(Log.getReg(0))
2522       .addUse(Src1)
2523       .setMIFlags(Flags);
2524     B.buildFExp2(Dst, Mul, Flags);
2525   } else if (Ty == S16) {
2526     // There's no f16 fmul_legacy, so we need to convert for it.
2527     auto Log = B.buildFLog2(S16, Src0, Flags);
2528     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2529     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2530     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531       .addUse(Ext0.getReg(0))
2532       .addUse(Ext1.getReg(0))
2533       .setMIFlags(Flags);
2534 
2535     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2536   } else
2537     return false;
2538 
2539   MI.eraseFromParent();
2540   return true;
2541 }
2542 
2543 // Find a source register, ignoring any possible source modifiers.
2544 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2545   Register ModSrc = OrigSrc;
2546   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2547     ModSrc = SrcFNeg->getOperand(1).getReg();
2548     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2549       ModSrc = SrcFAbs->getOperand(1).getReg();
2550   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2551     ModSrc = SrcFAbs->getOperand(1).getReg();
2552   return ModSrc;
2553 }
2554 
2555 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2556                                          MachineRegisterInfo &MRI,
2557                                          MachineIRBuilder &B) const {
2558 
2559   const LLT S1 = LLT::scalar(1);
2560   const LLT S64 = LLT::scalar(64);
2561   Register Dst = MI.getOperand(0).getReg();
2562   Register OrigSrc = MI.getOperand(1).getReg();
2563   unsigned Flags = MI.getFlags();
2564   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2565          "this should not have been custom lowered");
2566 
2567   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2568   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2569   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2570   // V_FRACT bug is:
2571   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2572   //
2573   // Convert floor(x) to (x - fract(x))
2574 
2575   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2576     .addUse(OrigSrc)
2577     .setMIFlags(Flags);
2578 
2579   // Give source modifier matching some assistance before obscuring a foldable
2580   // pattern.
2581 
2582   // TODO: We can avoid the neg on the fract? The input sign to fract
2583   // shouldn't matter?
2584   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2585 
2586   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2587 
2588   Register Min = MRI.createGenericVirtualRegister(S64);
2589 
2590   // We don't need to concern ourselves with the snan handling difference, so
2591   // use the one which will directly select.
2592   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2593   if (MFI->getMode().IEEE)
2594     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2595   else
2596     B.buildFMinNum(Min, Fract, Const, Flags);
2597 
2598   Register CorrectedFract = Min;
2599   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2600     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2601     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2602   }
2603 
2604   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2605   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2606 
2607   MI.eraseFromParent();
2608   return true;
2609 }
2610 
2611 // Turn an illegal packed v2s16 build vector into bit operations.
2612 // TODO: This should probably be a bitcast action in LegalizerHelper.
2613 bool AMDGPULegalizerInfo::legalizeBuildVector(
2614   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2615   Register Dst = MI.getOperand(0).getReg();
2616   const LLT S32 = LLT::scalar(32);
2617   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2618 
2619   Register Src0 = MI.getOperand(1).getReg();
2620   Register Src1 = MI.getOperand(2).getReg();
2621   assert(MRI.getType(Src0) == LLT::scalar(16));
2622 
2623   auto Merge = B.buildMerge(S32, {Src0, Src1});
2624   B.buildBitcast(Dst, Merge);
2625 
2626   MI.eraseFromParent();
2627   return true;
2628 }
2629 
2630 // Check that this is a G_XOR x, -1
2631 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2632   if (MI.getOpcode() != TargetOpcode::G_XOR)
2633     return false;
2634   auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
2635   return ConstVal && *ConstVal == -1;
2636 }
2637 
2638 // Return the use branch instruction, otherwise null if the usage is invalid.
2639 static MachineInstr *
2640 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2641                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2642   Register CondDef = MI.getOperand(0).getReg();
2643   if (!MRI.hasOneNonDBGUse(CondDef))
2644     return nullptr;
2645 
2646   MachineBasicBlock *Parent = MI.getParent();
2647   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2648 
2649   if (isNot(MRI, *UseMI)) {
2650     Register NegatedCond = UseMI->getOperand(0).getReg();
2651     if (!MRI.hasOneNonDBGUse(NegatedCond))
2652       return nullptr;
2653 
2654     // We're deleting the def of this value, so we need to remove it.
2655     UseMI->eraseFromParent();
2656 
2657     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2658     Negated = true;
2659   }
2660 
2661   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2662     return nullptr;
2663 
2664   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2665   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2666   if (Next == Parent->end()) {
2667     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2668     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2669       return nullptr;
2670     UncondBrTarget = &*NextMBB;
2671   } else {
2672     if (Next->getOpcode() != AMDGPU::G_BR)
2673       return nullptr;
2674     Br = &*Next;
2675     UncondBrTarget = Br->getOperand(0).getMBB();
2676   }
2677 
2678   return UseMI;
2679 }
2680 
2681 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2682                                          const ArgDescriptor *Arg,
2683                                          const TargetRegisterClass *ArgRC,
2684                                          LLT ArgTy) const {
2685   MCRegister SrcReg = Arg->getRegister();
2686   assert(SrcReg.isPhysical() && "Physical register expected");
2687   assert(DstReg.isVirtual() && "Virtual register expected");
2688 
2689   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2690                                              ArgTy);
2691   if (Arg->isMasked()) {
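    // Masked arguments (such as workitem IDs packed into one register) are
    // recovered by shifting the live-in down to bit 0 and masking off the
    // field.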
2692     // TODO: Should we try to emit this once in the entry block?
2693     const LLT S32 = LLT::scalar(32);
2694     const unsigned Mask = Arg->getMask();
2695     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2696 
2697     Register AndMaskSrc = LiveIn;
2698 
2699     if (Shift != 0) {
2700       auto ShiftAmt = B.buildConstant(S32, Shift);
2701       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2702     }
2703 
2704     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2705   } else {
2706     B.buildCopy(DstReg, LiveIn);
2707   }
2708 
2709   return true;
2710 }
2711 
2712 bool AMDGPULegalizerInfo::loadInputValue(
2713     Register DstReg, MachineIRBuilder &B,
2714     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2715   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2716   const ArgDescriptor *Arg;
2717   const TargetRegisterClass *ArgRC;
2718   LLT ArgTy;
2719   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2720 
2721   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2722     return false; // TODO: Handle these
2723   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2724 }
2725 
2726 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2727     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2728     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2729   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2730     return false;
2731 
2732   MI.eraseFromParent();
2733   return true;
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2737                                        MachineRegisterInfo &MRI,
2738                                        MachineIRBuilder &B) const {
2739   Register Dst = MI.getOperand(0).getReg();
2740   LLT DstTy = MRI.getType(Dst);
2741   LLT S16 = LLT::scalar(16);
2742   LLT S32 = LLT::scalar(32);
2743   LLT S64 = LLT::scalar(64);
2744 
2745   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2746     return true;
2747 
2748   if (DstTy == S16)
2749     return legalizeFDIV16(MI, MRI, B);
2750   if (DstTy == S32)
2751     return legalizeFDIV32(MI, MRI, B);
2752   if (DstTy == S64)
2753     return legalizeFDIV64(MI, MRI, B);
2754 
2755   return false;
2756 }
2757 
2758 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2759                                                   Register DstReg,
2760                                                   Register X,
2761                                                   Register Y,
2762                                                   bool IsDiv) const {
2763   const LLT S1 = LLT::scalar(1);
2764   const LLT S32 = LLT::scalar(32);
2765 
2766   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2767   // algorithm used here.
2768 
2769   // Initial estimate of inv(y).
2770   auto FloatY = B.buildUITOFP(S32, Y);
2771   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2772   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2773   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2774   auto Z = B.buildFPTOUI(S32, ScaledY);
2775 
2776   // One round of UNR (unsigned Newton-Raphson) to refine the reciprocal.
2777   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2778   auto NegYZ = B.buildMul(S32, NegY, Z);
2779   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2780 
2781   // Quotient/remainder estimate.
2782   auto Q = B.buildUMulH(S32, X, Z);
2783   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2784 
2785   // First quotient/remainder refinement.
2786   auto One = B.buildConstant(S32, 1);
2787   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2788   if (IsDiv)
2789     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2790   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2791 
2792   // Second quotient/remainder refinement.
2793   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2794   if (IsDiv)
2795     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2796   else
2797     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2798 }
2799 
2800 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2801                                               MachineRegisterInfo &MRI,
2802                                               MachineIRBuilder &B) const {
2803   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2804   Register DstReg = MI.getOperand(0).getReg();
2805   Register Num = MI.getOperand(1).getReg();
2806   Register Den = MI.getOperand(2).getReg();
2807   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2808   MI.eraseFromParent();
2809   return true;
2810 }
2811 
2812 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2813 //
2814 // Return lo, hi of result
2815 //
2816 // %cvt.lo = G_UITOFP Val.lo
2817 // %cvt.hi = G_UITOFP Val.hi
2818 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2819 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2820 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2821 // %mul2 = G_FMUL %mul1, 2**(-32)
2822 // %trunc = G_INTRINSIC_TRUNC %mul2
2823 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2824 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2825 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2826                                                        Register Val) {
2827   const LLT S32 = LLT::scalar(32);
2828   auto Unmerge = B.buildUnmerge(S32, Val);
2829 
2830   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2831   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2832 
2833   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2834                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2835 
2836   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2837   auto Mul1 =
2838       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2839 
2840   // 2**(-32)
2841   auto Mul2 =
2842       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2843   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2844 
2845   // -(2**32)
2846   auto Mad2 = B.buildFMAD(S32, Trunc,
2847                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2848 
2849   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2850   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2851 
2852   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2853 }
2854 
2855 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2856                                                   Register DstReg,
2857                                                   Register Numer,
2858                                                   Register Denom,
2859                                                   bool IsDiv) const {
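  // Same structure as the 32-bit case: refine the ~2^64 / Denom estimate from
  // emitReciprocalU64 with two Newton-Raphson style steps using -Denom, form
  // the quotient with a 64-bit umulh, then conditionally correct the quotient
  // and remainder at most twice.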
2860   const LLT S32 = LLT::scalar(32);
2861   const LLT S64 = LLT::scalar(64);
2862   const LLT S1 = LLT::scalar(1);
2863   Register RcpLo, RcpHi;
2864 
2865   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2866 
2867   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2868 
2869   auto Zero64 = B.buildConstant(S64, 0);
2870   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2871 
2872   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2873   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2874 
2875   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2876   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2877   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2878 
2879   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2880   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2881   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2882   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2883 
2884   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2885   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2886   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2887   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2888   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2889 
2890   auto Zero32 = B.buildConstant(S32, 0);
2891   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2892   auto Add2_HiC =
2893       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2894   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2895   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2896 
2897   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2898   Register NumerLo = UnmergeNumer.getReg(0);
2899   Register NumerHi = UnmergeNumer.getReg(1);
2900 
2901   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2902   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2903   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2904   Register Mul3_Lo = UnmergeMul3.getReg(0);
2905   Register Mul3_Hi = UnmergeMul3.getReg(1);
2906   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2907   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2908   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2909   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2910 
2911   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2912   Register DenomLo = UnmergeDenom.getReg(0);
2913   Register DenomHi = UnmergeDenom.getReg(1);
2914 
2915   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2916   auto C1 = B.buildSExt(S32, CmpHi);
2917 
2918   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2919   auto C2 = B.buildSExt(S32, CmpLo);
2920 
2921   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2922   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2923 
2924   // TODO: Here and below portions of the code can be enclosed into if/endif.
2925   // Currently control flow is unconditional and we have 4 selects after
2926   // potential endif to substitute PHIs.
2927 
2928   // if C3 != 0 ...
2929   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2930   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2931   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2932   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2933 
2934   auto One64 = B.buildConstant(S64, 1);
2935   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2936 
2937   auto C4 =
2938       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2939   auto C5 =
2940       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2941   auto C6 = B.buildSelect(
2942       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2943 
2944   // if (C6 != 0)
2945   auto Add4 = B.buildAdd(S64, Add3, One64);
2946   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2947 
2948   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2949   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2950   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2951 
2952   // endif C6
2953   // endif C3
2954 
2955   if (IsDiv) {
2956     auto Sel1 = B.buildSelect(
2957         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2958     B.buildSelect(DstReg,
2959                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2960   } else {
2961     auto Sel2 = B.buildSelect(
2962         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2963     B.buildSelect(DstReg,
2964                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2965   }
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2969                                             MachineRegisterInfo &MRI,
2970                                             MachineIRBuilder &B) const {
2971   const LLT S64 = LLT::scalar(64);
2972   const LLT S32 = LLT::scalar(32);
2973   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2974   Register DstReg = MI.getOperand(0).getReg();
2975   Register Num = MI.getOperand(1).getReg();
2976   Register Den = MI.getOperand(2).getReg();
2977   LLT Ty = MRI.getType(DstReg);
2978 
2979   if (Ty == S32)
2980     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2981   else if (Ty == S64)
2982     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2983   else
2984     return false;
2985 
2986   MI.eraseFromParent();
2987   return true;
2989 }
2990 
2991 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2992                                             MachineRegisterInfo &MRI,
2993                                             MachineIRBuilder &B) const {
2994   const LLT S64 = LLT::scalar(64);
2995   const LLT S32 = LLT::scalar(32);
2996 
2997   Register DstReg = MI.getOperand(0).getReg();
2998   const LLT Ty = MRI.getType(DstReg);
2999   if (Ty != S32 && Ty != S64)
3000     return false;
3001 
3002   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3003 
3004   Register LHS = MI.getOperand(1).getReg();
3005   Register RHS = MI.getOperand(2).getReg();
3006 
3007   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3008   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3009   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3010 
3011   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3012   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3013 
3014   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3015   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
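  // At this point LHS and RHS hold the absolute values: for two's complement x
  // with s = x >> (bits - 1), (x + s) ^ s == |x| (sketch of the identity; the
  // sign is re-applied to the unsigned result below).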
3016 
3017   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3018   if (Ty == S32)
3019     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3020   else
3021     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3022 
3023   Register Sign;
3024   if (IsDiv)
3025     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3026   else
3027     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3028 
3029   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3030   B.buildSub(DstReg, UDivRem, Sign);
3031 
3032   MI.eraseFromParent();
3033   return true;
3034 }
3035 
3036 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3037                                                  MachineRegisterInfo &MRI,
3038                                                  MachineIRBuilder &B) const {
3039   Register Res = MI.getOperand(0).getReg();
3040   Register LHS = MI.getOperand(1).getReg();
3041   Register RHS = MI.getOperand(2).getReg();
3042 
3043   uint16_t Flags = MI.getFlags();
3044 
3045   LLT ResTy = MRI.getType(Res);
3046   LLT S32 = LLT::scalar(32);
3047   LLT S64 = LLT::scalar(64);
3048 
3049   const MachineFunction &MF = B.getMF();
3050   bool Unsafe =
3051     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3052 
3053   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3054     return false;
3055 
3056   if (!Unsafe && ResTy == S32 &&
3057       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3058     return false;
3059 
3060   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3061     // 1 / x -> RCP(x)
3062     if (CLHS->isExactlyValue(1.0)) {
3063       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3064         .addUse(RHS)
3065         .setMIFlags(Flags);
3066 
3067       MI.eraseFromParent();
3068       return true;
3069     }
3070 
3071     // -1 / x -> RCP( FNEG(x) )
3072     if (CLHS->isExactlyValue(-1.0)) {
3073       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3074       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3075         .addUse(FNeg.getReg(0))
3076         .setMIFlags(Flags);
3077 
3078       MI.eraseFromParent();
3079       return true;
3080     }
3081   }
3082 
3083   // x / y -> x * (1.0 / y)
3084   if (Unsafe) {
3085     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3086       .addUse(RHS)
3087       .setMIFlags(Flags);
3088     B.buildFMul(Res, LHS, RCP, Flags);
3089 
3090     MI.eraseFromParent();
3091     return true;
3092   }
3093 
3094   return false;
3095 }
3096 
3097 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3098                                          MachineRegisterInfo &MRI,
3099                                          MachineIRBuilder &B) const {
3100   Register Res = MI.getOperand(0).getReg();
3101   Register LHS = MI.getOperand(1).getReg();
3102   Register RHS = MI.getOperand(2).getReg();
3103 
3104   uint16_t Flags = MI.getFlags();
3105 
3106   LLT S16 = LLT::scalar(16);
3107   LLT S32 = LLT::scalar(32);
3108 
3109   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3110   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3111 
3112   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3113     .addUse(RHSExt.getReg(0))
3114     .setMIFlags(Flags);
3115 
3116   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3117   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3118 
3119   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3120     .addUse(RDst.getReg(0))
3121     .addUse(RHS)
3122     .addUse(LHS)
3123     .setMIFlags(Flags);
3124 
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3130 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3131 static void toggleSPDenormMode(bool Enable,
3132                                MachineIRBuilder &B,
3133                                const GCNSubtarget &ST,
3134                                AMDGPU::SIModeRegisterDefaults Mode) {
3135   // Set SP denorm mode to this value.
3136   unsigned SPDenormMode =
3137     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3138 
3139   if (ST.hasDenormModeInst()) {
3140     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3141     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3142 
3143     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3144     B.buildInstr(AMDGPU::S_DENORM_MODE)
3145       .addImm(NewDenormModeValue);
3146 
3147   } else {
3148     // Select FP32 bit field in mode register.
3149     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3150                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3151                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
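    // Rough reading of the encoding above (the field layout is an assumption
    // based on the MODE register documentation): this names a 2-bit field at
    // bit offset 4 of the MODE hwreg, i.e. the FP32 denorm control bits,
    // roughly hwreg(HW_REG_MODE, 4, 2) in assembly syntax.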
3152 
3153     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3154       .addImm(SPDenormMode)
3155       .addImm(SPDenormModeBitField);
3156   }
3157 }
3158 
3159 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3160                                          MachineRegisterInfo &MRI,
3161                                          MachineIRBuilder &B) const {
3162   Register Res = MI.getOperand(0).getReg();
3163   Register LHS = MI.getOperand(1).getReg();
3164   Register RHS = MI.getOperand(2).getReg();
3165   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3166   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3167 
3168   uint16_t Flags = MI.getFlags();
3169 
3170   LLT S32 = LLT::scalar(32);
3171   LLT S1 = LLT::scalar(1);
3172 
3173   auto One = B.buildFConstant(S32, 1.0f);
3174 
3175   auto DenominatorScaled =
3176     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3177       .addUse(LHS)
3178       .addUse(RHS)
3179       .addImm(0)
3180       .setMIFlags(Flags);
3181   auto NumeratorScaled =
3182     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3183       .addUse(LHS)
3184       .addUse(RHS)
3185       .addImm(1)
3186       .setMIFlags(Flags);
3187 
3188   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3189     .addUse(DenominatorScaled.getReg(0))
3190     .setMIFlags(Flags);
3191   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3192 
3193   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3194   // aren't modeled as reading it.
3195   if (!Mode.allFP32Denormals())
3196     toggleSPDenormMode(true, B, ST, Mode);
3197 
3198   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3199   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3200   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3201   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3202   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3203   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
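  // Informal reading of the chain above (d = scaled denominator, n = scaled
  // numerator, r = approximate 1/d):
  //   Fma0 = 1 - d*r          (error of the reciprocal estimate)
  //   Fma1 = r + r*Fma0       (refined reciprocal)
  //   Mul  = n * Fma1         (quotient estimate)
  //   Fma2 = n - d*Mul        (quotient error)
  //   Fma3 = Mul + Fma2*Fma1  (refined quotient)
  //   Fma4 = n - d*Fma3       (residual fed to div_fmas for final rounding)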
3204 
3205   if (!Mode.allFP32Denormals())
3206     toggleSPDenormMode(false, B, ST, Mode);
3207 
3208   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3209     .addUse(Fma4.getReg(0))
3210     .addUse(Fma1.getReg(0))
3211     .addUse(Fma3.getReg(0))
3212     .addUse(NumeratorScaled.getReg(1))
3213     .setMIFlags(Flags);
3214 
3215   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3216     .addUse(Fmas.getReg(0))
3217     .addUse(RHS)
3218     .addUse(LHS)
3219     .setMIFlags(Flags);
3220 
3221   MI.eraseFromParent();
3222   return true;
3223 }
3224 
3225 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3226                                          MachineRegisterInfo &MRI,
3227                                          MachineIRBuilder &B) const {
3228   Register Res = MI.getOperand(0).getReg();
3229   Register LHS = MI.getOperand(1).getReg();
3230   Register RHS = MI.getOperand(2).getReg();
3231 
3232   uint16_t Flags = MI.getFlags();
3233 
3234   LLT S64 = LLT::scalar(64);
3235   LLT S1 = LLT::scalar(1);
3236 
3237   auto One = B.buildFConstant(S64, 1.0);
3238 
3239   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3240     .addUse(LHS)
3241     .addUse(RHS)
3242     .addImm(0)
3243     .setMIFlags(Flags);
3244 
3245   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3246 
3247   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3248     .addUse(DivScale0.getReg(0))
3249     .setMIFlags(Flags);
3250 
3251   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3252   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3253   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3254 
3255   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3256     .addUse(LHS)
3257     .addUse(RHS)
3258     .addImm(1)
3259     .setMIFlags(Flags);
3260 
3261   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3262   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3263   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
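  // As in the f32 path (informal summary): Fma0..Fma3 perform two refinement
  // steps on the reciprocal estimate, Mul forms the quotient estimate from the
  // scaled numerator (DivScale1), and Fma4 is the residual passed to div_fmas.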
3264 
3265   Register Scale;
3266   if (!ST.hasUsableDivScaleConditionOutput()) {
3267     // Workaround a hardware bug on SI where the condition output from div_scale
3268     // is not usable.
3269 
3270     LLT S32 = LLT::scalar(32);
3271 
3272     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3273     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3274     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3275     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3276 
3277     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3278                               Scale1Unmerge.getReg(1));
3279     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3280                               Scale0Unmerge.getReg(1));
3281     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3282   } else {
3283     Scale = DivScale1.getReg(1);
3284   }
3285 
3286   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3287     .addUse(Fma4.getReg(0))
3288     .addUse(Fma3.getReg(0))
3289     .addUse(Mul.getReg(0))
3290     .addUse(Scale)
3291     .setMIFlags(Flags);
3292 
3293   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3294     .addUse(Fmas.getReg(0))
3295     .addUse(RHS)
3296     .addUse(LHS)
3297     .setMIFlags(Flags);
3298 
3299   MI.eraseFromParent();
3300   return true;
3301 }
3302 
3303 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3304                                                  MachineRegisterInfo &MRI,
3305                                                  MachineIRBuilder &B) const {
3306   Register Res = MI.getOperand(0).getReg();
3307   Register LHS = MI.getOperand(2).getReg();
3308   Register RHS = MI.getOperand(3).getReg();
3309   uint16_t Flags = MI.getFlags();
3310 
3311   LLT S32 = LLT::scalar(32);
3312   LLT S1 = LLT::scalar(1);
3313 
3314   auto Abs = B.buildFAbs(S32, RHS, Flags);
3315   const APFloat C0Val(1.0f);
3316 
3317   auto C0 = B.buildConstant(S32, 0x6f800000);
3318   auto C1 = B.buildConstant(S32, 0x2f800000);
3319   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3320 
3321   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3322   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
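  // Sketch of the scaling trick: 0x6f800000 is roughly 2^96 and 0x2f800000 is
  // 2^-32, so a very large |RHS| is pre-scaled by 2^-32 before the rcp, and
  // the same factor is re-applied to the final product below, which leaves the
  // overall LHS/RHS value unchanged.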
3323 
3324   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3325 
3326   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3327     .addUse(Mul0.getReg(0))
3328     .setMIFlags(Flags);
3329 
3330   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3331 
3332   B.buildFMul(Res, Sel, Mul1, Flags);
3333 
3334   MI.eraseFromParent();
3335   return true;
3336 }
3337 
3338 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3339 // FIXME: Why do we handle this one but not other removed instructions?
3340 //
3341 // Reciprocal square root.  The clamp prevents infinite results, clamping
3342 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3343 // +-max_float.
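//
// Expanded below (roughly) as min/max clamps around the plain rsq intrinsic:
//   fmin(rsq(x), +max_float) followed by fmax(..., -max_float),
// using the IEEE or non-IEEE min/max variants depending on the function's mode.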
3344 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3345                                                     MachineRegisterInfo &MRI,
3346                                                     MachineIRBuilder &B) const {
3347   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3348     return true;
3349 
3350   Register Dst = MI.getOperand(0).getReg();
3351   Register Src = MI.getOperand(2).getReg();
3352   auto Flags = MI.getFlags();
3353 
3354   LLT Ty = MRI.getType(Dst);
3355 
3356   const fltSemantics *FltSemantics;
3357   if (Ty == LLT::scalar(32))
3358     FltSemantics = &APFloat::IEEEsingle();
3359   else if (Ty == LLT::scalar(64))
3360     FltSemantics = &APFloat::IEEEdouble();
3361   else
3362     return false;
3363 
3364   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3365     .addUse(Src)
3366     .setMIFlags(Flags);
3367 
3368   // We don't need to concern ourselves with the snan handling difference, since
3369   // the rsq already quieted it (or not), so use the form that will directly select.
3370   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3371   const bool UseIEEE = MFI->getMode().IEEE;
3372 
3373   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3374   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3375                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3376 
3377   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3378 
3379   if (UseIEEE)
3380     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3381   else
3382     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3383   MI.eraseFromParent();
3384   return true;
3385 }
3386 
3387 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3388   switch (IID) {
3389   case Intrinsic::amdgcn_ds_fadd:
3390     return AMDGPU::G_ATOMICRMW_FADD;
3391   case Intrinsic::amdgcn_ds_fmin:
3392     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3393   case Intrinsic::amdgcn_ds_fmax:
3394     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3395   default:
3396     llvm_unreachable("not a DS FP intrinsic");
3397   }
3398 }
3399 
3400 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3401                                                       MachineInstr &MI,
3402                                                       Intrinsic::ID IID) const {
3403   GISelChangeObserver &Observer = Helper.Observer;
3404   Observer.changingInstr(MI);
3405 
3406   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3407 
3408   // The remaining operands were used to set fields in the MemOperand on
3409   // construction.
3410   for (int I = 6; I > 3; --I)
3411     MI.RemoveOperand(I);
3412 
3413   MI.RemoveOperand(1); // Remove the intrinsic ID.
3414   Observer.changedInstr(MI);
3415   return true;
3416 }
3417 
3418 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3419                                             MachineRegisterInfo &MRI,
3420                                             MachineIRBuilder &B) const {
3421   uint64_t Offset =
3422     ST.getTargetLowering()->getImplicitParameterOffset(
3423       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3424   LLT DstTy = MRI.getType(DstReg);
3425   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3426 
3427   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3428   if (!loadInputValue(KernargPtrReg, B,
3429                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3430     return false;
3431 
3432   // FIXME: This should be nuw
3433   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3434   return true;
3435 }
3436 
3437 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3438                                                  MachineRegisterInfo &MRI,
3439                                                  MachineIRBuilder &B) const {
3440   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3441   if (!MFI->isEntryFunction()) {
3442     return legalizePreloadedArgIntrin(MI, MRI, B,
3443                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3444   }
3445 
3446   Register DstReg = MI.getOperand(0).getReg();
3447   if (!getImplicitArgPtr(DstReg, MRI, B))
3448     return false;
3449 
3450   MI.eraseFromParent();
3451   return true;
3452 }
3453 
3454 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3455                                               MachineRegisterInfo &MRI,
3456                                               MachineIRBuilder &B,
3457                                               unsigned AddrSpace) const {
3458   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3459   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3460   Register Hi32 = Unmerge.getReg(1);
3461 
3462   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3463   MI.eraseFromParent();
3464   return true;
3465 }
3466 
3467 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3468 // offset (the offset that is included in bounds checking and swizzling, to be
3469 // split between the instruction's voffset and immoffset fields) and soffset
3470 // (the offset that is excluded from bounds checking and swizzling, to go in
3471 // the instruction's soffset field).  This function takes the first kind of
3472 // offset and figures out how to split it between voffset and immoffset.
3473 std::tuple<Register, unsigned, unsigned>
3474 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3475                                         Register OrigOffset) const {
3476   const unsigned MaxImm = 4095;
3477   Register BaseReg;
3478   unsigned TotalConstOffset;
3479   MachineInstr *OffsetDef;
3480   const LLT S32 = LLT::scalar(32);
3481 
3482   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3483     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3484 
3485   unsigned ImmOffset = TotalConstOffset;
3486 
3487   // If the immediate value is too big for the immoffset field, keep only the
3488   // low 12 bits in the immoffset field so that the value that is copied/added
3489   // for the voffset field is a multiple of 4096, and it stands more chance
3490   // of being CSEd with the copy/add for another similar load/store.
3491   // However, do not do that rounding down to a multiple of 4096 if that is a
3492   // negative number, as it appears to be illegal to have a negative offset
3493   // in the vgpr, even if adding the immediate offset makes it positive.
3494   unsigned Overflow = ImmOffset & ~MaxImm;
3495   ImmOffset -= Overflow;
3496   if ((int32_t)Overflow < 0) {
3497     Overflow += ImmOffset;
3498     ImmOffset = 0;
3499   }
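  // Worked example (illustrative values): TotalConstOffset == 8200 gives
  // Overflow == 8192, which is added into the voffset below, and
  // ImmOffset == 8, which ends up in the instruction's immoffset field.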
3500 
3501   if (Overflow != 0) {
3502     if (!BaseReg) {
3503       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3504     } else {
3505       auto OverflowVal = B.buildConstant(S32, Overflow);
3506       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3507     }
3508   }
3509 
3510   if (!BaseReg)
3511     BaseReg = B.buildConstant(S32, 0).getReg(0);
3512 
3513   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3514 }
3515 
3516 /// Handle register layout difference for f16 images for some subtargets.
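/// For example (informal): on subtargets with unpacked D16 VMEM a <4 x s16>
/// value is rewritten to a <4 x s32> build_vector with each element
/// any-extended; on packed subtargets the register is returned unchanged.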
3517 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3518                                              MachineRegisterInfo &MRI,
3519                                              Register Reg) const {
3520   if (!ST.hasUnpackedD16VMem())
3521     return Reg;
3522 
3523   const LLT S16 = LLT::scalar(16);
3524   const LLT S32 = LLT::scalar(32);
3525   LLT StoreVT = MRI.getType(Reg);
3526   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3527 
3528   auto Unmerge = B.buildUnmerge(S16, Reg);
3529 
3530   SmallVector<Register, 4> WideRegs;
3531   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3532     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3533 
3534   int NumElts = StoreVT.getNumElements();
3535 
3536   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3537 }
3538 
3539 Register AMDGPULegalizerInfo::fixStoreSourceType(
3540   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3541   MachineRegisterInfo *MRI = B.getMRI();
3542   LLT Ty = MRI->getType(VData);
3543 
3544   const LLT S16 = LLT::scalar(16);
3545 
3546   // Fixup illegal register types for i8 stores.
3547   if (Ty == LLT::scalar(8) || Ty == S16) {
3548     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3549     return AnyExt;
3550   }
3551 
3552   if (Ty.isVector()) {
3553     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3554       if (IsFormat)
3555         return handleD16VData(B, *MRI, VData);
3556     }
3557   }
3558 
3559   return VData;
3560 }
3561 
3562 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3563                                               MachineRegisterInfo &MRI,
3564                                               MachineIRBuilder &B,
3565                                               bool IsTyped,
3566                                               bool IsFormat) const {
3567   Register VData = MI.getOperand(1).getReg();
3568   LLT Ty = MRI.getType(VData);
3569   LLT EltTy = Ty.getScalarType();
3570   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3571   const LLT S32 = LLT::scalar(32);
3572 
3573   VData = fixStoreSourceType(B, VData, IsFormat);
3574   Register RSrc = MI.getOperand(2).getReg();
3575 
3576   MachineMemOperand *MMO = *MI.memoperands_begin();
3577   const int MemSize = MMO->getSize();
3578 
3579   unsigned ImmOffset;
3580   unsigned TotalOffset;
3581 
3582   // The typed intrinsics add an immediate after the registers.
3583   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3584 
3585   // The struct intrinsic variants add one additional operand over raw.
3586   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3587   Register VIndex;
3588   int OpOffset = 0;
3589   if (HasVIndex) {
3590     VIndex = MI.getOperand(3).getReg();
3591     OpOffset = 1;
3592   }
3593 
3594   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3595   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3596 
3597   unsigned Format = 0;
3598   if (IsTyped) {
3599     Format = MI.getOperand(5 + OpOffset).getImm();
3600     ++OpOffset;
3601   }
3602 
3603   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3604 
3605   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3606   if (TotalOffset != 0)
3607     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3608 
3609   unsigned Opc;
3610   if (IsTyped) {
3611     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3612                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3613   } else if (IsFormat) {
3614     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3615                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3616   } else {
3617     switch (MemSize) {
3618     case 1:
3619       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3620       break;
3621     case 2:
3622       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3623       break;
3624     default:
3625       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3626       break;
3627     }
3628   }
3629 
3630   if (!VIndex)
3631     VIndex = B.buildConstant(S32, 0).getReg(0);
3632 
3633   auto MIB = B.buildInstr(Opc)
3634     .addUse(VData)              // vdata
3635     .addUse(RSrc)               // rsrc
3636     .addUse(VIndex)             // vindex
3637     .addUse(VOffset)            // voffset
3638     .addUse(SOffset)            // soffset
3639     .addImm(ImmOffset);         // offset(imm)
3640 
3641   if (IsTyped)
3642     MIB.addImm(Format);
3643 
3644   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3645      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3646      .addMemOperand(MMO);
3647 
3648   MI.eraseFromParent();
3649   return true;
3650 }
3651 
3652 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3653                                              MachineRegisterInfo &MRI,
3654                                              MachineIRBuilder &B,
3655                                              bool IsFormat,
3656                                              bool IsTyped) const {
3657   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3658   MachineMemOperand *MMO = *MI.memoperands_begin();
3659   const int MemSize = MMO->getSize();
3660   const LLT S32 = LLT::scalar(32);
3661 
3662   Register Dst = MI.getOperand(0).getReg();
3663   Register RSrc = MI.getOperand(2).getReg();
3664 
3665   // The typed intrinsics add an immediate after the registers.
3666   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3667 
3668   // The struct intrinsic variants add one additional operand over raw.
3669   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3670   Register VIndex;
3671   int OpOffset = 0;
3672   if (HasVIndex) {
3673     VIndex = MI.getOperand(3).getReg();
3674     OpOffset = 1;
3675   }
3676 
3677   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3678   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3679 
3680   unsigned Format = 0;
3681   if (IsTyped) {
3682     Format = MI.getOperand(5 + OpOffset).getImm();
3683     ++OpOffset;
3684   }
3685 
3686   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3687   unsigned ImmOffset;
3688   unsigned TotalOffset;
3689 
3690   LLT Ty = MRI.getType(Dst);
3691   LLT EltTy = Ty.getScalarType();
3692   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3693   const bool Unpacked = ST.hasUnpackedD16VMem();
3694 
3695   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3696   if (TotalOffset != 0)
3697     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3698 
3699   unsigned Opc;
3700 
3701   if (IsTyped) {
3702     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3703                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3704   } else if (IsFormat) {
3705     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3706                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3707   } else {
3708     switch (MemSize) {
3709     case 1:
3710       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3711       break;
3712     case 2:
3713       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3714       break;
3715     default:
3716       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3717       break;
3718     }
3719   }
3720 
3721   Register LoadDstReg;
3722 
3723   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3724   LLT UnpackedTy = Ty.changeElementSize(32);
3725 
3726   if (IsExtLoad)
3727     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3728   else if (Unpacked && IsD16 && Ty.isVector())
3729     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3730   else
3731     LoadDstReg = Dst;
3732 
3733   if (!VIndex)
3734     VIndex = B.buildConstant(S32, 0).getReg(0);
3735 
3736   auto MIB = B.buildInstr(Opc)
3737     .addDef(LoadDstReg)         // vdata
3738     .addUse(RSrc)               // rsrc
3739     .addUse(VIndex)             // vindex
3740     .addUse(VOffset)            // voffset
3741     .addUse(SOffset)            // soffset
3742     .addImm(ImmOffset);         // offset(imm)
3743 
3744   if (IsTyped)
3745     MIB.addImm(Format);
3746 
3747   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3748      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3749      .addMemOperand(MMO);
3750 
3751   if (LoadDstReg != Dst) {
3752     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3753 
3754     // The result register was widened for the extending load; truncate it back.
3755     if (IsExtLoad)
3756       B.buildTrunc(Dst, LoadDstReg);
3757     else {
3758       // Repack to original 16-bit vector result
3759       // FIXME: G_TRUNC should work, but legalization currently fails
3760       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3761       SmallVector<Register, 4> Repack;
3762       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3763         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3764       B.buildMerge(Dst, Repack);
3765     }
3766   }
3767 
3768   MI.eraseFromParent();
3769   return true;
3770 }
3771 
3772 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3773                                                MachineIRBuilder &B,
3774                                                bool IsInc) const {
3775   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3776                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3777   B.buildInstr(Opc)
3778     .addDef(MI.getOperand(0).getReg())
3779     .addUse(MI.getOperand(2).getReg())
3780     .addUse(MI.getOperand(3).getReg())
3781     .cloneMemRefs(MI);
3782   MI.eraseFromParent();
3783   return true;
3784 }
3785 
3786 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3787   switch (IntrID) {
3788   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3789   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3790     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3791   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3792   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3793     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3794   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3795   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3796     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3797   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3798   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3799     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3800   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3801   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3802     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3803   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3804   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3805     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3806   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3807   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3808     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3809   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3810   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3811     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3812   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3813   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3814     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3815   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3816   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3817     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3818   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3819   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3820     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3821   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3822   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3823     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3824   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3825   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3826     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3827   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3828   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3829     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3830   default:
3831     llvm_unreachable("unhandled atomic opcode");
3832   }
3833 }
3834 
3835 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3836                                                MachineIRBuilder &B,
3837                                                Intrinsic::ID IID) const {
3838   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3839                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3840   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3841 
3842   Register Dst;
3843 
3844   int OpOffset = 0;
3845   if (HasReturn) {
3846     // A few FP atomics do not support return values.
3847     Dst = MI.getOperand(0).getReg();
3848   } else {
3849     OpOffset = -1;
3850   }
3851 
3852   Register VData = MI.getOperand(2 + OpOffset).getReg();
3853   Register CmpVal;
3854 
3855   if (IsCmpSwap) {
3856     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3857     ++OpOffset;
3858   }
3859 
3860   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3861   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3862 
3863   // The struct intrinsic variants add one additional operand over raw.
3864   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3865   Register VIndex;
3866   if (HasVIndex) {
3867     VIndex = MI.getOperand(4 + OpOffset).getReg();
3868     ++OpOffset;
3869   }
3870 
3871   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3872   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3873   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3874 
3875   MachineMemOperand *MMO = *MI.memoperands_begin();
3876 
3877   unsigned ImmOffset;
3878   unsigned TotalOffset;
3879   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3880   if (TotalOffset != 0)
3881     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3882 
3883   if (!VIndex)
3884     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3885 
3886   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3887 
3888   if (HasReturn)
3889     MIB.addDef(Dst);
3890 
3891   MIB.addUse(VData); // vdata
3892 
3893   if (IsCmpSwap)
3894     MIB.addReg(CmpVal);
3895 
3896   MIB.addUse(RSrc)               // rsrc
3897      .addUse(VIndex)             // vindex
3898      .addUse(VOffset)            // voffset
3899      .addUse(SOffset)            // soffset
3900      .addImm(ImmOffset)          // offset(imm)
3901      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3902      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3903      .addMemOperand(MMO);
3904 
3905   MI.eraseFromParent();
3906   return true;
3907 }
3908 
3909 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3910 /// vectors, appending the results to \p PackedAddrs.
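/// For example (informal): with a16 addresses, three s16 coordinates x, y, z
/// become { <2 x s16>(x, y), <2 x s16>(z, undef) }, and gradient pairs are
/// packed the same way, with odd-count gradients padded with undef.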
3911 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3912                                         SmallVectorImpl<Register> &PackedAddrs,
3913                                         int AddrIdx, int DimIdx, int EndIdx,
3914                                         int NumGradients) {
3915   const LLT S16 = LLT::scalar(16);
3916   const LLT V2S16 = LLT::vector(2, 16);
3917 
3918   for (int I = AddrIdx; I < EndIdx; ++I) {
3919     MachineOperand &SrcOp = MI.getOperand(I);
3920     if (!SrcOp.isReg())
3921       continue; // _L to _LZ may have eliminated this.
3922 
3923     Register AddrReg = SrcOp.getReg();
3924 
3925     if (I < DimIdx) {
3926       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3927       PackedAddrs.push_back(AddrReg);
3928     } else {
3929       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3930       // derivatives dx/dh and dx/dv are packed with undef.
3931       if (((I + 1) >= EndIdx) ||
3932           ((NumGradients / 2) % 2 == 1 &&
3933            (I == DimIdx + (NumGradients / 2) - 1 ||
3934             I == DimIdx + NumGradients - 1)) ||
3935           // Check for _L to _LZ optimization
3936           !MI.getOperand(I + 1).isReg()) {
3937         PackedAddrs.push_back(
3938             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3939                 .getReg(0));
3940       } else {
3941         PackedAddrs.push_back(
3942             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3943                 .getReg(0));
3944         ++I;
3945       }
3946     }
3947   }
3948 }
3949 
3950 /// Convert from separate vaddr components to a single vector address register,
3951 /// and replace the remaining operands with $noreg.
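/// For example (informal): three s32 vaddr operands a, b, c become a single
/// <3 x s32> build_vector in the first vaddr slot, and the two remaining
/// operands are set to $noreg; 5..7 address registers are first padded with
/// undef up to 8 so that a power-of-two register class can be used.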
3952 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3953                                      int DimIdx, int NumVAddrs) {
3954   const LLT S32 = LLT::scalar(32);
3955 
3956   SmallVector<Register, 8> AddrRegs;
3957   for (int I = 0; I != NumVAddrs; ++I) {
3958     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3959     if (SrcOp.isReg()) {
3960       AddrRegs.push_back(SrcOp.getReg());
3961       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3962     }
3963   }
3964 
3965   int NumAddrRegs = AddrRegs.size();
3966   if (NumAddrRegs != 1) {
3967     // Round up to 8 elements for v5-v7
3968     // FIXME: Missing intermediate sized register classes and instructions.
3969     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3970       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3971       auto Undef = B.buildUndef(S32);
3972       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3973       NumAddrRegs = RoundedNumRegs;
3974     }
3975 
3976     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3977     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3978   }
3979 
3980   for (int I = 1; I != NumVAddrs; ++I) {
3981     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3982     if (SrcOp.isReg())
3983       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3984   }
3985 }
3986 
3987 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3988 ///
3989 /// Depending on the subtarget, load/store with 16-bit element data need to be
3990 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3991 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3992 /// registers.
3993 ///
3994 /// We don't want to directly select image instructions just yet, but also want
3995 /// to expose all register repacking to the legalizer/combiners. We also don't
3996 /// want a selected instruction entering RegBankSelect. In order to avoid
3997 /// defining a multitude of intermediate image instructions, directly hack on
3998 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3999 /// now unnecessary arguments with $noreg.
4000 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4001     MachineInstr &MI, MachineIRBuilder &B,
4002     GISelChangeObserver &Observer,
4003     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
4004 
4005   const int NumDefs = MI.getNumExplicitDefs();
4006   bool IsTFE = NumDefs == 2;
4007   // We are only processing the operands of d16 image operations on subtargets
4008   // that use the unpacked register layout, or need to repack the TFE result.
4009 
4010   // TODO: Do we need to guard against already legalized intrinsics?
4011   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4012     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
4013 
4014   MachineRegisterInfo *MRI = B.getMRI();
4015   const LLT S32 = LLT::scalar(32);
4016   const LLT S16 = LLT::scalar(16);
4017   const LLT V2S16 = LLT::vector(2, 16);
4018 
4019   // Index of first address argument
4020   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4021 
4022   int NumVAddrs, NumGradients;
4023   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4024   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4025     getDMaskIdx(BaseOpcode, NumDefs);
4026   unsigned DMask = 0;
4027 
4028   // Check for 16 bit addresses and pack if true.
4029   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4030   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4031   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
4032   const bool IsG16 = GradTy == S16;
4033   const bool IsA16 = AddrTy == S16;
4034 
4035   int DMaskLanes = 0;
4036   if (!BaseOpcode->Atomic) {
4037     DMask = MI.getOperand(DMaskIdx).getImm();
4038     if (BaseOpcode->Gather4) {
4039       DMaskLanes = 4;
4040     } else if (DMask != 0) {
4041       DMaskLanes = countPopulation(DMask);
4042     } else if (!IsTFE && !BaseOpcode->Store) {
4043       // If dmask is 0, this is a no-op load. This can be eliminated.
4044       B.buildUndef(MI.getOperand(0));
4045       MI.eraseFromParent();
4046       return true;
4047     }
4048   }
4049 
4050   Observer.changingInstr(MI);
4051   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4052 
4053   unsigned NewOpcode = NumDefs == 0 ?
4054     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4055 
4056   // Track that we legalized this
4057   MI.setDesc(B.getTII().get(NewOpcode));
4058 
4059   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4060   // to be at least 1, otherwise the instruction will fail.
4061   if (IsTFE && DMask == 0) {
4062     DMask = 0x1;
4063     DMaskLanes = 1;
4064     MI.getOperand(DMaskIdx).setImm(DMask);
4065   }
4066 
4067   if (BaseOpcode->Atomic) {
4068     Register VData0 = MI.getOperand(2).getReg();
4069     LLT Ty = MRI->getType(VData0);
4070 
4071     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4072     if (Ty.isVector())
4073       return false;
4074 
4075     if (BaseOpcode->AtomicX2) {
4076       Register VData1 = MI.getOperand(3).getReg();
4077       // The two values are packed in one register.
4078       LLT PackedTy = LLT::vector(2, Ty);
4079       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4080       MI.getOperand(2).setReg(Concat.getReg(0));
4081       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4082     }
4083   }
4084 
4085   int CorrectedNumVAddrs = NumVAddrs;
4086 
4087   // Optimize _L to _LZ when _L is zero
4088   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4089         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4090     const ConstantFP *ConstantLod;
4091     const int LodIdx = AddrIdx + NumVAddrs - 1;
4092 
4093     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4094       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4095         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4096         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4097           LZMappingInfo->LZ, ImageDimIntr->Dim);
4098 
4099         // The starting indexes should remain in the same place.
4100         --NumVAddrs;
4101         --CorrectedNumVAddrs;
4102 
4103         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4104           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4105         MI.RemoveOperand(LodIdx);
4106       }
4107     }
4108   }
4109 
4110   // Optimize _mip away, when 'lod' is zero
4111   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4112     int64_t ConstantLod;
4113     const int LodIdx = AddrIdx + NumVAddrs - 1;
4114 
4115     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4116       if (ConstantLod == 0) {
4117         // TODO: Change intrinsic opcode and remove operand instead of replacing
4118         // it with 0, as the _L to _LZ handling is done above.
4119         MI.getOperand(LodIdx).ChangeToImmediate(0);
4120         --CorrectedNumVAddrs;
4121       }
4122     }
4123   }
4124 
4125   // Rewrite the addressing register layout before doing anything else.
4126   if (IsA16 || IsG16) {
4127     if (IsA16) {
4128       // Target must support the feature and gradients need to be 16 bit too
4129       if (!ST.hasA16() || !IsG16)
4130         return false;
4131     } else if (!ST.hasG16())
4132       return false;
4133 
4134     if (NumVAddrs > 1) {
4135       SmallVector<Register, 4> PackedRegs;
4136       // Don't compress addresses for G16
4137       const int PackEndIdx =
4138           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4139       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4140                                   PackEndIdx, NumGradients);
4141 
4142       if (!IsA16) {
4143         // Add uncompressed address
4144         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4145           Register AddrReg = MI.getOperand(I).getReg();
4146           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4147           PackedRegs.push_back(AddrReg);
4148         }
4149       }
4150 
4151       // See also below in the non-a16 branch
4152       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4153 
4154       if (!UseNSA && PackedRegs.size() > 1) {
4155         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4156         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4157         PackedRegs[0] = Concat.getReg(0);
4158         PackedRegs.resize(1);
4159       }
4160 
4161       const int NumPacked = PackedRegs.size();
4162       for (int I = 0; I != NumVAddrs; ++I) {
4163         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4164         if (!SrcOp.isReg()) {
4165           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4166           continue;
4167         }
4168 
4169         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4170 
4171         if (I < NumPacked)
4172           SrcOp.setReg(PackedRegs[I]);
4173         else
4174           SrcOp.setReg(AMDGPU::NoRegister);
4175       }
4176     }
4177   } else {
4178     // If the register allocator cannot place the address registers contiguously
4179     // without introducing moves, then using the non-sequential address encoding
4180     // is always preferable, since it saves VALU instructions and is usually a
4181     // wash in terms of code size or even better.
4182     //
4183     // However, we currently have no way of hinting to the register allocator
4184     // that MIMG addresses should be placed contiguously when it is possible to
4185     // do so, so force non-NSA for the common 2-address case as a heuristic.
4186     //
4187     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4188     // allocation when possible.
4189     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4190 
4191     if (!UseNSA && NumVAddrs > 1)
4192       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4193   }
4194 
4195   int Flags = 0;
4196   if (IsA16)
4197     Flags |= 1;
4198   if (IsG16)
4199     Flags |= 2;
4200   MI.addOperand(MachineOperand::CreateImm(Flags));
4201 
4202   if (BaseOpcode->Store) { // No TFE for stores?
4203     // TODO: Handle dmask trim
4204     Register VData = MI.getOperand(1).getReg();
4205     LLT Ty = MRI->getType(VData);
4206     if (!Ty.isVector() || Ty.getElementType() != S16)
4207       return true;
4208 
4209     Register RepackedReg = handleD16VData(B, *MRI, VData);
4210     if (RepackedReg != VData) {
4211       MI.getOperand(1).setReg(RepackedReg);
4212     }
4213 
4214     return true;
4215   }
4216 
4217   Register DstReg = MI.getOperand(0).getReg();
4218   LLT Ty = MRI->getType(DstReg);
4219   const LLT EltTy = Ty.getScalarType();
4220   const bool IsD16 = Ty.getScalarType() == S16;
4221   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4222 
4223   // Confirm that the return type is large enough for the dmask specified
4224   if (NumElts < DMaskLanes)
4225     return false;
4226 
4227   if (NumElts > 4 || DMaskLanes > 4)
4228     return false;
4229 
4230   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4231   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4232 
4233   // The raw dword aligned data component of the load. The only legal cases
4234   // where this matters should be when using the packed D16 format, for
4235   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4236   LLT RoundedTy;
4237 
4238   // S32 vector to cover all data, plus TFE result element.
4239   LLT TFETy;
4240 
4241   // Register type to use for each loaded component. Will be S32 or V2S16.
4242   LLT RegTy;
4243 
4244   if (IsD16 && ST.hasUnpackedD16VMem()) {
4245     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4246     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4247     RegTy = S32;
4248   } else {
4249     unsigned EltSize = EltTy.getSizeInBits();
4250     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4251     unsigned RoundedSize = 32 * RoundedElts;
4252     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4253     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4254     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4255   }
4256 
4257   // The return type does not need adjustment.
4258   // TODO: Should we change s16 case to s32 or <2 x s16>?
4259   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4260     return true;
4261 
4262   Register Dst1Reg;
4263 
4264   // Insert after the instruction.
4265   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4266 
4267   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4268   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4269   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4270   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4271 
4272   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4273 
4274   MI.getOperand(0).setReg(NewResultReg);
4275 
4276   // In the IR, TFE is supposed to be used with a 2 element struct return
4277   // type. The instruction really returns these two values in one contiguous
4278   // register, with one additional dword beyond the loaded data. Rewrite the
4279   // return type to use a single register result.
4280 
4281   if (IsTFE) {
4282     Dst1Reg = MI.getOperand(1).getReg();
4283     if (MRI->getType(Dst1Reg) != S32)
4284       return false;
4285 
4286     // TODO: Make sure the TFE operand bit is set.
4287     MI.RemoveOperand(1);
4288 
4289     // Handle the easy case that requires no repack instructions.
4290     if (Ty == S32) {
4291       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4292       return true;
4293     }
4294   }
4295 
4296   // Now figure out how to copy the new result register back into the old
4297   // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load into s32 elements. We would only
    // need one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

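  // If the result type is one we prefer to handle as an equivalent 32-bit
  // based type (see shouldBitcastLoadStoreType), rewrite the destination so
  // the load itself produces that type; bitcastDst recovers the original
  // type with a G_BITCAST inserted after the load.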
  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
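  // E.g. an s96 result is widened to s128, and a <3 x s32> result to
  // <4 x s32>.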
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If the trap handler ABI is not HSA or the trap handler is disabled, just
  // insert an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as input, and insert a trap
    // instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    MachineRegisterInfo &MRI = *B.getMRI();

    Register LiveIn = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;

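    // The HSA trap handler ABI (see the reference above) expects the queue
    // pointer in SGPR0/SGPR1, so copy it there and mark the pair as an
    // implicit use of the trap instruction.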
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If the trap handler ABI is not HSA or the trap handler is disabled, emit
  // a warning instead of a trap.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert a debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // For the control flow intrinsics, replace the G_BRCOND using their result
  // with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

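      // verifyCFIntrinsic reported a negated condition (the G_BRCOND tests
      // the inverted value), so compensate by swapping the branch targets.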
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}