1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
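// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.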
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits to the next power of two bits
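// e.g. s24 -> s32, s65 -> s128.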
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd-sized vector which should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> ->
64 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
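// Break the vector down into roughly 64-bit pieces by reducing the element
// count, e.g. <4 x s32> (128 bits) becomes <2 x s32>.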
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
114 // Increase the number of vector elements so the total size reaches the next
115 // multiple of 32 bits.
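// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).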
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
129   };
130 }
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
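  // Anything larger becomes a vector of 32-bit pieces,
  // e.g. <6 x s16> -> <3 x s32> and <8 x s16> -> <4 x s32>.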
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
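// Vectors whose elements occupy whole 32-bit registers (32, 64, 128 or 256-bit
// elements), or packed pairs of 16-bit elements.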
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
208 // Any combination of 32 or 64-bit elements up to the maximum register size, and
209 // multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
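// Whether this is a vector whose elements are either s16 or at least 32 bits
// wide, i.e. element types that remain reasonable when the vector is scalarized.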
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
226 // If we have a truncating store or an extending load with a data size larger
227 // than 32 bits, we need to reduce to a 32-bit type.
228 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
229   return [=](const LegalityQuery &Query) {
230     const LLT Ty = Query.Types[TypeIdx];
231     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
232            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
233   };
234 }
235 
236 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
237 // handle some operations by just promoting the register during
238 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
239 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
240                                     bool IsLoad) {
241   switch (AS) {
242   case AMDGPUAS::PRIVATE_ADDRESS:
243     // FIXME: Private element size.
244     return ST.enableFlatScratch() ? 128 : 32;
245   case AMDGPUAS::LOCAL_ADDRESS:
246     return ST.useDS128() ? 128 : 64;
247   case AMDGPUAS::GLOBAL_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS:
249   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
250     // Treat constant and global as identical. SMRD loads are sometimes usable for
251     // global loads (ideally constant address space should be eliminated)
252     // depending on the context. Legality cannot be context dependent, but
253     // RegBankSelect can split the load as necessary depending on the pointer
254     // register bank/uniformity and if the memory is invariant or not written in a
255     // kernel.
256     return IsLoad ? 512 : 128;
257   default:
258     // Flat addresses may contextually need to be split to 32-bit parts if they
259     // may alias scratch depending on the subtarget.
260     return 128;
261   }
262 }
263 
264 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
265                                  const LegalityQuery &Query) {
266   const LLT Ty = Query.Types[0];
267 
268   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
269   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
270 
271   unsigned RegSize = Ty.getSizeInBits();
272   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
273   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
274   unsigned AS = Query.Types[1].getAddressSpace();
275 
276   // All of these need to be custom lowered to cast the pointer operand.
277   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
278     return false;
279 
280   // Do not handle extending vector loads.
281   if (Ty.isVector() && MemSize != RegSize)
282     return false;
283 
284   // TODO: We should be able to widen loads if the alignment is high enough, but
285   // we also need to modify the memory access size.
286 #if 0
287   // Accept widening loads based on alignment.
288   if (IsLoad && MemSize < RegSize)
289     MemSize = std::max(MemSize, AlignBits);
290 #endif
291 
292   // Only 1-byte and 2-byte to 32-bit extloads are valid.
293   if (MemSize != RegSize && RegSize != 32)
294     return false;
295 
296   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
297     return false;
298 
299   switch (MemSize) {
300   case 8:
301   case 16:
302   case 32:
303   case 64:
304   case 128:
305     break;
306   case 96:
307     if (!ST.hasDwordx3LoadStores())
308       return false;
309     break;
310   case 256:
311   case 512:
312     // These may contextually need to be broken down.
313     break;
314   default:
315     return false;
316   }
317 
318   assert(RegSize >= MemSize);
319 
320   if (AlignBits < MemSize) {
321     const SITargetLowering *TLI = ST.getTargetLowering();
322     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
323                                                  Align(AlignBits / 8)))
324       return false;
325   }
326 
327   return true;
328 }
329 
330 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
331 // work around this. Eventually it should ignore the type for loads and only
332 // care about the size. Return true in cases where we will work around this
333 // for now by bitcasting.
334 static bool loadStoreBitcastWorkaround(const LLT Ty) {
335   if (EnableNewLegality)
336     return false;
337 
338   const unsigned Size = Ty.getSizeInBits();
339   if (Size <= 64)
340     return false;
341   if (!Ty.isVector())
342     return true;
343 
344   LLT EltTy = Ty.getElementType();
345   if (EltTy.isPointer())
346     return true;
347 
348   unsigned EltSize = EltTy.getSizeInBits();
349   return EltSize != 32 && EltSize != 64;
350 }
351 
352 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
353   const LLT Ty = Query.Types[0];
354   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
355          !loadStoreBitcastWorkaround(Ty);
356 }
357 
358 /// Return true if a load or store of the type should be lowered with a bitcast
359 /// to a different type.
360 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
361                                        const unsigned MemSizeInBits) {
362   const unsigned Size = Ty.getSizeInBits();
363   if (Size != MemSizeInBits)
364     return Size <= 32 && Ty.isVector();
365 
366   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
367     return true;
368   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
369          !isRegisterVectorElementType(Ty.getElementType());
370 }
371 
372 /// Return true if we should legalize a load by widening an odd-sized memory
373 /// access up to the alignment. Note that in this case it is the memory access
374 /// itself that changes, not the size of the result register.
375 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
376                             unsigned AlignInBits, unsigned AddrSpace,
377                             unsigned Opcode) {
378   // We don't want to widen cases that are naturally legal.
379   if (isPowerOf2_32(SizeInBits))
380     return false;
381 
382   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
383   // end up widening these for a scalar load during RegBankSelect, since there
384   // aren't 96-bit scalar loads.
385   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
386     return false;
387 
388   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
389     return false;
390 
391   // A load is known dereferenceable up to the alignment, so it's legal to widen
392   // to it.
393   //
394   // TODO: Could check dereferenceable for less aligned cases.
395   unsigned RoundedSize = NextPowerOf2(SizeInBits);
396   if (AlignInBits < RoundedSize)
397     return false;
398 
399   // Do not widen if it would introduce a slow unaligned load.
400   const SITargetLowering *TLI = ST.getTargetLowering();
401   bool Fast = false;
402   return TLI->allowsMisalignedMemoryAccessesImpl(
403              RoundedSize, AddrSpace, Align(AlignInBits / 8),
404              MachineMemOperand::MOLoad, &Fast) &&
405          Fast;
406 }
407 
408 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
409                             unsigned Opcode) {
410   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
411     return false;
412 
413   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
414                          Query.MMODescrs[0].AlignInBits,
415                          Query.Types[1].getAddressSpace(), Opcode);
416 }
417 
418 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
419                                          const GCNTargetMachine &TM)
420   : ST(ST_) {
421   using namespace TargetOpcode;
422 
423   auto GetAddrSpacePtr = [&TM](unsigned AS) {
424     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
425   };
426 
427   const LLT S1 = LLT::scalar(1);
428   const LLT S8 = LLT::scalar(8);
429   const LLT S16 = LLT::scalar(16);
430   const LLT S32 = LLT::scalar(32);
431   const LLT S64 = LLT::scalar(64);
432   const LLT S128 = LLT::scalar(128);
433   const LLT S256 = LLT::scalar(256);
434   const LLT S512 = LLT::scalar(512);
435   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
436 
437   const LLT V2S8 = LLT::fixed_vector(2, 8);
438   const LLT V2S16 = LLT::fixed_vector(2, 16);
439   const LLT V4S16 = LLT::fixed_vector(4, 16);
440 
441   const LLT V2S32 = LLT::fixed_vector(2, 32);
442   const LLT V3S32 = LLT::fixed_vector(3, 32);
443   const LLT V4S32 = LLT::fixed_vector(4, 32);
444   const LLT V5S32 = LLT::fixed_vector(5, 32);
445   const LLT V6S32 = LLT::fixed_vector(6, 32);
446   const LLT V7S32 = LLT::fixed_vector(7, 32);
447   const LLT V8S32 = LLT::fixed_vector(8, 32);
448   const LLT V9S32 = LLT::fixed_vector(9, 32);
449   const LLT V10S32 = LLT::fixed_vector(10, 32);
450   const LLT V11S32 = LLT::fixed_vector(11, 32);
451   const LLT V12S32 = LLT::fixed_vector(12, 32);
452   const LLT V13S32 = LLT::fixed_vector(13, 32);
453   const LLT V14S32 = LLT::fixed_vector(14, 32);
454   const LLT V15S32 = LLT::fixed_vector(15, 32);
455   const LLT V16S32 = LLT::fixed_vector(16, 32);
456   const LLT V32S32 = LLT::fixed_vector(32, 32);
457 
458   const LLT V2S64 = LLT::fixed_vector(2, 64);
459   const LLT V3S64 = LLT::fixed_vector(3, 64);
460   const LLT V4S64 = LLT::fixed_vector(4, 64);
461   const LLT V5S64 = LLT::fixed_vector(5, 64);
462   const LLT V6S64 = LLT::fixed_vector(6, 64);
463   const LLT V7S64 = LLT::fixed_vector(7, 64);
464   const LLT V8S64 = LLT::fixed_vector(8, 64);
465   const LLT V16S64 = LLT::fixed_vector(16, 64);
466 
467   std::initializer_list<LLT> AllS32Vectors =
468     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
469      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
470   std::initializer_list<LLT> AllS64Vectors =
471     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
472 
473   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
474   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
475   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
476   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
477   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
478   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
479   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
480 
481   const LLT CodePtr = FlatPtr;
482 
483   const std::initializer_list<LLT> AddrSpaces64 = {
484     GlobalPtr, ConstantPtr, FlatPtr
485   };
486 
487   const std::initializer_list<LLT> AddrSpaces32 = {
488     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
489   };
490 
491   const std::initializer_list<LLT> FPTypesBase = {
492     S32, S64
493   };
494 
495   const std::initializer_list<LLT> FPTypes16 = {
496     S32, S64, S16
497   };
498 
499   const std::initializer_list<LLT> FPTypesPK16 = {
500     S32, S64, S16, V2S16
501   };
502 
503   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
504 
505   // s1 for VCC branches, s32 for SCC branches.
506   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
507 
508   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
509   // elements for v3s16
510   getActionDefinitionsBuilder(G_PHI)
511     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
512     .legalFor(AllS32Vectors)
513     .legalFor(AllS64Vectors)
514     .legalFor(AddrSpaces64)
515     .legalFor(AddrSpaces32)
516     .legalIf(isPointer(0))
517     .clampScalar(0, S16, S256)
518     .widenScalarToNextPow2(0, 32)
519     .clampMaxNumElements(0, S32, 16)
520     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
521     .scalarize(0);
522 
523   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
524     // Full set of gfx9 features.
525     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
526       .legalFor({S32, S16, V2S16})
527       .clampScalar(0, S16, S32)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32);
531 
532     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
533       .legalFor({S32, S16, V2S16}) // Clamp modifier
534       .minScalarOrElt(0, S16)
535       .clampMaxNumElements(0, S16, 2)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32)
538       .lower();
539   } else if (ST.has16BitInsts()) {
540     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
541       .legalFor({S32, S16})
542       .clampScalar(0, S16, S32)
543       .scalarize(0)
544       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
545 
546     // Technically the saturating operations require clamp bit support, but this
547     // was introduced at the same time as 16-bit operations.
548     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
549       .legalFor({S32, S16}) // Clamp modifier
550       .minScalar(0, S16)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 16)
553       .lower();
554 
555     // We're just lowering this, but it helps get a better result to try to
556     // coerce to the desired type first.
557     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
558       .minScalar(0, S16)
559       .scalarize(0)
560       .lower();
561   } else {
562     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
563       .legalFor({S32})
564       .clampScalar(0, S32, S32)
565       .scalarize(0);
566 
567     if (ST.hasIntClamp()) {
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .legalFor({S32}) // Clamp modifier.
570         .scalarize(0)
571         .minScalarOrElt(0, S32)
572         .lower();
573     } else {
574       // Clamp bit support was added in VI, along with 16-bit operations.
575       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576         .minScalar(0, S32)
577         .scalarize(0)
578         .lower();
579     }
580 
581     // FIXME: DAG expansion gets better results. The widening uses the smaller
582     // range values and goes for the min/max lowering directly.
583     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
584       .minScalar(0, S32)
585       .scalarize(0)
586       .lower();
587   }
588 
589   getActionDefinitionsBuilder(
590       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
591       .customFor({S32, S64})
592       .clampScalar(0, S32, S64)
593       .widenScalarToNextPow2(0, 32)
594       .scalarize(0);
595 
596   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
597                    .legalFor({S32})
598                    .maxScalarOrElt(0, S32);
599 
600   if (ST.hasVOP3PInsts()) {
601     Mulh
602       .clampMaxNumElements(0, S8, 2)
603       .lowerFor({V2S8});
604   }
605 
606   Mulh
607     .scalarize(0)
608     .lower();
609 
610   // Report legal for any types we can handle anywhere. For the cases only legal
611   // on the SALU, RegBankSelect will be able to re-legalize.
612   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
613     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
614     .clampScalar(0, S32, S64)
615     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
616     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
617     .widenScalarToNextPow2(0)
618     .scalarize(0);
619 
620   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
621                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
622     .legalFor({{S32, S1}, {S32, S32}})
623     .minScalar(0, S32)
624     // TODO: .scalarize(0)
625     .lower();
626 
627   getActionDefinitionsBuilder(G_BITCAST)
628     // Don't worry about the size constraint.
629     .legalIf(all(isRegisterType(0), isRegisterType(1)))
630     .lower();
631 
632 
633   getActionDefinitionsBuilder(G_CONSTANT)
634     .legalFor({S1, S32, S64, S16, GlobalPtr,
635                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
636     .legalIf(isPointer(0))
637     .clampScalar(0, S32, S64)
638     .widenScalarToNextPow2(0);
639 
640   getActionDefinitionsBuilder(G_FCONSTANT)
641     .legalFor({S32, S64, S16})
642     .clampScalar(0, S16, S64);
643 
644   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
645       .legalIf(isRegisterType(0))
646       // s1 and s16 are special cases because they have legal operations on
647       // them, but don't really occupy registers in the normal way.
648       .legalFor({S1, S16})
649       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
650       .clampScalarOrElt(0, S32, MaxScalar)
651       .widenScalarToNextPow2(0, 32)
652       .clampMaxNumElements(0, S32, 16);
653 
654   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
655 
656   // If the amount is divergent, we have to do a wave reduction to get the
657   // maximum value, so this is expanded during RegBankSelect.
658   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
659     .legalFor({{PrivatePtr, S32}});
660 
661   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
662     .customIf(typeIsNot(0, PrivatePtr));
663 
664   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
665 
666   auto &FPOpActions = getActionDefinitionsBuilder(
667     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
668     .legalFor({S32, S64});
669   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
670     .customFor({S32, S64});
671   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
672     .customFor({S32, S64});
673 
674   if (ST.has16BitInsts()) {
675     if (ST.hasVOP3PInsts())
676       FPOpActions.legalFor({S16, V2S16});
677     else
678       FPOpActions.legalFor({S16});
679 
680     TrigActions.customFor({S16});
681     FDIVActions.customFor({S16});
682   }
683 
684   auto &MinNumMaxNum = getActionDefinitionsBuilder({
685       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
686 
687   if (ST.hasVOP3PInsts()) {
688     MinNumMaxNum.customFor(FPTypesPK16)
689       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
690       .clampMaxNumElements(0, S16, 2)
691       .clampScalar(0, S16, S64)
692       .scalarize(0);
693   } else if (ST.has16BitInsts()) {
694     MinNumMaxNum.customFor(FPTypes16)
695       .clampScalar(0, S16, S64)
696       .scalarize(0);
697   } else {
698     MinNumMaxNum.customFor(FPTypesBase)
699       .clampScalar(0, S32, S64)
700       .scalarize(0);
701   }
702 
703   if (ST.hasVOP3PInsts())
704     FPOpActions.clampMaxNumElements(0, S16, 2);
705 
706   FPOpActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   TrigActions
711     .scalarize(0)
712     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
713 
714   FDIVActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   getActionDefinitionsBuilder({G_FNEG, G_FABS})
719     .legalFor(FPTypesPK16)
720     .clampMaxNumElements(0, S16, 2)
721     .scalarize(0)
722     .clampScalar(0, S16, S64);
723 
724   if (ST.has16BitInsts()) {
725     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
726       .legalFor({S32, S64, S16})
727       .scalarize(0)
728       .clampScalar(0, S16, S64);
729   } else {
730     getActionDefinitionsBuilder(G_FSQRT)
731       .legalFor({S32, S64})
732       .scalarize(0)
733       .clampScalar(0, S32, S64);
734 
735     if (ST.hasFractBug()) {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .customFor({S64})
738         .legalFor({S32, S64})
739         .scalarize(0)
740         .clampScalar(0, S32, S64);
741     } else {
742       getActionDefinitionsBuilder(G_FFLOOR)
743         .legalFor({S32, S64})
744         .scalarize(0)
745         .clampScalar(0, S32, S64);
746     }
747   }
748 
749   getActionDefinitionsBuilder(G_FPTRUNC)
750     .legalFor({{S32, S64}, {S16, S32}})
751     .scalarize(0)
752     .lower();
753 
754   getActionDefinitionsBuilder(G_FPEXT)
755     .legalFor({{S64, S32}, {S32, S16}})
756     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
757     .scalarize(0);
758 
759   getActionDefinitionsBuilder(G_FSUB)
760       // Use actual fsub instruction
761       .legalFor({S32})
762       // Must use fadd + fneg
763       .lowerFor({S64, S16, V2S16})
764       .scalarize(0)
765       .clampScalar(0, S32, S64);
766 
767   // Whether this is legal depends on the floating point mode for the function.
768   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
769   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
770     FMad.customFor({S32, S16});
771   else if (ST.hasMadMacF32Insts())
772     FMad.customFor({S32});
773   else if (ST.hasMadF16())
774     FMad.customFor({S16});
775   FMad.scalarize(0)
776       .lower();
777 
778   auto &FRem = getActionDefinitionsBuilder(G_FREM);
779   if (ST.has16BitInsts()) {
780     FRem.customFor({S16, S32, S64});
781   } else {
782     FRem.minScalar(0, S32)
783         .customFor({S32, S64});
784   }
785   FRem.scalarize(0);
786 
787   // TODO: Do we need to clamp maximum bitwidth?
788   getActionDefinitionsBuilder(G_TRUNC)
789     .legalIf(isScalar(0))
790     .legalFor({{V2S16, V2S32}})
791     .clampMaxNumElements(0, S16, 2)
792     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
793     // situations (like an invalid implicit use), we don't want to infinite loop
794     // in the legalizer.
795     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
796     .alwaysLegal();
797 
798   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
799     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
800                {S32, S1}, {S64, S1}, {S16, S1}})
801     .scalarize(0)
802     .clampScalar(0, S32, S64)
803     .widenScalarToNextPow2(1, 32);
804 
805   // TODO: Split s1->s64 during regbankselect for VALU.
806   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
807     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
808     .lowerFor({{S32, S64}})
809     .lowerIf(typeIs(1, S1))
810     .customFor({{S64, S64}});
811   if (ST.has16BitInsts())
812     IToFP.legalFor({{S16, S16}});
813   IToFP.clampScalar(1, S32, S64)
814        .minScalar(0, S32)
815        .scalarize(0)
816        .widenScalarToNextPow2(1);
817 
818   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
819     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
820     .customFor({{S64, S32}, {S64, S64}})
821     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
822   if (ST.has16BitInsts())
823     FPToI.legalFor({{S16, S16}});
824   else
825     FPToI.minScalar(1, S32);
826 
827   FPToI.minScalar(0, S32)
828        .widenScalarToNextPow2(0, 32)
829        .scalarize(0)
830        .lower();
831 
832   // Lower roundeven into G_FRINT
833   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
834     .scalarize(0)
835     .lower();
836 
837   if (ST.has16BitInsts()) {
838     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839       .legalFor({S16, S32, S64})
840       .clampScalar(0, S16, S64)
841       .scalarize(0);
842   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
843     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844       .legalFor({S32, S64})
845       .clampScalar(0, S32, S64)
846       .scalarize(0);
847   } else {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S32})
850       .customFor({S64})
851       .clampScalar(0, S32, S64)
852       .scalarize(0);
853   }
854 
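  // The offset operand of G_PTR_ADD must be the same width as the pointer.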
855   getActionDefinitionsBuilder(G_PTR_ADD)
856     .legalIf(all(isPointer(0), sameSize(0, 1)))
857     .scalarize(0)
858     .scalarSameSizeAs(1, 0);
859 
860   getActionDefinitionsBuilder(G_PTRMASK)
861     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
862     .scalarSameSizeAs(1, 0)
863     .scalarize(0);
864 
865   auto &CmpBuilder =
866     getActionDefinitionsBuilder(G_ICMP)
867     // The compare output type differs based on the register bank of the output,
868     // so make both s1 and s32 legal.
869     //
870     // Scalar compares producing output in scc will be promoted to s32, as that
871     // is the allocatable register type that will be needed for the copy from
872     // scc. This will be promoted during RegBankSelect, and we assume something
873     // before that won't try to use s32 result types.
874     //
875     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
876     // bank.
877     .legalForCartesianProduct(
878       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
879     .legalForCartesianProduct(
880       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
881   if (ST.has16BitInsts()) {
882     CmpBuilder.legalFor({{S1, S16}});
883   }
884 
885   CmpBuilder
886     .widenScalarToNextPow2(1)
887     .clampScalar(1, S32, S64)
888     .scalarize(0)
889     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
890 
891   getActionDefinitionsBuilder(G_FCMP)
892     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
893     .widenScalarToNextPow2(1)
894     .clampScalar(1, S32, S64)
895     .scalarize(0);
896 
897   // FIXME: fpow has a selection pattern that should move to custom lowering.
898   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
899   if (ST.has16BitInsts())
900     Exp2Ops.legalFor({S32, S16});
901   else
902     Exp2Ops.legalFor({S32});
903   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
904   Exp2Ops.scalarize(0);
905 
906   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
907   if (ST.has16BitInsts())
908     ExpOps.customFor({{S32}, {S16}});
909   else
910     ExpOps.customFor({S32});
911   ExpOps.clampScalar(0, MinScalarFPTy, S32)
912         .scalarize(0);
913 
914   getActionDefinitionsBuilder(G_FPOWI)
915     .clampScalar(0, MinScalarFPTy, S32)
916     .lower();
917 
918   // The 64-bit versions produce 32-bit results, but only on the SALU.
919   getActionDefinitionsBuilder(G_CTPOP)
920     .legalFor({{S32, S32}, {S32, S64}})
921     .clampScalar(0, S32, S32)
922     .clampScalar(1, S32, S64)
923     .scalarize(0)
924     .widenScalarToNextPow2(0, 32)
925     .widenScalarToNextPow2(1, 32);
926 
927   // The hardware instructions return a different result on 0 than the generic
928   // instructions expect. The hardware produces -1, but these produce the
929   // bitwidth.
930   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
931     .scalarize(0)
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .widenScalarToNextPow2(0, 32)
935     .widenScalarToNextPow2(1, 32)
936     .lower();
937 
938   // The 64-bit versions produce 32-bit results, but only on the SALU.
939   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
940     .legalFor({{S32, S32}, {S32, S64}})
941     .clampScalar(0, S32, S32)
942     .clampScalar(1, S32, S64)
943     .scalarize(0)
944     .widenScalarToNextPow2(0, 32)
945     .widenScalarToNextPow2(1, 32);
946 
947   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
948   // RegBankSelect.
949   getActionDefinitionsBuilder(G_BITREVERSE)
950     .legalFor({S32, S64})
951     .clampScalar(0, S32, S64)
952     .scalarize(0)
953     .widenScalarToNextPow2(0);
954 
955   if (ST.has16BitInsts()) {
956     getActionDefinitionsBuilder(G_BSWAP)
957       .legalFor({S16, S32, V2S16})
958       .clampMaxNumElements(0, S16, 2)
959       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
960       // narrowScalar limitation.
961       .widenScalarToNextPow2(0)
962       .clampScalar(0, S16, S32)
963       .scalarize(0);
964 
965     if (ST.hasVOP3PInsts()) {
966       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
967         .legalFor({S32, S16, V2S16})
968         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
969         .clampMaxNumElements(0, S16, 2)
970         .minScalar(0, S16)
971         .widenScalarToNextPow2(0)
972         .scalarize(0)
973         .lower();
974     } else {
975       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
976         .legalFor({S32, S16})
977         .widenScalarToNextPow2(0)
978         .minScalar(0, S16)
979         .scalarize(0)
980         .lower();
981     }
982   } else {
983     // TODO: Should have same legality without v_perm_b32
984     getActionDefinitionsBuilder(G_BSWAP)
985       .legalFor({S32})
986       .lowerIf(scalarNarrowerThan(0, 32))
987       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
988       // narrowScalar limitation.
989       .widenScalarToNextPow2(0)
990       .maxScalar(0, S32)
991       .scalarize(0)
992       .lower();
993 
994     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
995       .legalFor({S32})
996       .minScalar(0, S32)
997       .widenScalarToNextPow2(0)
998       .scalarize(0)
999       .lower();
1000   }
1001 
1002   getActionDefinitionsBuilder(G_INTTOPTR)
1003     // List the common cases
1004     .legalForCartesianProduct(AddrSpaces64, {S64})
1005     .legalForCartesianProduct(AddrSpaces32, {S32})
1006     .scalarize(0)
1007     // Accept any address space as long as the size matches
1008     .legalIf(sameSize(0, 1))
1009     .widenScalarIf(smallerThan(1, 0),
1010       [](const LegalityQuery &Query) {
1011         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1012       })
1013     .narrowScalarIf(largerThan(1, 0),
1014       [](const LegalityQuery &Query) {
1015         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1016       });
1017 
1018   getActionDefinitionsBuilder(G_PTRTOINT)
1019     // List the common cases
1020     .legalForCartesianProduct(AddrSpaces64, {S64})
1021     .legalForCartesianProduct(AddrSpaces32, {S32})
1022     .scalarize(0)
1023     // Accept any address space as long as the size matches
1024     .legalIf(sameSize(0, 1))
1025     .widenScalarIf(smallerThan(0, 1),
1026       [](const LegalityQuery &Query) {
1027         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1028       })
1029     .narrowScalarIf(
1030       largerThan(0, 1),
1031       [](const LegalityQuery &Query) {
1032         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1033       });
1034 
1035   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1036     .scalarize(0)
1037     .custom();
1038 
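  // Roughly: a load or store must be split into multiple memory operations if
  // it is wider than the address space allows, is an awkward size (e.g. three
  // dwords on subtargets without dwordx3 load/store), or is not sufficiently
  // aligned.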
1039   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1040                                     bool IsLoad) -> bool {
1041     const LLT DstTy = Query.Types[0];
1042 
1043     // Split vector extloads.
1044     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1045     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1046 
1047     if (MemSize < DstTy.getSizeInBits())
1048       MemSize = std::max(MemSize, AlignBits);
1049 
1050     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1051       return true;
1052 
1053     const LLT PtrTy = Query.Types[1];
1054     unsigned AS = PtrTy.getAddressSpace();
1055     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1056       return true;
1057 
1058     // Catch weird sized loads that don't evenly divide into the access sizes
1059     // TODO: May be able to widen depending on alignment etc.
1060     unsigned NumRegs = (MemSize + 31) / 32;
1061     if (NumRegs == 3) {
1062       if (!ST.hasDwordx3LoadStores())
1063         return true;
1064     } else {
1065       // If the alignment allows, these should have been widened.
1066       if (!isPowerOf2_32(NumRegs))
1067         return true;
1068     }
1069 
1070     if (AlignBits < MemSize) {
1071       const SITargetLowering *TLI = ST.getTargetLowering();
1072       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1073                                                       Align(AlignBits / 8));
1074     }
1075 
1076     return false;
1077   };
1078 
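  // Minimum alignment (in bits) required by the global/constant rules below;
  // 0 effectively drops the alignment requirement when unaligned buffer access
  // is enabled.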
1079   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084   // LDS
1085   // TODO: Unsupported flat for SI.
1086 
1087   for (unsigned Op : {G_LOAD, G_STORE}) {
1088     const bool IsStore = Op == G_STORE;
1089 
1090     auto &Actions = getActionDefinitionsBuilder(Op);
1091     // Explicitly list some common cases.
1092     // TODO: Does this help compile time at all?
1093     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1094                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1095                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1096                                       {S64, GlobalPtr, 64, GlobalAlign32},
1097                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1098                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1099                                       {S32, GlobalPtr, 8, GlobalAlign8},
1100                                       {S32, GlobalPtr, 16, GlobalAlign16},
1101 
1102                                       {S32, LocalPtr, 32, 32},
1103                                       {S64, LocalPtr, 64, 32},
1104                                       {V2S32, LocalPtr, 64, 32},
1105                                       {S32, LocalPtr, 8, 8},
1106                                       {S32, LocalPtr, 16, 16},
1107                                       {V2S16, LocalPtr, 32, 32},
1108 
1109                                       {S32, PrivatePtr, 32, 32},
1110                                       {S32, PrivatePtr, 8, 8},
1111                                       {S32, PrivatePtr, 16, 16},
1112                                       {V2S16, PrivatePtr, 32, 32},
1113 
1114                                       {S32, ConstantPtr, 32, GlobalAlign32},
1115                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1116                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1117                                       {S64, ConstantPtr, 64, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1119     Actions.legalIf(
1120       [=](const LegalityQuery &Query) -> bool {
1121         return isLoadStoreLegal(ST, Query);
1122       });
1123 
1124     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125     // 64-bits.
1126     //
1127     // TODO: Should generalize bitcast action into coerce, which will also cover
1128     // inserting addrspacecasts.
1129     Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131     // Turn any illegal element vectors into something easier to deal
1132     // with. These will ultimately produce 32-bit scalar shifts to extract the
1133     // parts anyway.
1134     //
1135     // For odd 16-bit element vectors, prefer to split those into pieces with
1136     // 16-bit vector parts.
1137     Actions.bitcastIf(
1138       [=](const LegalityQuery &Query) -> bool {
1139         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140                                           Query.MMODescrs[0].SizeInBits);
1141       }, bitcastToRegisterType(0));
1142 
1143     if (!IsStore) {
1144       // Widen suitably aligned loads by loading extra bytes. The standard
1145       // legalization actions can't properly express widening memory operands.
1146       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147         return shouldWidenLoad(ST, Query, G_LOAD);
1148       });
1149     }
1150 
1151     // FIXME: load/store narrowing should be moved to lower action
1152     Actions
1153         .narrowScalarIf(
1154             [=](const LegalityQuery &Query) -> bool {
1155               return !Query.Types[0].isVector() &&
1156                      needToSplitMemOp(Query, Op == G_LOAD);
1157             },
1158             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159               const LLT DstTy = Query.Types[0];
1160               const LLT PtrTy = Query.Types[1];
1161 
1162               const unsigned DstSize = DstTy.getSizeInBits();
1163               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1164 
1165               // Split extloads.
1166               if (DstSize > MemSize)
1167                 return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169               if (!isPowerOf2_32(DstSize)) {
1170                 // We're probably decomposing an odd sized store. Try to split
1171                 // to the widest type. TODO: Account for alignment. As-is it
1172                 // should be OK, since the new parts will be further legalized.
1173                 unsigned FloorSize = PowerOf2Floor(DstSize);
1174                 return std::make_pair(0, LLT::scalar(FloorSize));
1175               }
1176 
1177               if (DstSize > 32 && (DstSize % 32 != 0)) {
1178                 // FIXME: Need a way to specify non-extload of larger size if
1179                 // suitably aligned.
1180                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1181               }
1182 
1183               unsigned MaxSize = maxSizeForAddrSpace(ST,
1184                                                      PtrTy.getAddressSpace(),
1185                                                      Op == G_LOAD);
1186               if (MemSize > MaxSize)
1187                 return std::make_pair(0, LLT::scalar(MaxSize));
1188 
1189               unsigned Align = Query.MMODescrs[0].AlignInBits;
1190               return std::make_pair(0, LLT::scalar(Align));
1191             })
1192         .fewerElementsIf(
1193             [=](const LegalityQuery &Query) -> bool {
1194               return Query.Types[0].isVector() &&
1195                      needToSplitMemOp(Query, Op == G_LOAD);
1196             },
1197             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1198               const LLT DstTy = Query.Types[0];
1199               const LLT PtrTy = Query.Types[1];
1200 
1201               LLT EltTy = DstTy.getElementType();
1202               unsigned MaxSize = maxSizeForAddrSpace(ST,
1203                                                      PtrTy.getAddressSpace(),
1204                                                      Op == G_LOAD);
1205 
1206               // FIXME: Handle widened to power of 2 results better. This ends
1207               // up scalarizing.
1208               // FIXME: 3 element stores scalarized on SI
1209 
1210               // Split if it's too large for the address space.
1211               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1212                 unsigned NumElts = DstTy.getNumElements();
1213                 unsigned EltSize = EltTy.getSizeInBits();
1214 
1215                 if (MaxSize % EltSize == 0) {
1216                   return std::make_pair(
1217                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1218                 }
1219 
1220                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1221 
1222                 // FIXME: Refine when odd breakdowns handled
1223                 // The scalars will need to be re-legalized.
1224                 if (NumPieces == 1 || NumPieces >= NumElts ||
1225                     NumElts % NumPieces != 0)
1226                   return std::make_pair(0, EltTy);
1227 
1228                 return std::make_pair(
1229                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1230               }
1231 
1232               // FIXME: We could probably handle weird extending loads better.
1233               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1234               if (DstTy.getSizeInBits() > MemSize)
1235                 return std::make_pair(0, EltTy);
1236 
1237               unsigned EltSize = EltTy.getSizeInBits();
1238               unsigned DstSize = DstTy.getSizeInBits();
1239               if (!isPowerOf2_32(DstSize)) {
1240                 // We're probably decomposing an odd sized store. Try to split
1241                 // to the widest type. TODO: Account for alignment. As-is it
1242                 // should be OK, since the new parts will be further legalized.
1243                 unsigned FloorSize = PowerOf2Floor(DstSize);
1244                 return std::make_pair(
1245                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1246               }
1247 
1248               // Need to split because of alignment.
1249               unsigned Align = Query.MMODescrs[0].AlignInBits;
1250               if (EltSize > Align &&
1251                   (EltSize / Align < DstTy.getNumElements())) {
1252                 return std::make_pair(
1253                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1254               }
1255 
1256               // May need relegalization for the scalars.
1257               return std::make_pair(0, EltTy);
1258             })
1259     .lowerIfMemSizeNotPow2()
1260     .minScalar(0, S32)
1261     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1262     .widenScalarToNextPow2(0)
1263     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1264     .lower();
1265   }
1266 
1267   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1268                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1269                                                   {S32, GlobalPtr, 16, 2 * 8},
1270                                                   {S32, LocalPtr, 8, 8},
1271                                                   {S32, LocalPtr, 16, 16},
1272                                                   {S32, PrivatePtr, 8, 8},
1273                                                   {S32, PrivatePtr, 16, 16},
1274                                                   {S32, ConstantPtr, 8, 8},
1275                                                   {S32, ConstantPtr, 16, 2 * 8}})
1276                        .legalIf(
1277                          [=](const LegalityQuery &Query) -> bool {
1278                            return isLoadStoreLegal(ST, Query);
1279                          });
1280 
1281   if (ST.hasFlatAddressSpace()) {
1282     ExtLoads.legalForTypesWithMemDesc(
1283         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1284   }
1285 
1286   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1287   // 64-bits.
1288   //
1289   // TODO: Should generalize bitcast action into coerce, which will also cover
1290   // inserting addrspacecasts.
1291   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1292 
1293   ExtLoads.clampScalar(0, S32, S32)
1294           .widenScalarToNextPow2(0)
1295           .unsupportedIfMemSizeNotPow2()
1296           .lower();
1297 
1298   auto &Atomics = getActionDefinitionsBuilder(
1299     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1300      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1301      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1302      G_ATOMICRMW_UMIN})
1303     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1304                {S64, GlobalPtr}, {S64, LocalPtr},
1305                {S32, RegionPtr}, {S64, RegionPtr}});
1306   if (ST.hasFlatAddressSpace()) {
1307     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1308   }
1309 
1310   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1311   if (ST.hasLDSFPAtomics()) {
1312     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1313     if (ST.hasGFX90AInsts())
1314       Atomic.legalFor({{S64, LocalPtr}});
1315   }
1316   if (ST.hasAtomicFaddInsts())
1317     Atomic.legalFor({{S32, GlobalPtr}});
1318 
1319   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1320   // demarshalling.
1321   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1322     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1323                 {S32, FlatPtr}, {S64, FlatPtr}})
1324     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1325                {S32, RegionPtr}, {S64, RegionPtr}});
1326   // TODO: Pointer types, any 32-bit or 64-bit vector
1327 
1328   // Condition should be s32 for scalar, s1 for vector.
1329   getActionDefinitionsBuilder(G_SELECT)
1330       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1331                                  LocalPtr, FlatPtr, PrivatePtr,
1332                                  LLT::fixed_vector(2, LocalPtr),
1333                                  LLT::fixed_vector(2, PrivatePtr)},
1334                                 {S1, S32})
1335       .clampScalar(0, S16, S64)
1336       .scalarize(1)
1337       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339       .clampMaxNumElements(0, S32, 2)
1340       .clampMaxNumElements(0, LocalPtr, 2)
1341       .clampMaxNumElements(0, PrivatePtr, 2)
1342       .scalarize(0)
1343       .widenScalarToNextPow2(0)
1344       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 
1346   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347   // be more flexible with the shift amount type.
1348   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349     .legalFor({{S32, S32}, {S64, S32}});
1350   if (ST.has16BitInsts()) {
1351     if (ST.hasVOP3PInsts()) {
1352       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353             .clampMaxNumElements(0, S16, 2);
1354     } else
1355       Shifts.legalFor({{S16, S16}});
1356 
1357     // TODO: Support 16-bit shift amounts for all types
1358     Shifts.widenScalarIf(
1359       [=](const LegalityQuery &Query) {
1360         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361         // 32-bit amount.
1362         const LLT ValTy = Query.Types[0];
1363         const LLT AmountTy = Query.Types[1];
1364         return ValTy.getSizeInBits() <= 16 &&
1365                AmountTy.getSizeInBits() < 16;
1366       }, changeTo(1, S16));
1367     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368     Shifts.clampScalar(1, S32, S32);
1369     Shifts.clampScalar(0, S16, S64);
1370     Shifts.widenScalarToNextPow2(0, 16);
1371 
1372     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373       .minScalar(0, S16)
1374       .scalarize(0)
1375       .lower();
1376   } else {
1377     // Make sure we legalize the shift amount type first, as the general
1378     // expansion for the shifted type will produce much worse code if it hasn't
1379     // been truncated already.
1380     Shifts.clampScalar(1, S32, S32);
1381     Shifts.clampScalar(0, S32, S64);
1382     Shifts.widenScalarToNextPow2(0, 32);
1383 
1384     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385       .minScalar(0, S32)
1386       .scalarize(0)
1387       .lower();
1388   }
1389   Shifts.scalarize(0);
1390 
1391   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
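    // The vector is operand 1 of G_EXTRACT_VECTOR_ELT but operand 0 of
    // G_INSERT_VECTOR_ELT, so the vector/element type indices swap between them.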
1392     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394     unsigned IdxTypeIdx = 2;
1395 
1396     getActionDefinitionsBuilder(Op)
1397       .customIf([=](const LegalityQuery &Query) {
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const LLT IdxTy = Query.Types[IdxTypeIdx];
1401           const unsigned EltSize = EltTy.getSizeInBits();
1402           return (EltSize == 32 || EltSize == 64) &&
1403                   VecTy.getSizeInBits() % 32 == 0 &&
1404                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1405                   IdxTy.getSizeInBits() == 32;
1406         })
1407       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408                  bitcastToVectorElement32(VecTypeIdx))
1409       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410       .bitcastIf(
1411         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412         [=](const LegalityQuery &Query) {
1413           // For > 64-bit element types, try to turn this into a 64-bit
1414           // element vector since we may be able to do better indexing
1415           // if this is scalar. If not, fall back to 32.
1416           const LLT EltTy = Query.Types[EltTypeIdx];
1417           const LLT VecTy = Query.Types[VecTypeIdx];
1418           const unsigned DstEltSize = EltTy.getSizeInBits();
1419           const unsigned VecSize = VecTy.getSizeInBits();
1420 
1421           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422           return std::make_pair(
1423               VecTypeIdx,
1424               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1425         })
1426       .clampScalar(EltTypeIdx, S32, S64)
1427       .clampScalar(VecTypeIdx, S32, S64)
1428       .clampScalar(IdxTypeIdx, S32, S32)
1429       .clampMaxNumElements(VecTypeIdx, S32, 32)
1430       // TODO: Clamp elements for 64-bit vectors?
1431       // It should only be necessary with variable indexes.
1432       // As a last resort, lower to the stack
1433       .lower();
1434   }
1435 
1436   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1437     .unsupportedIf([=](const LegalityQuery &Query) {
1438         const LLT &EltTy = Query.Types[1].getElementType();
1439         return Query.Types[0] != EltTy;
1440       });
1441 
1442   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1443     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1444     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1445 
1446     // FIXME: Doesn't handle extract of illegal sizes.
1447     getActionDefinitionsBuilder(Op)
1448       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1449       // FIXME: Multiples of 16 should not be legal.
1450       .legalIf([=](const LegalityQuery &Query) {
1451           const LLT BigTy = Query.Types[BigTyIdx];
1452           const LLT LitTy = Query.Types[LitTyIdx];
1453           return (BigTy.getSizeInBits() % 32 == 0) &&
1454                  (LitTy.getSizeInBits() % 16 == 0);
1455         })
1456       .widenScalarIf(
1457         [=](const LegalityQuery &Query) {
1458           const LLT BigTy = Query.Types[BigTyIdx];
1459           return (BigTy.getScalarSizeInBits() < 16);
1460         },
1461         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1462       .widenScalarIf(
1463         [=](const LegalityQuery &Query) {
1464           const LLT LitTy = Query.Types[LitTyIdx];
1465           return (LitTy.getScalarSizeInBits() < 16);
1466         },
1467         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1468       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1469       .widenScalarToNextPow2(BigTyIdx, 32);
1470 
1471   }
1472 
1473   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1474     .legalForCartesianProduct(AllS32Vectors, {S32})
1475     .legalForCartesianProduct(AllS64Vectors, {S64})
1476     .clampNumElements(0, V16S32, V32S32)
1477     .clampNumElements(0, V2S64, V16S64)
1478     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1479 
1480   if (ST.hasScalarPackInsts()) {
1481     BuildVector
1482       // FIXME: Should probably widen s1 vectors straight to s32
1483       .minScalarOrElt(0, S16)
1484       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1485       .minScalar(1, S32);
1486 
1487     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1488       .legalFor({V2S16, S32})
1489       .lower();
1490     BuildVector.minScalarOrElt(0, S32);
1491   } else {
1492     BuildVector.customFor({V2S16, S16});
1493     BuildVector.minScalarOrElt(0, S32);
1494 
1495     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496       .customFor({V2S16, S32})
1497       .lower();
1498   }
1499 
1500   BuildVector.legalIf(isRegisterType(0));
1501 
1502   // FIXME: Clamp maximum size
1503   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1504     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1505     .clampMaxNumElements(0, S32, 32)
1506     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1507     .clampMaxNumElements(0, S16, 64);
1508 
1509   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1510   // pre-legalize.
1511   if (ST.hasVOP3PInsts()) {
1512     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1513       .customFor({V2S16, V2S16})
1514       .lower();
1515   } else
1516     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1517 
1518   // Merge/Unmerge
1519   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1520     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1521     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1522 
1523     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1524       const LLT Ty = Query.Types[TypeIdx];
1525       if (Ty.isVector()) {
1526         const LLT &EltTy = Ty.getElementType();
1527         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1528           return true;
1529         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1530           return true;
1531       }
1532       return false;
1533     };
1534 
1535     auto &Builder = getActionDefinitionsBuilder(Op)
1536       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1537       .lowerFor({{S16, V2S16}})
1538       .lowerIf([=](const LegalityQuery &Query) {
1539           const LLT BigTy = Query.Types[BigTyIdx];
1540           return BigTy.getSizeInBits() == 32;
1541         })
1542       // Try to widen to s16 first for small types.
1543       // TODO: Only do this on targets with legal s16 shifts
1544       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1545       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1546       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1547       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1548                            elementTypeIs(1, S16)),
1549                        changeTo(1, V2S16))
1550       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1551       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1552       // valid.
1553       .clampScalar(LitTyIdx, S32, S512)
1554       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1555       // Break up vectors with weird elements into scalars
1556       .fewerElementsIf(
1557         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1558         scalarize(0))
1559       .fewerElementsIf(
1560         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1561         scalarize(1))
1562       .clampScalar(BigTyIdx, S32, MaxScalar);
1563 
1564     if (Op == G_MERGE_VALUES) {
1565       Builder.widenScalarIf(
1566         // TODO: Use 16-bit shifts if legal for 8-bit values?
1567         [=](const LegalityQuery &Query) {
1568           const LLT Ty = Query.Types[LitTyIdx];
1569           return Ty.getSizeInBits() < 32;
1570         },
1571         changeTo(LitTyIdx, S32));
1572     }
1573 
1574     Builder.widenScalarIf(
1575       [=](const LegalityQuery &Query) {
1576         const LLT Ty = Query.Types[BigTyIdx];
1577         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1578           Ty.getSizeInBits() % 16 != 0;
1579       },
1580       [=](const LegalityQuery &Query) {
1581         // Pick the next power of 2, or a multiple of 64 over 128,
1582         // whichever is smaller.
1583         const LLT &Ty = Query.Types[BigTyIdx];
1584         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1585         if (NewSizeInBits >= 256) {
1586           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1587           if (RoundedTo < NewSizeInBits)
1588             NewSizeInBits = RoundedTo;
1589         }
1590         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1591       })
1592       // Any vectors left are the wrong size. Scalarize them.
1593       .scalarize(0)
1594       .scalarize(1);
1595   }
1596 
1597   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1598   // RegBankSelect.
1599   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1600     .legalFor({{S32}, {S64}});
1601 
1602   if (ST.hasVOP3PInsts()) {
1603     SextInReg.lowerFor({{V2S16}})
1604       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1605       // get more vector shift opportunities, since we'll get those when
1606       // expanded.
1607       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1608   } else if (ST.has16BitInsts()) {
1609     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1610   } else {
1611     // Prefer to promote to s32 before lowering if we don't have 16-bit
1612     // shifts. This avoids a lot of intermediate truncate and extend operations.
1613     SextInReg.lowerFor({{S32}, {S64}});
1614   }
1615 
1616   SextInReg
1617     .scalarize(0)
1618     .clampScalar(0, S32, S64)
1619     .lower();
1620 
1621   // TODO: Only try to form v2s16 with legal packed instructions.
1622   getActionDefinitionsBuilder(G_FSHR)
1623     .legalFor({{S32, S32}})
1624     .lowerFor({{V2S16, V2S16}})
1625     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1626     .scalarize(0)
1627     .lower();
1628 
1629   if (ST.hasVOP3PInsts()) {
1630     getActionDefinitionsBuilder(G_FSHL)
1631       .lowerFor({{V2S16, V2S16}})
1632       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1633       .scalarize(0)
1634       .lower();
1635   } else {
1636     getActionDefinitionsBuilder(G_FSHL)
1637       .scalarize(0)
1638       .lower();
1639   }
1640 
1641   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1642     .legalFor({S64});
1643 
1644   getActionDefinitionsBuilder(G_FENCE)
1645     .alwaysLegal();
1646 
1647   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1648       .scalarize(0)
1649       .minScalar(0, S32)
1650       .lower();
1651 
1652   getActionDefinitionsBuilder({
1653       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1654       G_FCOPYSIGN,
1655 
1656       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1657       G_ATOMICRMW_NAND,
1658       G_ATOMICRMW_FSUB,
1659       G_READ_REGISTER,
1660       G_WRITE_REGISTER,
1661 
1662       G_SADDO, G_SSUBO,
1663 
1664       // TODO: Implement
1665       G_FMINIMUM, G_FMAXIMUM}).lower();
1666 
1667   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670     .unsupported();
1671 
1672   getLegacyLegalizerInfo().computeTables();
1673   verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677                                          MachineInstr &MI) const {
1678   MachineIRBuilder &B = Helper.MIRBuilder;
1679   MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681   switch (MI.getOpcode()) {
1682   case TargetOpcode::G_ADDRSPACE_CAST:
1683     return legalizeAddrSpaceCast(MI, MRI, B);
1684   case TargetOpcode::G_FRINT:
1685     return legalizeFrint(MI, MRI, B);
1686   case TargetOpcode::G_FCEIL:
1687     return legalizeFceil(MI, MRI, B);
1688   case TargetOpcode::G_FREM:
1689     return legalizeFrem(MI, MRI, B);
1690   case TargetOpcode::G_INTRINSIC_TRUNC:
1691     return legalizeIntrinsicTrunc(MI, MRI, B);
1692   case TargetOpcode::G_SITOFP:
1693     return legalizeITOFP(MI, MRI, B, true);
1694   case TargetOpcode::G_UITOFP:
1695     return legalizeITOFP(MI, MRI, B, false);
1696   case TargetOpcode::G_FPTOSI:
1697     return legalizeFPTOI(MI, MRI, B, true);
1698   case TargetOpcode::G_FPTOUI:
1699     return legalizeFPTOI(MI, MRI, B, false);
1700   case TargetOpcode::G_FMINNUM:
1701   case TargetOpcode::G_FMAXNUM:
1702   case TargetOpcode::G_FMINNUM_IEEE:
1703   case TargetOpcode::G_FMAXNUM_IEEE:
1704     return legalizeMinNumMaxNum(Helper, MI);
1705   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706     return legalizeExtractVectorElt(MI, MRI, B);
1707   case TargetOpcode::G_INSERT_VECTOR_ELT:
1708     return legalizeInsertVectorElt(MI, MRI, B);
1709   case TargetOpcode::G_SHUFFLE_VECTOR:
1710     return legalizeShuffleVector(MI, MRI, B);
1711   case TargetOpcode::G_FSIN:
1712   case TargetOpcode::G_FCOS:
1713     return legalizeSinCos(MI, MRI, B);
1714   case TargetOpcode::G_GLOBAL_VALUE:
1715     return legalizeGlobalValue(MI, MRI, B);
1716   case TargetOpcode::G_LOAD:
1717   case TargetOpcode::G_SEXTLOAD:
1718   case TargetOpcode::G_ZEXTLOAD:
1719     return legalizeLoad(Helper, MI);
1720   case TargetOpcode::G_FMAD:
1721     return legalizeFMad(MI, MRI, B);
1722   case TargetOpcode::G_FDIV:
1723     return legalizeFDIV(MI, MRI, B);
1724   case TargetOpcode::G_UDIV:
1725   case TargetOpcode::G_UREM:
1726   case TargetOpcode::G_UDIVREM:
1727     return legalizeUnsignedDIV_REM(MI, MRI, B);
1728   case TargetOpcode::G_SDIV:
1729   case TargetOpcode::G_SREM:
1730   case TargetOpcode::G_SDIVREM:
1731     return legalizeSignedDIV_REM(MI, MRI, B);
1732   case TargetOpcode::G_ATOMIC_CMPXCHG:
1733     return legalizeAtomicCmpXChg(MI, MRI, B);
1734   case TargetOpcode::G_FLOG:
1735     return legalizeFlog(MI, B, numbers::ln2f);
1736   case TargetOpcode::G_FLOG10:
1737     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738   case TargetOpcode::G_FEXP:
1739     return legalizeFExp(MI, B);
1740   case TargetOpcode::G_FPOW:
1741     return legalizeFPow(MI, B);
1742   case TargetOpcode::G_FFLOOR:
1743     return legalizeFFloor(MI, MRI, B);
1744   case TargetOpcode::G_BUILD_VECTOR:
1745     return legalizeBuildVector(MI, MRI, B);
1746   default:
1747     return false;
1748   }
1749 
1750   llvm_unreachable("expected switch to return");
1751 }
1752 
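     // Return a 32-bit register holding the aperture (the high half of the 64-bit
     // flat address) for the given LDS or private address space, either read via
     // s_getreg on targets with aperture registers or loaded from the queue
     // pointer.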
1753 Register AMDGPULegalizerInfo::getSegmentAperture(
1754   unsigned AS,
1755   MachineRegisterInfo &MRI,
1756   MachineIRBuilder &B) const {
1757   MachineFunction &MF = B.getMF();
1758   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1759   const LLT S32 = LLT::scalar(32);
1760 
1761   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1762 
1763   if (ST.hasApertureRegs()) {
1764     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1765     // getreg.
1766     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1767         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1768         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1769     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1771         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1772     unsigned Encoding =
1773         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1774         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1775         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1776 
1777     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1778 
1779     B.buildInstr(AMDGPU::S_GETREG_B32)
1780       .addDef(GetReg)
1781       .addImm(Encoding);
1782     MRI.setType(GetReg, S32);
1783 
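         // The aperture comes back in the low 16 bits of the getreg result; shift
         // it up by the field width so it occupies the high bits of the 32-bit
         // aperture value returned from this function.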
1784     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1785     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1786   }
1787 
1788   Register QueuePtr = MRI.createGenericVirtualRegister(
1789     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1790 
1791   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1792     return Register();
1793 
1794   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1795   // private_segment_aperture_base_hi.
1796   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1797 
1798   // TODO: can we be smarter about machine pointer info?
1799   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1800   MachineMemOperand *MMO = MF.getMachineMemOperand(
1801       PtrInfo,
1802       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1803           MachineMemOperand::MOInvariant,
1804       4, commonAlignment(Align(64), StructOffset));
1805 
1806   Register LoadAddr;
1807 
1808   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1809   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1810 }
1811 
1812 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1813   MachineInstr &MI, MachineRegisterInfo &MRI,
1814   MachineIRBuilder &B) const {
1815   MachineFunction &MF = B.getMF();
1816 
1817   const LLT S32 = LLT::scalar(32);
1818   Register Dst = MI.getOperand(0).getReg();
1819   Register Src = MI.getOperand(1).getReg();
1820 
1821   LLT DstTy = MRI.getType(Dst);
1822   LLT SrcTy = MRI.getType(Src);
1823   unsigned DestAS = DstTy.getAddressSpace();
1824   unsigned SrcAS = SrcTy.getAddressSpace();
1825 
1826   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1827   // vector element.
1828   assert(!DstTy.isVector());
1829 
1830   const AMDGPUTargetMachine &TM
1831     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1832 
1833   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1834     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1835     return true;
1836   }
1837 
1838   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1839     // Truncate.
1840     B.buildExtract(Dst, Src, 0);
1841     MI.eraseFromParent();
1842     return true;
1843   }
1844 
1845   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1846     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1847     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1848 
1849     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1850     // another. Merge operands are required to be the same type, but creating an
1851     // extra ptrtoint would be kind of pointless.
1852     auto HighAddr = B.buildConstant(
1853       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1854     B.buildMerge(Dst, {Src, HighAddr});
1855     MI.eraseFromParent();
1856     return true;
1857   }
1858 
1859   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1860     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1861            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1862     unsigned NullVal = TM.getNullPointerValue(DestAS);
1863 
1864     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1865     auto FlatNull = B.buildConstant(SrcTy, 0);
1866 
1867     // Extract low 32-bits of the pointer.
1868     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1869 
1870     auto CmpRes =
1871         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1872     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1873 
1874     MI.eraseFromParent();
1875     return true;
1876   }
1877 
1878   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1879     return false;
1880 
1881   if (!ST.hasFlatAddressSpace())
1882     return false;
1883 
1884   auto SegmentNull =
1885       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1886   auto FlatNull =
1887       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1888 
1889   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1890   if (!ApertureReg.isValid())
1891     return false;
1892 
1893   auto CmpRes =
1894       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1895 
1896   // Coerce the type of the low half of the result so we can use merge_values.
1897   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1898 
1899   // TODO: Should we allow mismatched types but matching sizes in merges to
1900   // avoid the ptrtoint?
1901   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1902   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1903 
1904   MI.eraseFromParent();
1905   return true;
1906 }
1907 
1908 bool AMDGPULegalizerInfo::legalizeFrint(
1909   MachineInstr &MI, MachineRegisterInfo &MRI,
1910   MachineIRBuilder &B) const {
1911   Register Src = MI.getOperand(1).getReg();
1912   LLT Ty = MRI.getType(Src);
1913   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1914 
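       // Adding and then subtracting a copysign'd 2^52 forces the value to be
       // rounded to an integer in the current rounding mode. Values with a
       // magnitude of at least ~2^52 are already integral, so the original source
       // is selected for those.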
1915   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1916   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1917 
1918   auto C1 = B.buildFConstant(Ty, C1Val);
1919   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1920 
1921   // TODO: Should this propagate fast-math-flags?
1922   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1923   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1924 
1925   auto C2 = B.buildFConstant(Ty, C2Val);
1926   auto Fabs = B.buildFAbs(Ty, Src);
1927 
1928   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1929   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1930   MI.eraseFromParent();
1931   return true;
1932 }
1933 
1934 bool AMDGPULegalizerInfo::legalizeFceil(
1935   MachineInstr &MI, MachineRegisterInfo &MRI,
1936   MachineIRBuilder &B) const {
1937 
1938   const LLT S1 = LLT::scalar(1);
1939   const LLT S64 = LLT::scalar(64);
1940 
1941   Register Src = MI.getOperand(1).getReg();
1942   assert(MRI.getType(Src) == S64);
1943 
1944   // result = trunc(src)
1945   // if (src > 0.0 && src != result)
1946   //   result += 1.0
1947 
1948   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1949 
1950   const auto Zero = B.buildFConstant(S64, 0.0);
1951   const auto One = B.buildFConstant(S64, 1.0);
1952   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1953   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1954   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1955   auto Add = B.buildSelect(S64, And, One, Zero);
1956 
1957   // TODO: Should this propagate fast-math-flags?
1958   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1959   return true;
1960 }
1961 
1962 bool AMDGPULegalizerInfo::legalizeFrem(
1963   MachineInstr &MI, MachineRegisterInfo &MRI,
1964   MachineIRBuilder &B) const {
1965     Register DstReg = MI.getOperand(0).getReg();
1966     Register Src0Reg = MI.getOperand(1).getReg();
1967     Register Src1Reg = MI.getOperand(2).getReg();
1968     auto Flags = MI.getFlags();
1969     LLT Ty = MRI.getType(DstReg);
1970 
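         // frem(x, y) == x - trunc(x / y) * y, computed here as
         // fma(-trunc(x / y), y, x).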
1971     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1972     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1973     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1974     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1975     MI.eraseFromParent();
1976     return true;
1977 }
1978 
1979 static MachineInstrBuilder extractF64Exponent(Register Hi,
1980                                               MachineIRBuilder &B) {
1981   const unsigned FractBits = 52;
1982   const unsigned ExpBits = 11;
1983   LLT S32 = LLT::scalar(32);
1984 
1985   auto Const0 = B.buildConstant(S32, FractBits - 32);
1986   auto Const1 = B.buildConstant(S32, ExpBits);
1987 
1988   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1989     .addUse(Hi)
1990     .addUse(Const0.getReg(0))
1991     .addUse(Const1.getReg(0));
1992 
1993   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1994 }
1995 
1996 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1997   MachineInstr &MI, MachineRegisterInfo &MRI,
1998   MachineIRBuilder &B) const {
1999   const LLT S1 = LLT::scalar(1);
2000   const LLT S32 = LLT::scalar(32);
2001   const LLT S64 = LLT::scalar(64);
2002 
2003   Register Src = MI.getOperand(1).getReg();
2004   assert(MRI.getType(Src) == S64);
2005 
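       // The fraction bits below the binary point are masked off based on the
       // exponent: exponents below 0 produce a signed zero, exponents above 51
       // mean the value is already integral and is returned unchanged, and
       // everything in between just clears the low (52 - exp) mantissa bits.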
2006   // TODO: Should this use extract since the low half is unused?
2007   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2008   Register Hi = Unmerge.getReg(1);
2009 
2010   // Extract the upper half, since this is where we will find the sign and
2011   // exponent.
2012   auto Exp = extractF64Exponent(Hi, B);
2013 
2014   const unsigned FractBits = 52;
2015 
2016   // Extract the sign bit.
2017   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2018   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2019 
2020   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2021 
2022   const auto Zero32 = B.buildConstant(S32, 0);
2023 
2024   // Extend back to 64-bits.
2025   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2026 
2027   auto Shr = B.buildAShr(S64, FractMask, Exp);
2028   auto Not = B.buildNot(S64, Shr);
2029   auto Tmp0 = B.buildAnd(S64, Src, Not);
2030   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2031 
2032   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2033   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2034 
2035   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2036   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2037   MI.eraseFromParent();
2038   return true;
2039 }
2040 
2041 bool AMDGPULegalizerInfo::legalizeITOFP(
2042   MachineInstr &MI, MachineRegisterInfo &MRI,
2043   MachineIRBuilder &B, bool Signed) const {
2044 
2045   Register Dst = MI.getOperand(0).getReg();
2046   Register Src = MI.getOperand(1).getReg();
2047 
2048   const LLT S64 = LLT::scalar(64);
2049   const LLT S32 = LLT::scalar(32);
2050 
2051   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2052 
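       // Convert the two 32-bit halves separately: the high half is converted
       // (signed or unsigned as appropriate) and scaled by 2^32 with ldexp, then
       // the unsigned conversion of the low half is added in.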
2053   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2054 
2055   auto CvtHi = Signed ?
2056     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2057     B.buildUITOFP(S64, Unmerge.getReg(1));
2058 
2059   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2060 
2061   auto ThirtyTwo = B.buildConstant(S32, 32);
2062   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2063     .addUse(CvtHi.getReg(0))
2064     .addUse(ThirtyTwo.getReg(0));
2065 
2066   // TODO: Should this propagate fast-math-flags?
2067   B.buildFAdd(Dst, LdExp, CvtLo);
2068   MI.eraseFromParent();
2069   return true;
2070 }
2071 
2072 // TODO: Copied from DAG implementation. Verify logic and document how this
2073 // actually works.
2074 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2075                                         MachineRegisterInfo &MRI,
2076                                         MachineIRBuilder &B,
2077                                         bool Signed) const {
2078 
2079   Register Dst = MI.getOperand(0).getReg();
2080   Register Src = MI.getOperand(1).getReg();
2081 
2082   const LLT S64 = LLT::scalar(64);
2083   const LLT S32 = LLT::scalar(32);
2084 
2085   const LLT SrcLT = MRI.getType(Src);
2086   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2087 
2088   unsigned Flags = MI.getFlags();
2089 
2090   // The basic idea of converting a floating point number into a pair of 32-bit
2091   // integers is illustrated as follows:
2092   //
2093   //     tf := trunc(val);
2094   //    hif := floor(tf * 2^-32);
2095   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2096   //     hi := fptoi(hif);
2097   //     lo := fptoi(lof);
2098   //
2099   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2100   MachineInstrBuilder Sign;
2101   if (Signed && SrcLT == S32) {
2102     // However, a 32-bit floating point number has only 23 bits mantissa and
2103     // it's not enough to hold all the significant bits of `lof` if val is
2104     // negative. To avoid the loss of precision, we need to take the absolute
2105     // value after truncating and flip the result back based on the original
2106     // signedness.
2107     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2108     Trunc = B.buildFAbs(S32, Trunc, Flags);
2109   }
2110   MachineInstrBuilder K0, K1;
2111   if (SrcLT == S64) {
2112     K0 = B.buildFConstant(S64,
2113                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2114     K1 = B.buildFConstant(S64,
2115                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2116   } else {
2117     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2118     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2119   }
2120 
2121   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2122   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2123   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2124 
2125   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2126                                      : B.buildFPTOUI(S32, FloorMul);
2127   auto Lo = B.buildFPTOUI(S32, Fma);
2128 
2129   if (Signed && SrcLT == S32) {
2130     // Flip the result based on the signedness, which is either all 0s or 1s.
2131     Sign = B.buildMerge(S64, {Sign, Sign});
2132     // r := xor({lo, hi}, sign) - sign;
2133     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2134   } else
2135     B.buildMerge(Dst, {Lo, Hi});
2136   MI.eraseFromParent();
2137 
2138   return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2142                                                MachineInstr &MI) const {
2143   MachineFunction &MF = Helper.MIRBuilder.getMF();
2144   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2145 
2146   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2147                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2148 
2149   // With ieee_mode disabled, the instructions have the correct behavior
2150   // already for G_FMINNUM/G_FMAXNUM
2151   if (!MFI->getMode().IEEE)
2152     return !IsIEEEOp;
2153 
2154   if (IsIEEEOp)
2155     return true;
2156 
2157   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2158 }
2159 
2160 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2161   MachineInstr &MI, MachineRegisterInfo &MRI,
2162   MachineIRBuilder &B) const {
2163   // TODO: Should move some of this into LegalizerHelper.
2164 
2165   // TODO: Promote dynamic indexing of s16 to s32
2166 
2167   // FIXME: Artifact combiner probably should have replaced the truncated
2168   // constant before this, so we shouldn't need
2169   // getConstantVRegValWithLookThrough.
2170   Optional<ValueAndVReg> MaybeIdxVal =
2171       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2172   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2173     return true;
2174   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2175 
2176   Register Dst = MI.getOperand(0).getReg();
2177   Register Vec = MI.getOperand(1).getReg();
2178 
2179   LLT VecTy = MRI.getType(Vec);
2180   LLT EltTy = VecTy.getElementType();
2181   assert(EltTy == MRI.getType(Dst));
2182 
2183   if (IdxVal < VecTy.getNumElements())
2184     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2185   else
2186     B.buildUndef(Dst);
2187 
2188   MI.eraseFromParent();
2189   return true;
2190 }
2191 
2192 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2193   MachineInstr &MI, MachineRegisterInfo &MRI,
2194   MachineIRBuilder &B) const {
2195   // TODO: Should move some of this into LegalizerHelper.
2196 
2197   // TODO: Promote dynamic indexing of s16 to s32
2198 
2199   // FIXME: Artifact combiner probably should have replaced the truncated
2200   // constant before this, so we shouldn't need
2201   // getConstantVRegValWithLookThrough.
2202   Optional<ValueAndVReg> MaybeIdxVal =
2203       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2204   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2205     return true;
2206 
2207   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2208   Register Dst = MI.getOperand(0).getReg();
2209   Register Vec = MI.getOperand(1).getReg();
2210   Register Ins = MI.getOperand(2).getReg();
2211 
2212   LLT VecTy = MRI.getType(Vec);
2213   LLT EltTy = VecTy.getElementType();
2214   assert(EltTy == MRI.getType(Ins));
2215 
2216   if (IdxVal < VecTy.getNumElements())
2217     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2218   else
2219     B.buildUndef(Dst);
2220 
2221   MI.eraseFromParent();
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2226   MachineInstr &MI, MachineRegisterInfo &MRI,
2227   MachineIRBuilder &B) const {
2228   const LLT V2S16 = LLT::fixed_vector(2, 16);
2229 
2230   Register Dst = MI.getOperand(0).getReg();
2231   Register Src0 = MI.getOperand(1).getReg();
2232   LLT DstTy = MRI.getType(Dst);
2233   LLT SrcTy = MRI.getType(Src0);
2234 
2235   if (SrcTy == V2S16 && DstTy == V2S16 &&
2236       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2237     return true;
2238 
2239   MachineIRBuilder HelperBuilder(MI);
2240   GISelObserverWrapper DummyObserver;
2241   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2242   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2243 }
2244 
2245 bool AMDGPULegalizerInfo::legalizeSinCos(
2246   MachineInstr &MI, MachineRegisterInfo &MRI,
2247   MachineIRBuilder &B) const {
2248 
2249   Register DstReg = MI.getOperand(0).getReg();
2250   Register SrcReg = MI.getOperand(1).getReg();
2251   LLT Ty = MRI.getType(DstReg);
2252   unsigned Flags = MI.getFlags();
2253 
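       // Scale the input by 1/(2*pi) for the hardware sin/cos intrinsics; on
       // subtargets with a reduced valid trig input range, additionally
       // range-reduce the scaled value with fract.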
2254   Register TrigVal;
2255   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2256   if (ST.hasTrigReducedRange()) {
2257     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2258     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2259       .addUse(MulVal.getReg(0))
2260       .setMIFlags(Flags).getReg(0);
2261   } else
2262     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2263 
2264   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2265     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2266   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2267     .addUse(TrigVal)
2268     .setMIFlags(Flags);
2269   MI.eraseFromParent();
2270   return true;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2274                                                   MachineIRBuilder &B,
2275                                                   const GlobalValue *GV,
2276                                                   int64_t Offset,
2277                                                   unsigned GAFlags) const {
2278   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2279   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2280   // to the following code sequence:
2281   //
2282   // For constant address space:
2283   //   s_getpc_b64 s[0:1]
2284   //   s_add_u32 s0, s0, $symbol
2285   //   s_addc_u32 s1, s1, 0
2286   //
2287   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2288   //   a fixup or relocation is emitted to replace $symbol with a literal
2289   //   constant, which is a pc-relative offset from the encoding of the $symbol
2290   //   operand to the global variable.
2291   //
2292   // For global address space:
2293   //   s_getpc_b64 s[0:1]
2294   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2295   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2296   //
2297   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2298   //   fixups or relocations are emitted to replace $symbol@*@lo and
2299   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2300   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2301   //   operand to the global variable.
2302   //
2303   // What we want here is an offset from the value returned by s_getpc
2304   // (which is the address of the s_add_u32 instruction) to the global
2305   // variable, but since the encoding of $symbol starts 4 bytes after the start
2306   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2307   // small. This requires us to add 4 to the global variable offset in order to
2308   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2309   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2310   // instruction.
2311 
2312   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2313 
2314   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2315     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2316 
2317   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2318     .addDef(PCReg);
2319 
2320   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2321   if (GAFlags == SIInstrInfo::MO_NONE)
2322     MIB.addImm(0);
2323   else
2324     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2325 
2326   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2327 
2328   if (PtrTy.getSizeInBits() == 32)
2329     B.buildExtract(DstReg, PCReg, 0);
2330   return true;
2331 }
2332 
2333 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2334   MachineInstr &MI, MachineRegisterInfo &MRI,
2335   MachineIRBuilder &B) const {
2336   Register DstReg = MI.getOperand(0).getReg();
2337   LLT Ty = MRI.getType(DstReg);
2338   unsigned AS = Ty.getAddressSpace();
2339 
2340   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2341   MachineFunction &MF = B.getMF();
2342   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2343 
2344   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2345     if (!MFI->isModuleEntryFunction() &&
2346         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2347       const Function &Fn = MF.getFunction();
2348       DiagnosticInfoUnsupported BadLDSDecl(
2349         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2350         DS_Warning);
2351       Fn.getContext().diagnose(BadLDSDecl);
2352 
2353       // We currently don't have a way to correctly allocate LDS objects that
2354       // aren't directly associated with a kernel. We do force inlining of
2355       // functions that use local objects. However, if these dead functions are
2356       // not eliminated, we don't want a compile time error. Just emit a warning
2357       // and a trap, since there should be no callable path here.
2358       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2359       B.buildUndef(DstReg);
2360       MI.eraseFromParent();
2361       return true;
2362     }
2363 
2364     // TODO: We could emit code to handle the initialization somewhere.
2365     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2366       const SITargetLowering *TLI = ST.getTargetLowering();
2367       if (!TLI->shouldUseLDSConstAddress(GV)) {
2368         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2369         return true; // Leave in place;
2370       }
2371 
2372       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2373         Type *Ty = GV->getValueType();
2374         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2375         // zero-sized type in other languages to declare dynamic shared
2376         // memory whose size is not known at compile time. These arrays are
2377         // allocated by the runtime and placed directly after the statically
2378         // allocated ones, so they all share the same offset.
2379         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2380           // Adjust alignment for that dynamic shared memory array.
2381           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2382           LLT S32 = LLT::scalar(32);
2383           auto Sz =
2384               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2385           B.buildIntToPtr(DstReg, Sz);
2386           MI.eraseFromParent();
2387           return true;
2388         }
2389       }
2390 
2391       B.buildConstant(
2392           DstReg,
2393           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2394       MI.eraseFromParent();
2395       return true;
2396     }
2397 
2398     const Function &Fn = MF.getFunction();
2399     DiagnosticInfoUnsupported BadInit(
2400       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2401     Fn.getContext().diagnose(BadInit);
2402     return true;
2403   }
2404 
2405   const SITargetLowering *TLI = ST.getTargetLowering();
2406 
2407   if (TLI->shouldEmitFixup(GV)) {
2408     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2409     MI.eraseFromParent();
2410     return true;
2411   }
2412 
2413   if (TLI->shouldEmitPCReloc(GV)) {
2414     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2415     MI.eraseFromParent();
2416     return true;
2417   }
2418 
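       // Otherwise the address has to go through the GOT: emit a pc-relative
       // address of the GOT entry and load the 64-bit global address from it.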
2419   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2420   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2421 
2422   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2423       MachinePointerInfo::getGOT(MF),
2424       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2425           MachineMemOperand::MOInvariant,
2426       8 /*Size*/, Align(8));
2427 
2428   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2429 
2430   if (Ty.getSizeInBits() == 32) {
2431     // Truncate if this is a 32-bit constant address.
2432     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2433     B.buildExtract(DstReg, Load, 0);
2434   } else
2435     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2436 
2437   MI.eraseFromParent();
2438   return true;
2439 }
2440 
2441 static LLT widenToNextPowerOf2(LLT Ty) {
2442   if (Ty.isVector())
2443     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2444   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2445 }
2446 
2447 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2448                                        MachineInstr &MI) const {
2449   MachineIRBuilder &B = Helper.MIRBuilder;
2450   MachineRegisterInfo &MRI = *B.getMRI();
2451   GISelChangeObserver &Observer = Helper.Observer;
2452 
2453   Register PtrReg = MI.getOperand(1).getReg();
2454   LLT PtrTy = MRI.getType(PtrReg);
2455   unsigned AddrSpace = PtrTy.getAddressSpace();
2456 
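       // 32-bit constant address space loads are handled by casting the pointer
       // into the full 64-bit constant address space.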
2457   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2458     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2459     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2460     Observer.changingInstr(MI);
2461     MI.getOperand(1).setReg(Cast.getReg(0));
2462     Observer.changedInstr(MI);
2463     return true;
2464   }
2465 
2466   if (MI.getOpcode() != AMDGPU::G_LOAD)
2467     return false;
2468 
2469   Register ValReg = MI.getOperand(0).getReg();
2470   LLT ValTy = MRI.getType(ValReg);
2471 
2472   MachineMemOperand *MMO = *MI.memoperands_begin();
2473   const unsigned ValSize = ValTy.getSizeInBits();
2474   const unsigned MemSize = 8 * MMO->getSize();
2475   const Align MemAlign = MMO->getAlign();
2476   const unsigned AlignInBits = 8 * MemAlign.value();
2477 
2478   // Widen non-power-of-2 loads to the alignment if needed
2479   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2480     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2481 
2482     // This was already the correct extending load result type, so just adjust
2483     // the memory type.
2484     if (WideMemSize == ValSize) {
2485       MachineFunction &MF = B.getMF();
2486 
2487       MachineMemOperand *WideMMO =
2488           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2489       Observer.changingInstr(MI);
2490       MI.setMemRefs(MF, {WideMMO});
2491       Observer.changedInstr(MI);
2492       return true;
2493     }
2494 
2495     // Don't bother handling an edge case that should probably never be produced.
2496     if (ValSize > WideMemSize)
2497       return false;
2498 
2499     LLT WideTy = widenToNextPowerOf2(ValTy);
2500 
2501     Register WideLoad;
2502     if (!WideTy.isVector()) {
2503       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2504       B.buildTrunc(ValReg, WideLoad).getReg(0);
2505     } else {
2506       // Extract the subvector.
2507 
2508       if (isRegisterType(ValTy)) {
2509         // If this is a case where G_EXTRACT is legal, use it.
2510         // (e.g. <3 x s32> -> <4 x s32>)
2511         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2512         B.buildExtract(ValReg, WideLoad, 0);
2513       } else {
2514         // For cases where the widened type isn't a nice register value, unmerge
2515         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2516         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2517         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2518         B.setInsertPt(B.getMBB(), MI.getIterator());
2519         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2520       }
2521     }
2522 
2523     MI.eraseFromParent();
2524     return true;
2525   }
2526 
2527   return false;
2528 }
2529 
2530 bool AMDGPULegalizerInfo::legalizeFMad(
2531   MachineInstr &MI, MachineRegisterInfo &MRI,
2532   MachineIRBuilder &B) const {
2533   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2534   assert(Ty.isScalar());
2535 
2536   MachineFunction &MF = B.getMF();
2537   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2538 
2539   // TODO: Always legal with future ftz flag.
2540   // FIXME: Do we need just output?
2541   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2542     return true;
2543   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2544     return true;
2545 
2546   MachineIRBuilder HelperBuilder(MI);
2547   GISelObserverWrapper DummyObserver;
2548   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2549   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2553   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2554   Register DstReg = MI.getOperand(0).getReg();
2555   Register PtrReg = MI.getOperand(1).getReg();
2556   Register CmpVal = MI.getOperand(2).getReg();
2557   Register NewVal = MI.getOperand(3).getReg();
2558 
2559   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2560          "this should not have been custom lowered");
2561 
2562   LLT ValTy = MRI.getType(CmpVal);
2563   LLT VecTy = LLT::fixed_vector(2, ValTy);
2564 
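       // The G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the compare
       // value packed together as a single vector data operand.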
2565   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2566 
2567   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2568     .addDef(DstReg)
2569     .addUse(PtrReg)
2570     .addUse(PackedVal)
2571     .setMemRefs(MI.memoperands());
2572 
2573   MI.eraseFromParent();
2574   return true;
2575 }
2576 
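     // Expand G_FLOG/G_FLOG10 as log2(x) * Log2BaseInverted, where the caller
     // passes ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.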
2577 bool AMDGPULegalizerInfo::legalizeFlog(
2578   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2579   Register Dst = MI.getOperand(0).getReg();
2580   Register Src = MI.getOperand(1).getReg();
2581   LLT Ty = B.getMRI()->getType(Dst);
2582   unsigned Flags = MI.getFlags();
2583 
2584   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2585   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2586 
2587   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2588   MI.eraseFromParent();
2589   return true;
2590 }
2591 
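     // Expand G_FEXP as exp2(x * log2(e)).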
2592 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2593                                        MachineIRBuilder &B) const {
2594   Register Dst = MI.getOperand(0).getReg();
2595   Register Src = MI.getOperand(1).getReg();
2596   unsigned Flags = MI.getFlags();
2597   LLT Ty = B.getMRI()->getType(Dst);
2598 
2599   auto K = B.buildFConstant(Ty, numbers::log2e);
2600   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2601   B.buildFExp2(Dst, Mul, Flags);
2602   MI.eraseFromParent();
2603   return true;
2604 }
2605 
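     // Expand G_FPOW as exp2(y * log2(x)); the multiply uses the legacy fmul
     // intrinsic, and f16 is extended to f32 for it since there is no f16 variant.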
2606 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2607                                        MachineIRBuilder &B) const {
2608   Register Dst = MI.getOperand(0).getReg();
2609   Register Src0 = MI.getOperand(1).getReg();
2610   Register Src1 = MI.getOperand(2).getReg();
2611   unsigned Flags = MI.getFlags();
2612   LLT Ty = B.getMRI()->getType(Dst);
2613   const LLT S16 = LLT::scalar(16);
2614   const LLT S32 = LLT::scalar(32);
2615 
2616   if (Ty == S32) {
2617     auto Log = B.buildFLog2(S32, Src0, Flags);
2618     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2619       .addUse(Log.getReg(0))
2620       .addUse(Src1)
2621       .setMIFlags(Flags);
2622     B.buildFExp2(Dst, Mul, Flags);
2623   } else if (Ty == S16) {
2624     // There's no f16 fmul_legacy, so we need to convert for it.
2625     auto Log = B.buildFLog2(S16, Src0, Flags);
2626     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2627     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2628     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2629       .addUse(Ext0.getReg(0))
2630       .addUse(Ext1.getReg(0))
2631       .setMIFlags(Flags);
2632 
2633     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2634   } else
2635     return false;
2636 
2637   MI.eraseFromParent();
2638   return true;
2639 }
2640 
2641 // Find a source register, ignoring any possible source modifiers.
2642 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2643   Register ModSrc = OrigSrc;
2644   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2645     ModSrc = SrcFNeg->getOperand(1).getReg();
2646     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2647       ModSrc = SrcFAbs->getOperand(1).getReg();
2648   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2649     ModSrc = SrcFAbs->getOperand(1).getReg();
2650   return ModSrc;
2651 }
2652 
2653 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2654                                          MachineRegisterInfo &MRI,
2655                                          MachineIRBuilder &B) const {
2656 
2657   const LLT S1 = LLT::scalar(1);
2658   const LLT S64 = LLT::scalar(64);
2659   Register Dst = MI.getOperand(0).getReg();
2660   Register OrigSrc = MI.getOperand(1).getReg();
2661   unsigned Flags = MI.getFlags();
2662   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2663          "this should not have been custom lowered");
2664 
2665   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2666   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2667   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2668   // V_FRACT bug is:
2669   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2670   //
2671   // Convert floor(x) to (x - fract(x))
2672 
2673   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2674     .addUse(OrigSrc)
2675     .setMIFlags(Flags);
2676 
2677   // Give source modifier matching some assistance before obscuring a foldable
2678   // pattern.
2679 
2680   // TODO: We can avoid the neg on the fract? The input sign to fract
2681   // shouldn't matter?
2682   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2683 
2684   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2685 
2686   Register Min = MRI.createGenericVirtualRegister(S64);
2687 
2688   // We don't need to concern ourselves with the snan handling difference, so
2689   // use the one which will directly select.
2690   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2691   if (MFI->getMode().IEEE)
2692     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2693   else
2694     B.buildFMinNum(Min, Fract, Const, Flags);
2695 
2696   Register CorrectedFract = Min;
2697   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2698     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2699     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2700   }
2701 
2702   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2703   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2704 
2705   MI.eraseFromParent();
2706   return true;
2707 }
2708 
2709 // Turn an illegal packed v2s16 build vector into bit operations.
2710 // TODO: This should probably be a bitcast action in LegalizerHelper.
2711 bool AMDGPULegalizerInfo::legalizeBuildVector(
2712   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2713   Register Dst = MI.getOperand(0).getReg();
2714   const LLT S32 = LLT::scalar(32);
2715   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2716 
2717   Register Src0 = MI.getOperand(1).getReg();
2718   Register Src1 = MI.getOperand(2).getReg();
2719   assert(MRI.getType(Src0) == LLT::scalar(16));
2720 
2721   auto Merge = B.buildMerge(S32, {Src0, Src1});
2722   B.buildBitcast(Dst, Merge);
2723 
2724   MI.eraseFromParent();
2725   return true;
2726 }
2727 
2728 // Check that this is a G_XOR x, -1
2729 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2730   if (MI.getOpcode() != TargetOpcode::G_XOR)
2731     return false;
2732   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2733   return ConstVal && *ConstVal == -1;
2734 }
2735 
2736 // Return the use branch instruction, otherwise null if the usage is invalid.
2737 static MachineInstr *
2738 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2739                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2740   Register CondDef = MI.getOperand(0).getReg();
2741   if (!MRI.hasOneNonDBGUse(CondDef))
2742     return nullptr;
2743 
2744   MachineBasicBlock *Parent = MI.getParent();
2745   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2746 
2747   if (isNot(MRI, *UseMI)) {
2748     Register NegatedCond = UseMI->getOperand(0).getReg();
2749     if (!MRI.hasOneNonDBGUse(NegatedCond))
2750       return nullptr;
2751 
2752     // We're deleting the def of this value, so we need to remove it.
2753     UseMI->eraseFromParent();
2754 
2755     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2756     Negated = true;
2757   }
2758 
2759   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2760     return nullptr;
2761 
2762   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2763   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2764   if (Next == Parent->end()) {
2765     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2766     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2767       return nullptr;
2768     UncondBrTarget = &*NextMBB;
2769   } else {
2770     if (Next->getOpcode() != AMDGPU::G_BR)
2771       return nullptr;
2772     Br = &*Next;
2773     UncondBrTarget = Br->getOperand(0).getMBB();
2774   }
2775 
2776   return UseMI;
2777 }
2778 
2779 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2780                                          const ArgDescriptor *Arg,
2781                                          const TargetRegisterClass *ArgRC,
2782                                          LLT ArgTy) const {
2783   MCRegister SrcReg = Arg->getRegister();
2784   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2785   assert(DstReg.isVirtual() && "Virtual register expected");
2786 
2787   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2788                                              ArgTy);
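       // Some arguments are packed into a bitfield of a single physical register;
       // extract the field with a shift and mask.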
2789   if (Arg->isMasked()) {
2790     // TODO: Should we try to emit this once in the entry block?
2791     const LLT S32 = LLT::scalar(32);
2792     const unsigned Mask = Arg->getMask();
2793     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2794 
2795     Register AndMaskSrc = LiveIn;
2796 
2797     if (Shift != 0) {
2798       auto ShiftAmt = B.buildConstant(S32, Shift);
2799       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2800     }
2801 
2802     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2803   } else {
2804     B.buildCopy(DstReg, LiveIn);
2805   }
2806 
2807   return true;
2808 }
2809 
2810 bool AMDGPULegalizerInfo::loadInputValue(
2811     Register DstReg, MachineIRBuilder &B,
2812     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2813   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2814   const ArgDescriptor *Arg;
2815   const TargetRegisterClass *ArgRC;
2816   LLT ArgTy;
2817   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2818 
2819   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2820     return false; // TODO: Handle these
2821   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2822 }
2823 
2824 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2825     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2826     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2827   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2828     return false;
2829 
2830   MI.eraseFromParent();
2831   return true;
2832 }
2833 
2834 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2835                                        MachineRegisterInfo &MRI,
2836                                        MachineIRBuilder &B) const {
2837   Register Dst = MI.getOperand(0).getReg();
2838   LLT DstTy = MRI.getType(Dst);
2839   LLT S16 = LLT::scalar(16);
2840   LLT S32 = LLT::scalar(32);
2841   LLT S64 = LLT::scalar(64);
2842 
2843   if (DstTy == S16)
2844     return legalizeFDIV16(MI, MRI, B);
2845   if (DstTy == S32)
2846     return legalizeFDIV32(MI, MRI, B);
2847   if (DstTy == S64)
2848     return legalizeFDIV64(MI, MRI, B);
2849 
2850   return false;
2851 }
2852 
2853 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2854                                                         Register DstDivReg,
2855                                                         Register DstRemReg,
2856                                                         Register X,
2857                                                         Register Y) const {
2858   const LLT S1 = LLT::scalar(1);
2859   const LLT S32 = LLT::scalar(32);
2860 
2861   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2862   // algorithm used here.
2863 
2864   // Initial estimate of inv(y).
2865   auto FloatY = B.buildUITOFP(S32, Y);
2866   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
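  // 0x4f7ffffe is just below 2^32 as a float; scaling 1/y by it gives a 32-bit
  // fixed-point reciprocal estimate.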
2867   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2868   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2869   auto Z = B.buildFPTOUI(S32, ScaledY);
2870 
  // One round of UNR (Newton-Raphson) refinement of the reciprocal estimate.
2872   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2873   auto NegYZ = B.buildMul(S32, NegY, Z);
2874   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2875 
2876   // Quotient/remainder estimate.
2877   auto Q = B.buildUMulH(S32, X, Z);
2878   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2879 
2880   // First quotient/remainder refinement.
2881   auto One = B.buildConstant(S32, 1);
2882   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2883   if (DstDivReg)
2884     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2885   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2886 
2887   // Second quotient/remainder refinement.
2888   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2889   if (DstDivReg)
2890     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2891 
2892   if (DstRemReg)
2893     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2894 }
2895 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2897 //
2898 // Return lo, hi of result
2899 //
2900 // %cvt.lo = G_UITOFP Val.lo
2901 // %cvt.hi = G_UITOFP Val.hi
2902 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2903 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2904 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2905 // %mul2 = G_FMUL %mul1, 2**(-32)
2906 // %trunc = G_INTRINSIC_TRUNC %mul2
2907 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2908 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2909 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2910                                                        Register Val) {
2911   const LLT S32 = LLT::scalar(32);
2912   auto Unmerge = B.buildUnmerge(S32, Val);
2913 
2914   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2915   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2916 
2917   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2918                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2919 
2920   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2921   auto Mul1 =
2922       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2923 
2924   // 2**(-32)
2925   auto Mul2 =
2926       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2927   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2928 
2929   // -(2**32)
2930   auto Mad2 = B.buildFMAD(S32, Trunc,
2931                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2932 
2933   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2934   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2935 
2936   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2937 }
2938 
2939 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2940                                                         Register DstDivReg,
2941                                                         Register DstRemReg,
2942                                                         Register Numer,
2943                                                         Register Denom) const {
2944   const LLT S32 = LLT::scalar(32);
2945   const LLT S64 = LLT::scalar(64);
2946   const LLT S1 = LLT::scalar(1);
2947   Register RcpLo, RcpHi;
2948 
2949   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2950 
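  // Two rounds of Newton-Raphson refinement of the 64-bit fixed-point
  // reciprocal, with the 64-bit additions expanded into 32-bit
  // add-with-carry chains.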
2951   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2952 
2953   auto Zero64 = B.buildConstant(S64, 0);
2954   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2955 
2956   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2957   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2958 
2959   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2960   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2961   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2962 
2963   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2964   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
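  // Also form the high half without the low-half carry; that carry is applied
  // below when computing Add2.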
2965   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2966   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2967 
2968   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2969   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2970   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2971   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2972   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2973 
2974   auto Zero32 = B.buildConstant(S32, 0);
2975   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2976   auto Add2_HiC =
2977       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2978   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2979   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2980 
2981   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2982   Register NumerLo = UnmergeNumer.getReg(0);
2983   Register NumerHi = UnmergeNumer.getReg(1);
2984 
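  // Initial quotient estimate Q = umulh(Numer, refined reciprocal) and
  // remainder R = Numer - Denom * Q, followed by up to two correction steps.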
2985   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2986   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2987   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2988   Register Mul3_Lo = UnmergeMul3.getReg(0);
2989   Register Mul3_Hi = UnmergeMul3.getReg(1);
2990   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2991   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2992   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2993   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2994 
2995   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2996   Register DenomLo = UnmergeDenom.getReg(0);
2997   Register DenomHi = UnmergeDenom.getReg(1);
2998 
2999   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3000   auto C1 = B.buildSExt(S32, CmpHi);
3001 
3002   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3003   auto C2 = B.buildSExt(S32, CmpLo);
3004 
3005   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3006   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3007 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we use four
  // selects after the potential endif to substitute for PHIs.
3011 
3012   // if C3 != 0 ...
3013   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3014   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3015   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3016   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3017 
3018   auto One64 = B.buildConstant(S64, 1);
3019   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3020 
3021   auto C4 =
3022       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3023   auto C5 =
3024       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3025   auto C6 = B.buildSelect(
3026       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3027 
3028   // if (C6 != 0)
3029   auto Add4 = B.buildAdd(S64, Add3, One64);
3030   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3031 
3032   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3033   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3034   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3035 
3036   // endif C6
3037   // endif C3
3038 
3039   if (DstDivReg) {
3040     auto Sel1 = B.buildSelect(
3041         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3042     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3043                   Sel1, MulHi3);
3044   }
3045 
3046   if (DstRemReg) {
3047     auto Sel2 = B.buildSelect(
3048         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3049     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3050                   Sel2, Sub1);
3051   }
3052 }
3053 
3054 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3055                                                   MachineRegisterInfo &MRI,
3056                                                   MachineIRBuilder &B) const {
3057   Register DstDivReg, DstRemReg;
3058   switch (MI.getOpcode()) {
3059   default:
3060     llvm_unreachable("Unexpected opcode!");
3061   case AMDGPU::G_UDIV: {
3062     DstDivReg = MI.getOperand(0).getReg();
3063     break;
3064   }
3065   case AMDGPU::G_UREM: {
3066     DstRemReg = MI.getOperand(0).getReg();
3067     break;
3068   }
3069   case AMDGPU::G_UDIVREM: {
3070     DstDivReg = MI.getOperand(0).getReg();
3071     DstRemReg = MI.getOperand(1).getReg();
3072     break;
3073   }
3074   }
3075 
3076   const LLT S64 = LLT::scalar(64);
3077   const LLT S32 = LLT::scalar(32);
3078   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3079   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3080   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3081   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3082 
3083   if (Ty == S32)
3084     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3085   else if (Ty == S64)
3086     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3087   else
3088     return false;
3089 
3090   MI.eraseFromParent();
3091   return true;
3092 }
3093 
3094 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3095                                                 MachineRegisterInfo &MRI,
3096                                                 MachineIRBuilder &B) const {
3097   const LLT S64 = LLT::scalar(64);
3098   const LLT S32 = LLT::scalar(32);
3099 
3100   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3101   if (Ty != S32 && Ty != S64)
3102     return false;
3103 
3104   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3105   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3106   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3107 
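  // Take absolute values using |x| = (x + sign) ^ sign, where
  // sign = x >> (bitwidth - 1).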
3108   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3109   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3110   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3111 
3112   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3113   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3114 
3115   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3116   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3117 
3118   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3119   switch (MI.getOpcode()) {
3120   default:
3121     llvm_unreachable("Unexpected opcode!");
3122   case AMDGPU::G_SDIV: {
3123     DstDivReg = MI.getOperand(0).getReg();
3124     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3125     break;
3126   }
3127   case AMDGPU::G_SREM: {
3128     DstRemReg = MI.getOperand(0).getReg();
3129     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3130     break;
3131   }
3132   case AMDGPU::G_SDIVREM: {
3133     DstDivReg = MI.getOperand(0).getReg();
3134     DstRemReg = MI.getOperand(1).getReg();
3135     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3136     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3137     break;
3138   }
3139   }
3140 
3141   if (Ty == S32)
3142     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3143   else
3144     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3145 
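  // The division was done on |LHS| and |RHS|; restore the quotient sign, which
  // is negative iff exactly one operand was negative: q = (tmp ^ sign) - sign.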
3146   if (DstDivReg) {
3147     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3148     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3149     B.buildSub(DstDivReg, SignXor, Sign);
3150   }
3151 
3152   if (DstRemReg) {
3153     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3154     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3155     B.buildSub(DstRemReg, SignXor, Sign);
3156   }
3157 
3158   MI.eraseFromParent();
3159   return true;
3160 }
3161 
3162 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3163                                                  MachineRegisterInfo &MRI,
3164                                                  MachineIRBuilder &B) const {
3165   Register Res = MI.getOperand(0).getReg();
3166   Register LHS = MI.getOperand(1).getReg();
3167   Register RHS = MI.getOperand(2).getReg();
3168   uint16_t Flags = MI.getFlags();
3169   LLT ResTy = MRI.getType(Res);
3170 
3171   const MachineFunction &MF = B.getMF();
3172   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3173                             MI.getFlag(MachineInstr::FmAfn);
3174 
3175   if (!AllowInaccurateRcp)
3176     return false;
3177 
3178   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3179     // 1 / x -> RCP(x)
3180     if (CLHS->isExactlyValue(1.0)) {
3181       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3182         .addUse(RHS)
3183         .setMIFlags(Flags);
3184 
3185       MI.eraseFromParent();
3186       return true;
3187     }
3188 
3189     // -1 / x -> RCP( FNEG(x) )
3190     if (CLHS->isExactlyValue(-1.0)) {
3191       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3192       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3193         .addUse(FNeg.getReg(0))
3194         .setMIFlags(Flags);
3195 
3196       MI.eraseFromParent();
3197       return true;
3198     }
3199   }
3200 
3201   // x / y -> x * (1.0 / y)
3202   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3203     .addUse(RHS)
3204     .setMIFlags(Flags);
3205   B.buildFMul(Res, LHS, RCP, Flags);
3206 
3207   MI.eraseFromParent();
3208   return true;
3209 }
3210 
3211 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3212                                                    MachineRegisterInfo &MRI,
3213                                                    MachineIRBuilder &B) const {
3214   Register Res = MI.getOperand(0).getReg();
3215   Register X = MI.getOperand(1).getReg();
3216   Register Y = MI.getOperand(2).getReg();
3217   uint16_t Flags = MI.getFlags();
3218   LLT ResTy = MRI.getType(Res);
3219 
3220   const MachineFunction &MF = B.getMF();
3221   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3222                             MI.getFlag(MachineInstr::FmAfn);
3223 
3224   if (!AllowInaccurateRcp)
3225     return false;
3226 
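  // Refine r = 1/y with two Newton-Raphson steps (r = r + r * (1 - y * r)),
  // then form q = x * r and apply one correction using the residual x - y * q.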
3227   auto NegY = B.buildFNeg(ResTy, Y);
3228   auto One = B.buildFConstant(ResTy, 1.0);
3229 
3230   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3231     .addUse(Y)
3232     .setMIFlags(Flags);
3233 
3234   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3235   R = B.buildFMA(ResTy, Tmp0, R, R);
3236 
3237   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3238   R = B.buildFMA(ResTy, Tmp1, R, R);
3239 
3240   auto Ret = B.buildFMul(ResTy, X, R);
3241   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3242 
3243   B.buildFMA(Res, Tmp2, R, Ret);
3244   MI.eraseFromParent();
3245   return true;
3246 }
3247 
3248 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3249                                          MachineRegisterInfo &MRI,
3250                                          MachineIRBuilder &B) const {
3251   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3252     return true;
3253 
3254   Register Res = MI.getOperand(0).getReg();
3255   Register LHS = MI.getOperand(1).getReg();
3256   Register RHS = MI.getOperand(2).getReg();
3257 
3258   uint16_t Flags = MI.getFlags();
3259 
3260   LLT S16 = LLT::scalar(16);
3261   LLT S32 = LLT::scalar(32);
3262 
3263   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3264   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3265 
3266   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3267     .addUse(RHSExt.getReg(0))
3268     .setMIFlags(Flags);
3269 
3270   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3271   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3272 
3273   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3274     .addUse(RDst.getReg(0))
3275     .addUse(RHS)
3276     .addUse(LHS)
3277     .setMIFlags(Flags);
3278 
3279   MI.eraseFromParent();
3280   return true;
3281 }
3282 
3283 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3284 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3285 static void toggleSPDenormMode(bool Enable,
3286                                MachineIRBuilder &B,
3287                                const GCNSubtarget &ST,
3288                                AMDGPU::SIModeRegisterDefaults Mode) {
3289   // Set SP denorm mode to this value.
3290   unsigned SPDenormMode =
3291     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3292 
3293   if (ST.hasDenormModeInst()) {
3294     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3295     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3296 
3297     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3298     B.buildInstr(AMDGPU::S_DENORM_MODE)
3299       .addImm(NewDenormModeValue);
3300 
3301   } else {
3302     // Select FP32 bit field in mode register.
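    // hwreg(HW_REG_MODE, 4, 2): the FP32 denorm control is the two-bit field at
    // offset 4 of the MODE register (WIDTH_M1 of 1 encodes a width of 2).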
3303     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3304                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3305                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3306 
3307     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3308       .addImm(SPDenormMode)
3309       .addImm(SPDenormModeBitField);
3310   }
3311 }
3312 
3313 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3314                                          MachineRegisterInfo &MRI,
3315                                          MachineIRBuilder &B) const {
3316   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3317     return true;
3318 
3319   Register Res = MI.getOperand(0).getReg();
3320   Register LHS = MI.getOperand(1).getReg();
3321   Register RHS = MI.getOperand(2).getReg();
3322   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3323   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3324 
3325   uint16_t Flags = MI.getFlags();
3326 
3327   LLT S32 = LLT::scalar(32);
3328   LLT S1 = LLT::scalar(1);
3329 
3330   auto One = B.buildFConstant(S32, 1.0f);
3331 
3332   auto DenominatorScaled =
3333     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3334       .addUse(LHS)
3335       .addUse(RHS)
3336       .addImm(0)
3337       .setMIFlags(Flags);
3338   auto NumeratorScaled =
3339     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3340       .addUse(LHS)
3341       .addUse(RHS)
3342       .addImm(1)
3343       .setMIFlags(Flags);
3344 
3345   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3346     .addUse(DenominatorScaled.getReg(0))
3347     .setMIFlags(Flags);
3348   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3349 
3350   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3351   // aren't modeled as reading it.
3352   if (!Mode.allFP32Denormals())
3353     toggleSPDenormMode(true, B, ST, Mode);
3354 
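  // One Newton-Raphson step refines the reciprocal, then the quotient estimate
  // is refined once; Fma4 is the final residual consumed by div_fmas.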
3355   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3356   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3357   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3358   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3359   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3360   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3361 
3362   if (!Mode.allFP32Denormals())
3363     toggleSPDenormMode(false, B, ST, Mode);
3364 
3365   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3366     .addUse(Fma4.getReg(0))
3367     .addUse(Fma1.getReg(0))
3368     .addUse(Fma3.getReg(0))
3369     .addUse(NumeratorScaled.getReg(1))
3370     .setMIFlags(Flags);
3371 
3372   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3373     .addUse(Fmas.getReg(0))
3374     .addUse(RHS)
3375     .addUse(LHS)
3376     .setMIFlags(Flags);
3377 
3378   MI.eraseFromParent();
3379   return true;
3380 }
3381 
3382 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3383                                          MachineRegisterInfo &MRI,
3384                                          MachineIRBuilder &B) const {
3385   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3386     return true;
3387 
3388   Register Res = MI.getOperand(0).getReg();
3389   Register LHS = MI.getOperand(1).getReg();
3390   Register RHS = MI.getOperand(2).getReg();
3391 
3392   uint16_t Flags = MI.getFlags();
3393 
3394   LLT S64 = LLT::scalar(64);
3395   LLT S1 = LLT::scalar(1);
3396 
3397   auto One = B.buildFConstant(S64, 1.0);
3398 
3399   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3400     .addUse(LHS)
3401     .addUse(RHS)
3402     .addImm(0)
3403     .setMIFlags(Flags);
3404 
3405   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3406 
3407   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3408     .addUse(DivScale0.getReg(0))
3409     .setMIFlags(Flags);
3410 
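  // Two Newton-Raphson refinements of the scaled reciprocal (Fma1, Fma3), then
  // a scaled quotient estimate (Mul) and its residual (Fma4) for div_fmas.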
3411   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3412   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3413   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3414 
3415   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3416     .addUse(LHS)
3417     .addUse(RHS)
3418     .addImm(1)
3419     .setMIFlags(Flags);
3420 
3421   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3422   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3423   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3424 
3425   Register Scale;
3426   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from div_scale
3428     // is not usable.
3429 
3430     LLT S32 = LLT::scalar(32);
3431 
3432     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3433     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3434     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3435     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3436 
3437     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3438                               Scale1Unmerge.getReg(1));
3439     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3440                               Scale0Unmerge.getReg(1));
3441     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3442   } else {
3443     Scale = DivScale1.getReg(1);
3444   }
3445 
3446   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3447     .addUse(Fma4.getReg(0))
3448     .addUse(Fma3.getReg(0))
3449     .addUse(Mul.getReg(0))
3450     .addUse(Scale)
3451     .setMIFlags(Flags);
3452 
3453   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3454     .addUse(Fmas.getReg(0))
3455     .addUse(RHS)
3456     .addUse(LHS)
3457     .setMIFlags(Flags);
3458 
3459   MI.eraseFromParent();
3460   return true;
3461 }
3462 
3463 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3464                                                  MachineRegisterInfo &MRI,
3465                                                  MachineIRBuilder &B) const {
3466   Register Res = MI.getOperand(0).getReg();
3467   Register LHS = MI.getOperand(2).getReg();
3468   Register RHS = MI.getOperand(3).getReg();
3469   uint16_t Flags = MI.getFlags();
3470 
3471   LLT S32 = LLT::scalar(32);
3472   LLT S1 = LLT::scalar(1);
3473 
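  // If |RHS| is very large (> 0x6f800000, i.e. 2^96), pre-scale it by 2^-32 so
  // its reciprocal does not flush to zero, and scale the quotient back by the
  // same factor afterwards; otherwise the scale is 1.0.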
3474   auto Abs = B.buildFAbs(S32, RHS, Flags);
3475   const APFloat C0Val(1.0f);
3476 
3477   auto C0 = B.buildConstant(S32, 0x6f800000);
3478   auto C1 = B.buildConstant(S32, 0x2f800000);
3479   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3480 
3481   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3482   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3483 
3484   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3485 
3486   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3487     .addUse(Mul0.getReg(0))
3488     .setMIFlags(Flags);
3489 
3490   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3491 
3492   B.buildFMul(Res, Sel, Mul1, Flags);
3493 
3494   MI.eraseFromParent();
3495   return true;
3496 }
3497 
3498 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3499 // FIXME: Why do we handle this one but not other removed instructions?
3500 //
3501 // Reciprocal square root.  The clamp prevents infinite results, clamping
3502 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3503 // +-max_float.
3504 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3505                                                     MachineRegisterInfo &MRI,
3506                                                     MachineIRBuilder &B) const {
3507   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3508     return true;
3509 
3510   Register Dst = MI.getOperand(0).getReg();
3511   Register Src = MI.getOperand(2).getReg();
3512   auto Flags = MI.getFlags();
3513 
3514   LLT Ty = MRI.getType(Dst);
3515 
3516   const fltSemantics *FltSemantics;
3517   if (Ty == LLT::scalar(32))
3518     FltSemantics = &APFloat::IEEEsingle();
3519   else if (Ty == LLT::scalar(64))
3520     FltSemantics = &APFloat::IEEEdouble();
3521   else
3522     return false;
3523 
3524   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3525     .addUse(Src)
3526     .setMIFlags(Flags);
3527 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq already quieted (or not) the input, so use the variant which will
  // directly select.
3530   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3531   const bool UseIEEE = MFI->getMode().IEEE;
3532 
3533   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3534   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3535                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3536 
3537   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3538 
3539   if (UseIEEE)
3540     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3541   else
3542     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3543   MI.eraseFromParent();
3544   return true;
3545 }
3546 
3547 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3548   switch (IID) {
3549   case Intrinsic::amdgcn_ds_fadd:
3550     return AMDGPU::G_ATOMICRMW_FADD;
3551   case Intrinsic::amdgcn_ds_fmin:
3552     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3553   case Intrinsic::amdgcn_ds_fmax:
3554     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3555   default:
3556     llvm_unreachable("not a DS FP intrinsic");
3557   }
3558 }
3559 
3560 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3561                                                       MachineInstr &MI,
3562                                                       Intrinsic::ID IID) const {
3563   GISelChangeObserver &Observer = Helper.Observer;
3564   Observer.changingInstr(MI);
3565 
3566   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3567 
3568   // The remaining operands were used to set fields in the MemOperand on
3569   // construction.
3570   for (int I = 6; I > 3; --I)
3571     MI.RemoveOperand(I);
3572 
3573   MI.RemoveOperand(1); // Remove the intrinsic ID.
3574   Observer.changedInstr(MI);
3575   return true;
3576 }
3577 
3578 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3579                                             MachineRegisterInfo &MRI,
3580                                             MachineIRBuilder &B) const {
3581   uint64_t Offset =
3582     ST.getTargetLowering()->getImplicitParameterOffset(
3583       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3584   LLT DstTy = MRI.getType(DstReg);
3585   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3586 
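  // The implicit arguments are laid out immediately after the explicit kernel
  // arguments, so the pointer is the kernarg segment base plus that offset.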
3587   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3588   if (!loadInputValue(KernargPtrReg, B,
3589                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3590     return false;
3591 
3592   // FIXME: This should be nuw
3593   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3594   return true;
3595 }
3596 
3597 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3598                                                  MachineRegisterInfo &MRI,
3599                                                  MachineIRBuilder &B) const {
3600   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3601   if (!MFI->isEntryFunction()) {
3602     return legalizePreloadedArgIntrin(MI, MRI, B,
3603                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3604   }
3605 
3606   Register DstReg = MI.getOperand(0).getReg();
3607   if (!getImplicitArgPtr(DstReg, MRI, B))
3608     return false;
3609 
3610   MI.eraseFromParent();
3611   return true;
3612 }
3613 
3614 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3615                                               MachineRegisterInfo &MRI,
3616                                               MachineIRBuilder &B,
3617                                               unsigned AddrSpace) const {
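  // A flat pointer is in the queried address space iff the high 32 bits of the
  // pointer match that segment's aperture base.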
3618   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3619   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3620   Register Hi32 = Unmerge.getReg(1);
3621 
3622   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3623   MI.eraseFromParent();
3624   return true;
3625 }
3626 
3627 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3628 // offset (the offset that is included in bounds checking and swizzling, to be
3629 // split between the instruction's voffset and immoffset fields) and soffset
3630 // (the offset that is excluded from bounds checking and swizzling, to go in
3631 // the instruction's soffset field).  This function takes the first kind of
3632 // offset and figures out how to split it between voffset and immoffset.
3633 std::tuple<Register, unsigned, unsigned>
3634 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3635                                         Register OrigOffset) const {
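  // The buffer instructions' immediate offset field is 12 bits, so at most 4095.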
3636   const unsigned MaxImm = 4095;
3637   Register BaseReg;
3638   unsigned TotalConstOffset;
3639   const LLT S32 = LLT::scalar(32);
3640   MachineRegisterInfo &MRI = *B.getMRI();
3641 
3642   std::tie(BaseReg, TotalConstOffset) =
3643       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3644 
3645   unsigned ImmOffset = TotalConstOffset;
3646 
3647   // If BaseReg is a pointer, convert it to int.
3648   if (MRI.getType(BaseReg).isPointer())
3649     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3650 
3651   // If the immediate value is too big for the immoffset field, put the value
3652   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
3654   // of being CSEd with the copy/add for another similar load/store.
3655   // However, do not do that rounding down to a multiple of 4096 if that is a
3656   // negative number, as it appears to be illegal to have a negative offset
3657   // in the vgpr, even if adding the immediate offset makes it positive.
3658   unsigned Overflow = ImmOffset & ~MaxImm;
3659   ImmOffset -= Overflow;
3660   if ((int32_t)Overflow < 0) {
3661     Overflow += ImmOffset;
3662     ImmOffset = 0;
3663   }
3664 
3665   if (Overflow != 0) {
3666     if (!BaseReg) {
3667       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3668     } else {
3669       auto OverflowVal = B.buildConstant(S32, Overflow);
3670       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3671     }
3672   }
3673 
3674   if (!BaseReg)
3675     BaseReg = B.buildConstant(S32, 0).getReg(0);
3676 
3677   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3678 }
3679 
3680 /// Handle register layout difference for f16 images for some subtargets.
3681 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3682                                              MachineRegisterInfo &MRI,
3683                                              Register Reg,
3684                                              bool ImageStore) const {
3685   const LLT S16 = LLT::scalar(16);
3686   const LLT S32 = LLT::scalar(32);
3687   LLT StoreVT = MRI.getType(Reg);
3688   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3689 
3690   if (ST.hasUnpackedD16VMem()) {
3691     auto Unmerge = B.buildUnmerge(S16, Reg);
3692 
3693     SmallVector<Register, 4> WideRegs;
3694     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3695       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3696 
3697     int NumElts = StoreVT.getNumElements();
3698 
3699     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3700         .getReg(0);
3701   }
3702 
3703   if (ImageStore && ST.hasImageStoreD16Bug()) {
3704     if (StoreVT.getNumElements() == 2) {
3705       SmallVector<Register, 4> PackedRegs;
3706       Reg = B.buildBitcast(S32, Reg).getReg(0);
3707       PackedRegs.push_back(Reg);
3708       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3709       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3710           .getReg(0);
3711     }
3712 
3713     if (StoreVT.getNumElements() == 3) {
3714       SmallVector<Register, 4> PackedRegs;
3715       auto Unmerge = B.buildUnmerge(S16, Reg);
3716       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3717         PackedRegs.push_back(Unmerge.getReg(I));
3718       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3719       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3720       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3721     }
3722 
3723     if (StoreVT.getNumElements() == 4) {
3724       SmallVector<Register, 4> PackedRegs;
3725       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3726       auto Unmerge = B.buildUnmerge(S32, Reg);
3727       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3728         PackedRegs.push_back(Unmerge.getReg(I));
3729       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3730       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3731           .getReg(0);
3732     }
3733 
3734     llvm_unreachable("invalid data type");
3735   }
3736 
3737   return Reg;
3738 }
3739 
3740 Register AMDGPULegalizerInfo::fixStoreSourceType(
3741   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3742   MachineRegisterInfo *MRI = B.getMRI();
3743   LLT Ty = MRI->getType(VData);
3744 
3745   const LLT S16 = LLT::scalar(16);
3746 
  // Fixup illegal register types for i8 and i16 stores.
3748   if (Ty == LLT::scalar(8) || Ty == S16) {
3749     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3750     return AnyExt;
3751   }
3752 
3753   if (Ty.isVector()) {
3754     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3755       if (IsFormat)
3756         return handleD16VData(B, *MRI, VData);
3757     }
3758   }
3759 
3760   return VData;
3761 }
3762 
3763 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3764                                               MachineRegisterInfo &MRI,
3765                                               MachineIRBuilder &B,
3766                                               bool IsTyped,
3767                                               bool IsFormat) const {
3768   Register VData = MI.getOperand(1).getReg();
3769   LLT Ty = MRI.getType(VData);
3770   LLT EltTy = Ty.getScalarType();
3771   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3772   const LLT S32 = LLT::scalar(32);
3773 
3774   VData = fixStoreSourceType(B, VData, IsFormat);
3775   Register RSrc = MI.getOperand(2).getReg();
3776 
3777   MachineMemOperand *MMO = *MI.memoperands_begin();
3778   const int MemSize = MMO->getSize();
3779 
3780   unsigned ImmOffset;
3781   unsigned TotalOffset;
3782 
3783   // The typed intrinsics add an immediate after the registers.
3784   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3785 
3786   // The struct intrinsic variants add one additional operand over raw.
3787   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3788   Register VIndex;
3789   int OpOffset = 0;
3790   if (HasVIndex) {
3791     VIndex = MI.getOperand(3).getReg();
3792     OpOffset = 1;
3793   }
3794 
3795   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3796   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3797 
3798   unsigned Format = 0;
3799   if (IsTyped) {
3800     Format = MI.getOperand(5 + OpOffset).getImm();
3801     ++OpOffset;
3802   }
3803 
3804   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3805 
3806   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3807   if (TotalOffset != 0)
3808     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3809 
3810   unsigned Opc;
3811   if (IsTyped) {
3812     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3813                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3814   } else if (IsFormat) {
3815     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3816                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3817   } else {
3818     switch (MemSize) {
3819     case 1:
3820       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3821       break;
3822     case 2:
3823       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3824       break;
3825     default:
3826       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3827       break;
3828     }
3829   }
3830 
3831   if (!VIndex)
3832     VIndex = B.buildConstant(S32, 0).getReg(0);
3833 
3834   auto MIB = B.buildInstr(Opc)
3835     .addUse(VData)              // vdata
3836     .addUse(RSrc)               // rsrc
3837     .addUse(VIndex)             // vindex
3838     .addUse(VOffset)            // voffset
3839     .addUse(SOffset)            // soffset
3840     .addImm(ImmOffset);         // offset(imm)
3841 
3842   if (IsTyped)
3843     MIB.addImm(Format);
3844 
3845   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3846      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3847      .addMemOperand(MMO);
3848 
3849   MI.eraseFromParent();
3850   return true;
3851 }
3852 
3853 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3854                                              MachineRegisterInfo &MRI,
3855                                              MachineIRBuilder &B,
3856                                              bool IsFormat,
3857                                              bool IsTyped) const {
3858   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3859   MachineMemOperand *MMO = *MI.memoperands_begin();
3860   const int MemSize = MMO->getSize();
3861   const LLT S32 = LLT::scalar(32);
3862 
3863   Register Dst = MI.getOperand(0).getReg();
3864   Register RSrc = MI.getOperand(2).getReg();
3865 
3866   // The typed intrinsics add an immediate after the registers.
3867   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3868 
3869   // The struct intrinsic variants add one additional operand over raw.
3870   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3871   Register VIndex;
3872   int OpOffset = 0;
3873   if (HasVIndex) {
3874     VIndex = MI.getOperand(3).getReg();
3875     OpOffset = 1;
3876   }
3877 
3878   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3879   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3880 
3881   unsigned Format = 0;
3882   if (IsTyped) {
3883     Format = MI.getOperand(5 + OpOffset).getImm();
3884     ++OpOffset;
3885   }
3886 
3887   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3888   unsigned ImmOffset;
3889   unsigned TotalOffset;
3890 
3891   LLT Ty = MRI.getType(Dst);
3892   LLT EltTy = Ty.getScalarType();
3893   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3894   const bool Unpacked = ST.hasUnpackedD16VMem();
3895 
3896   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3897   if (TotalOffset != 0)
3898     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3899 
3900   unsigned Opc;
3901 
3902   if (IsTyped) {
3903     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3904                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3905   } else if (IsFormat) {
3906     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3907                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3908   } else {
3909     switch (MemSize) {
3910     case 1:
3911       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3912       break;
3913     case 2:
3914       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3915       break;
3916     default:
3917       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3918       break;
3919     }
3920   }
3921 
3922   Register LoadDstReg;
3923 
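  // Sub-dword loads (and scalar d16 loads) are widened to 32 bits and truncated
  // back after the instruction is built.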
3924   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3925   LLT UnpackedTy = Ty.changeElementSize(32);
3926 
3927   if (IsExtLoad)
3928     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3929   else if (Unpacked && IsD16 && Ty.isVector())
3930     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3931   else
3932     LoadDstReg = Dst;
3933 
3934   if (!VIndex)
3935     VIndex = B.buildConstant(S32, 0).getReg(0);
3936 
3937   auto MIB = B.buildInstr(Opc)
3938     .addDef(LoadDstReg)         // vdata
3939     .addUse(RSrc)               // rsrc
3940     .addUse(VIndex)             // vindex
3941     .addUse(VOffset)            // voffset
3942     .addUse(SOffset)            // soffset
3943     .addImm(ImmOffset);         // offset(imm)
3944 
3945   if (IsTyped)
3946     MIB.addImm(Format);
3947 
3948   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3949      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3950      .addMemOperand(MMO);
3951 
3952   if (LoadDstReg != Dst) {
3953     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3954 
    // The result was widened for the extending load; truncate back to the
    // destination type.
3956     if (IsExtLoad)
3957       B.buildTrunc(Dst, LoadDstReg);
3958     else {
3959       // Repack to original 16-bit vector result
3960       // FIXME: G_TRUNC should work, but legalization currently fails
3961       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3962       SmallVector<Register, 4> Repack;
3963       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3964         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3965       B.buildMerge(Dst, Repack);
3966     }
3967   }
3968 
3969   MI.eraseFromParent();
3970   return true;
3971 }
3972 
3973 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3974                                                MachineIRBuilder &B,
3975                                                bool IsInc) const {
3976   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3977                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3978   B.buildInstr(Opc)
3979     .addDef(MI.getOperand(0).getReg())
3980     .addUse(MI.getOperand(2).getReg())
3981     .addUse(MI.getOperand(3).getReg())
3982     .cloneMemRefs(MI);
3983   MI.eraseFromParent();
3984   return true;
3985 }
3986 
3987 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3988   switch (IntrID) {
3989   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3990   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3991     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3992   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3993   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3994     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3995   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3996   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3997     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3998   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3999   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4000     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4001   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4002   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4003     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4004   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4005   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4006     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4007   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4008   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4009     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4010   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4011   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4012     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4013   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4014   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4015     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4016   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4017   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4018     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4019   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4020   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4021     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4022   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4023   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4024     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4025   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4027     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4028   case Intrinsic::amdgcn_buffer_atomic_fadd:
4029   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4031     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4032   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4033   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4034     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4035   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4037     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4038   default:
4039     llvm_unreachable("unhandled atomic opcode");
4040   }
4041 }
4042 
4043 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4044                                                MachineIRBuilder &B,
4045                                                Intrinsic::ID IID) const {
4046   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4047                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4048   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4049 
4050   Register Dst;
4051 
4052   int OpOffset = 0;
4053   if (HasReturn) {
4054     // A few FP atomics do not support return values.
4055     Dst = MI.getOperand(0).getReg();
4056   } else {
4057     OpOffset = -1;
4058   }
4059 
4060   Register VData = MI.getOperand(2 + OpOffset).getReg();
4061   Register CmpVal;
4062 
4063   if (IsCmpSwap) {
4064     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4065     ++OpOffset;
4066   }
4067 
4068   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4069   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4070 
4071   // The struct intrinsic variants add one additional operand over raw.
4072   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4073   Register VIndex;
4074   if (HasVIndex) {
4075     VIndex = MI.getOperand(4 + OpOffset).getReg();
4076     ++OpOffset;
4077   }
4078 
4079   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4080   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4081   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4082 
4083   MachineMemOperand *MMO = *MI.memoperands_begin();
4084 
4085   unsigned ImmOffset;
4086   unsigned TotalOffset;
4087   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4088   if (TotalOffset != 0)
4089     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4090 
4091   if (!VIndex)
4092     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4093 
4094   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4095 
4096   if (HasReturn)
4097     MIB.addDef(Dst);
4098 
4099   MIB.addUse(VData); // vdata
4100 
4101   if (IsCmpSwap)
4102     MIB.addReg(CmpVal);
4103 
4104   MIB.addUse(RSrc)               // rsrc
4105      .addUse(VIndex)             // vindex
4106      .addUse(VOffset)            // voffset
4107      .addUse(SOffset)            // soffset
4108      .addImm(ImmOffset)          // offset(imm)
4109      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4110      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4111      .addMemOperand(MMO);
4112 
4113   MI.eraseFromParent();
4114   return true;
4115 }
4116 
/// Pack the s16 typed address operands of an image instruction into dword sized
/// vectors with s16 typed elements, appended to \p PackedAddrs.
4119 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4120                                       SmallVectorImpl<Register> &PackedAddrs,
4121                                       unsigned ArgOffset,
4122                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4123                                       bool IsA16, bool IsG16) {
4124   const LLT S16 = LLT::scalar(16);
4125   const LLT V2S16 = LLT::fixed_vector(2, 16);
4126   auto EndIdx = Intr->VAddrEnd;
4127 
4128   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4129     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4130     if (!SrcOp.isReg())
4131       continue; // _L to _LZ may have eliminated this.
4132 
4133     Register AddrReg = SrcOp.getReg();
4134 
4135     if (I < Intr->GradientStart) {
4136       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4137       PackedAddrs.push_back(AddrReg);
4138     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4139                (I >= Intr->CoordStart && !IsA16)) {
4140       // Handle any gradient or coordinate operands that should not be packed
4141       PackedAddrs.push_back(AddrReg);
4142     } else {
4143       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4144       // derivatives dx/dh and dx/dv are packed with undef.
4145       if (((I + 1) >= EndIdx) ||
4146           ((Intr->NumGradients / 2) % 2 == 1 &&
4147            (I == static_cast<unsigned>(Intr->GradientStart +
4148                                        (Intr->NumGradients / 2) - 1) ||
4149             I == static_cast<unsigned>(Intr->GradientStart +
4150                                        Intr->NumGradients - 1))) ||
4151           // Check for _L to _LZ optimization
4152           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4153         PackedAddrs.push_back(
4154             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4155                 .getReg(0));
4156       } else {
4157         PackedAddrs.push_back(
4158             B.buildBuildVector(
4159                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4160                 .getReg(0));
4161         ++I;
4162       }
4163     }
4164   }
4165 }
4166 
4167 /// Convert from separate vaddr components to a single vector address register,
4168 /// and replace the remaining operands with $noreg.
4169 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4170                                      int DimIdx, int NumVAddrs) {
4171   const LLT S32 = LLT::scalar(32);
4172 
4173   SmallVector<Register, 8> AddrRegs;
4174   for (int I = 0; I != NumVAddrs; ++I) {
4175     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4176     if (SrcOp.isReg()) {
4177       AddrRegs.push_back(SrcOp.getReg());
4178       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4179     }
4180   }
4181 
4182   int NumAddrRegs = AddrRegs.size();
4183   if (NumAddrRegs != 1) {
4184     // Round up to 8 elements for v5-v7
4185     // FIXME: Missing intermediate sized register classes and instructions.
4186     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4187       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4188       auto Undef = B.buildUndef(S32);
4189       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4190       NumAddrRegs = RoundedNumRegs;
4191     }
4192 
4193     auto VAddr =
4194         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4195     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4196   }
4197 
4198   for (int I = 1; I != NumVAddrs; ++I) {
4199     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4200     if (SrcOp.isReg())
4201       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4202   }
4203 }
4204 
4205 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4206 ///
4207 /// Depending on the subtarget, load/store with 16-bit element data need to be
4208 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4209 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4210 /// registers.
4211 ///
4212 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4217 /// now unnecessary arguments with $noreg.
4218 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4219     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4220     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4221 
4222   const unsigned NumDefs = MI.getNumExplicitDefs();
4223   const unsigned ArgOffset = NumDefs + 1;
4224   bool IsTFE = NumDefs == 2;
4225   // We are only processing the operands of d16 image operations on subtargets
4226   // that use the unpacked register layout, or need to repack the TFE result.
4227 
4228   // TODO: Do we need to guard against already legalized intrinsics?
4229   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4230       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4231 
4232   MachineRegisterInfo *MRI = B.getMRI();
4233   const LLT S32 = LLT::scalar(32);
4234   const LLT S16 = LLT::scalar(16);
4235   const LLT V2S16 = LLT::fixed_vector(2, 16);
4236 
4237   unsigned DMask = 0;
4238 
4239   // Check for 16 bit addresses and pack if true.
4240   LLT GradTy =
4241       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4242   LLT AddrTy =
4243       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4244   const bool IsG16 = GradTy == S16;
4245   const bool IsA16 = AddrTy == S16;
4246 
4247   int DMaskLanes = 0;
4248   if (!BaseOpcode->Atomic) {
4249     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4250     if (BaseOpcode->Gather4) {
4251       DMaskLanes = 4;
4252     } else if (DMask != 0) {
4253       DMaskLanes = countPopulation(DMask);
4254     } else if (!IsTFE && !BaseOpcode->Store) {
4255       // If dmask is 0, this is a no-op load. This can be eliminated.
4256       B.buildUndef(MI.getOperand(0));
4257       MI.eraseFromParent();
4258       return true;
4259     }
4260   }
4261 
4262   Observer.changingInstr(MI);
4263   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4264 
4265   unsigned NewOpcode = NumDefs == 0 ?
4266     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4267 
4268   // Track that we legalized this
4269   MI.setDesc(B.getTII().get(NewOpcode));
4270 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4273   if (IsTFE && DMask == 0) {
4274     DMask = 0x1;
4275     DMaskLanes = 1;
4276     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4277   }
4278 
4279   if (BaseOpcode->Atomic) {
4280     Register VData0 = MI.getOperand(2).getReg();
4281     LLT Ty = MRI->getType(VData0);
4282 
4283     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4284     if (Ty.isVector())
4285       return false;
4286 
4287     if (BaseOpcode->AtomicX2) {
4288       Register VData1 = MI.getOperand(3).getReg();
4289       // The two values are packed in one register.
4290       LLT PackedTy = LLT::fixed_vector(2, Ty);
4291       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4292       MI.getOperand(2).setReg(Concat.getReg(0));
4293       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4294     }
4295   }
4296 
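       // Track how many vaddr operands will remain after the constant lod/mip
       // folding below; this count feeds the NSA-vs-packed decision later on.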
4297   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4298 
4299   // Optimize _L to _LZ when _L is zero
4300   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4301           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4302     const ConstantFP *ConstantLod;
4303 
4304     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4305                  m_GFCst(ConstantLod))) {
4306       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4307         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4308         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4309             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4310                                                       Intr->Dim);
4311 
4312         // The starting indexes should remain in the same place.
4313         --CorrectedNumVAddrs;
4314 
4315         MI.getOperand(MI.getNumExplicitDefs())
4316             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4317         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4318         Intr = NewImageDimIntr;
4319       }
4320     }
4321   }
4322 
4323   // Optimize _mip away when 'lod' is zero.
4324   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4325     int64_t ConstantLod;
4326     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4327                  m_ICst(ConstantLod))) {
4328       if (ConstantLod == 0) {
4329         // TODO: Change the intrinsic opcode and remove the operand instead
4330         // of replacing it with 0, as is done for the _L to _LZ handling above.
4331         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4332         --CorrectedNumVAddrs;
4333       }
4334     }
4335   }
4336 
4337   // Rewrite the addressing register layout before doing anything else.
4338   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4339     // 16-bit gradients are supported, but they are tied to the A16 control,
4340     // so both gradients and addresses must be 16-bit.
4341     return false;
4342   }
4343 
4344   if (IsA16 && !ST.hasA16()) {
4345     // A16 not supported
4346     return false;
4347   }
4348 
4349   if (IsA16 || IsG16) {
4350     if (Intr->NumVAddrs > 1) {
4351       SmallVector<Register, 4> PackedRegs;
4352 
4353       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4354                                 IsG16);
4355 
4356       // See also below in the non-a16 branch
4357       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4358 
4359       if (!UseNSA && PackedRegs.size() > 1) {
4360         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4361         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4362         PackedRegs[0] = Concat.getReg(0);
4363         PackedRegs.resize(1);
4364       }
4365 
4366       const unsigned NumPacked = PackedRegs.size();
4367       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4368         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4369         if (!SrcOp.isReg()) {
4370           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4371           continue;
4372         }
4373 
4374         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4375 
4376         if (I - Intr->VAddrStart < NumPacked)
4377           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4378         else
4379           SrcOp.setReg(AMDGPU::NoRegister);
4380       }
4381     }
4382   } else {
4383     // If the register allocator cannot place the address registers contiguously
4384     // without introducing moves, then using the non-sequential address encoding
4385     // is always preferable, since it saves VALU instructions and is usually a
4386     // wash or better in terms of code size.
4387     //
4388     // However, we currently have no way of hinting to the register allocator
4389     // that MIMG addresses should be placed contiguously when it is possible to
4390     // do so, so force non-NSA for the common 2-address case as a heuristic.
4391     //
4392     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4393     // allocation when possible.
4394     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4395 
4396     if (!UseNSA && Intr->NumVAddrs > 1)
4397       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4398                                Intr->NumVAddrs);
4399   }
4400 
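       // Record the 16-bit addressing modes in a trailing immediate operand:
       // bit 0 = A16, bit 1 = G16.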
4401   int Flags = 0;
4402   if (IsA16)
4403     Flags |= 1;
4404   if (IsG16)
4405     Flags |= 2;
4406   MI.addOperand(MachineOperand::CreateImm(Flags));
4407 
4408   if (BaseOpcode->Store) { // No TFE for stores?
4409     // TODO: Handle dmask trim
4410     Register VData = MI.getOperand(1).getReg();
4411     LLT Ty = MRI->getType(VData);
4412     if (!Ty.isVector() || Ty.getElementType() != S16)
4413       return true;
4414 
4415     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4416     if (RepackedReg != VData) {
4417       MI.getOperand(1).setReg(RepackedReg);
4418     }
4419 
4420     return true;
4421   }
4422 
4423   Register DstReg = MI.getOperand(0).getReg();
4424   LLT Ty = MRI->getType(DstReg);
4425   const LLT EltTy = Ty.getScalarType();
4426   const bool IsD16 = Ty.getScalarType() == S16;
4427   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4428 
4429   // Confirm that the return type is large enough for the dmask specified
4430   if (NumElts < DMaskLanes)
4431     return false;
4432 
4433   if (NumElts > 4 || DMaskLanes > 4)
4434     return false;
4435 
4436   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4437   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4438 
4439   // The raw dword-aligned data component of the load. The only legal cases
4440   // where this matters should be when using the packed D16 format, for
4441   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4442   LLT RoundedTy;
4443 
4444   // S32 vector to cover all data, plus the TFE result element.
4445   LLT TFETy;
4446 
4447   // Register type to use for each loaded component. Will be S32 or V2S16.
4448   LLT RegTy;
4449 
4450   if (IsD16 && ST.hasUnpackedD16VMem()) {
4451     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4452     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4453     RegTy = S32;
4454   } else {
4455     unsigned EltSize = EltTy.getSizeInBits();
4456     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4457     unsigned RoundedSize = 32 * RoundedElts;
4458     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4459     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4460     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4461   }
4462 
4463   // The return type does not need adjustment.
4464   // TODO: Should we change s16 case to s32 or <2 x s16>?
4465   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4466     return true;
4467 
4468   Register Dst1Reg;
4469 
4470   // Insert after the instruction.
4471   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4472 
4473   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4474   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4475   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4476   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4477 
4478   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4479 
4480   MI.getOperand(0).setReg(NewResultReg);
4481 
4482   // In the IR, TFE is supposed to be used with a 2-element struct return
4483   // type. The instruction really returns these two values in one contiguous
4484   // register, with one additional dword beyond the loaded data. Rewrite the
4485   // return type to use a single register result.
4486 
4487   if (IsTFE) {
4488     Dst1Reg = MI.getOperand(1).getReg();
4489     if (MRI->getType(Dst1Reg) != S32)
4490       return false;
4491 
4492     // TODO: Make sure the TFE operand bit is set.
4493     MI.RemoveOperand(1);
4494 
4495     // Handle the easy case that requires no repack instructions.
4496     if (Ty == S32) {
4497       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4498       return true;
4499     }
4500   }
4501 
4502   // Now figure out how to copy the new result register back into the old
4503   // result.
4504   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
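       // Every slot starts out as Dst1Reg; in the TFE case the trailing status
       // dword from the unmerge below is then written directly to the user's
       // second result, while the data slots are replaced with fresh registers.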
4505 
4506   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4507 
4508   if (ResultNumRegs == 1) {
4509     assert(!IsTFE);
4510     ResultRegs[0] = NewResultReg;
4511   } else {
4512     // We have to repack into a new vector of some kind.
4513     for (int I = 0; I != NumDataRegs; ++I)
4514       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4515     B.buildUnmerge(ResultRegs, NewResultReg);
4516 
4517     // Drop the final TFE element to get the data part. The TFE result is
4518     // directly written to the right place already.
4519     if (IsTFE)
4520       ResultRegs.resize(NumDataRegs);
4521   }
4522 
4523   // For an s16 scalar result, we form an s32 result with a truncate regardless
4524   // of packed vs. unpacked.
4525   if (IsD16 && !Ty.isVector()) {
4526     B.buildTrunc(DstReg, ResultRegs[0]);
4527     return true;
4528   }
4529 
4530   // Avoid a build/concat_vector of 1 entry.
4531   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4532     B.buildBitcast(DstReg, ResultRegs[0]);
4533     return true;
4534   }
4535 
4536   assert(Ty.isVector());
4537 
4538   if (IsD16) {
4539     // For packed D16 results with TFE enabled, all the data components are
4540     // S32. Cast back to the expected type.
4541     //
4542     // TODO: We don't really need to load s32 elements. We would only need one
4543     // cast for the TFE result if a multiple of v2s16 were used.
4544     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4545       for (Register &Reg : ResultRegs)
4546         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4547     } else if (ST.hasUnpackedD16VMem()) {
4548       for (Register &Reg : ResultRegs)
4549         Reg = B.buildTrunc(S16, Reg).getReg(0);
4550     }
4551   }
4552 
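       // Append NumElts copies of an undef value of type Ty to ResultRegs.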
4553   auto padWithUndef = [&](LLT Ty, int NumElts) {
4554     if (NumElts == 0)
4555       return;
4556     Register Undef = B.buildUndef(Ty).getReg(0);
4557     for (int I = 0; I != NumElts; ++I)
4558       ResultRegs.push_back(Undef);
4559   };
4560 
4561   // Pad out any elements eliminated due to the dmask.
4562   LLT ResTy = MRI->getType(ResultRegs[0]);
4563   if (!ResTy.isVector()) {
4564     padWithUndef(ResTy, NumElts - ResultRegs.size());
4565     B.buildBuildVector(DstReg, ResultRegs);
4566     return true;
4567   }
4568 
4569   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4570   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4571 
4572   // Deal with the one annoying legal case.
4573   const LLT V3S16 = LLT::fixed_vector(3, 16);
4574   if (Ty == V3S16) {
4575     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4576     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4577     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4578     return true;
4579   }
4580 
4581   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4582   B.buildConcatVectors(DstReg, ResultRegs);
4583   return true;
4584 }
4585 
4586 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4587   LegalizerHelper &Helper, MachineInstr &MI) const {
4588   MachineIRBuilder &B = Helper.MIRBuilder;
4589   GISelChangeObserver &Observer = Helper.Observer;
4590 
4591   Register Dst = MI.getOperand(0).getReg();
4592   LLT Ty = B.getMRI()->getType(Dst);
4593   unsigned Size = Ty.getSizeInBits();
4594   MachineFunction &MF = B.getMF();
4595 
4596   Observer.changingInstr(MI);
4597 
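       // If this result type is better legalized through a bitcast-compatible
       // register type, rewrite the destination first and restore the insert
       // point back to the load.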
4598   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4599     Ty = getBitcastRegisterType(Ty);
4600     Helper.bitcastDst(MI, Ty, 0);
4601     Dst = MI.getOperand(0).getReg();
4602     B.setInsertPt(B.getMBB(), MI);
4603   }
4604 
4605   // FIXME: We don't really need this intermediate instruction. The intrinsic
4606   // should be fixed to have a memory operand. Since it's readnone, we're not
4607   // allowed to add one.
4608   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4609   MI.RemoveOperand(1); // Remove intrinsic ID
4610 
4611   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4612   // TODO: Should this use datalayout alignment?
4613   const unsigned MemSize = (Size + 7) / 8;
4614   const Align MemAlign(4);
4615   MachineMemOperand *MMO = MF.getMachineMemOperand(
4616       MachinePointerInfo(),
4617       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4618           MachineMemOperand::MOInvariant,
4619       MemSize, MemAlign);
4620   MI.addMemOperand(MF, MMO);
4621 
4622   // There are no 96-bit result scalar loads, but widening to 128-bit should
4623   // always be legal. We may need to restore this to a 96-bit result if it turns
4624   // out this needs to be converted to a vector load during RegBankSelect.
4625   if (!isPowerOf2_32(Size)) {
4626     if (Ty.isVector())
4627       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4628     else
4629       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4630   }
4631 
4632   Observer.changedInstr(MI);
4633   return true;
4634 }
4635 
4636 // TODO: Move to selection
4637 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4638                                                 MachineRegisterInfo &MRI,
4639                                                 MachineIRBuilder &B) const {
4640   if (!ST.isTrapHandlerEnabled() ||
4641       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4642     return legalizeTrapEndpgm(MI, MRI, B);
4643 
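       // Dispatch on the HSA code object ABI version: V2/V3 always pass the
       // queue pointer to the trap handler, while V4 uses the doorbell ID when
       // the subtarget supports retrieving it.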
4644   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4645     switch (*HsaAbiVer) {
4646     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4647     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4648       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4649     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4650       return ST.supportsGetDoorbellID() ?
4651           legalizeTrapHsa(MI, MRI, B) :
4652           legalizeTrapHsaQueuePtr(MI, MRI, B);
4653     }
4654   }
4655 
4656   llvm_unreachable("Unknown trap handler");
4657 }
4658 
4659 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4660     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4661   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4662   MI.eraseFromParent();
4663   return true;
4664 }
4665 
4666 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4667     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4668   // Pass queue pointer to trap handler as input, and insert trap instruction
4669   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4670   Register LiveIn =
4671     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4672   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4673     return false;
4674 
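       // The trap handler ABI expects the queue pointer in SGPR0:SGPR1.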
4675   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4676   B.buildCopy(SGPR01, LiveIn);
4677   B.buildInstr(AMDGPU::S_TRAP)
4678       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4679       .addReg(SGPR01, RegState::Implicit);
4680 
4681   MI.eraseFromParent();
4682   return true;
4683 }
4684 
4685 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4686     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4687   B.buildInstr(AMDGPU::S_TRAP)
4688       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4689   MI.eraseFromParent();
4690   return true;
4691 }
4692 
4693 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4694     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4695   // If this is a non-HSA path or the trap handler is disabled, report a
4696   // warning.
4697   if (!ST.isTrapHandlerEnabled() ||
4698       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4699     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4700                                      "debugtrap handler not supported",
4701                                      MI.getDebugLoc(), DS_Warning);
4702     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4703     Ctx.diagnose(NoTrap);
4704   } else {
4705     // Insert debug-trap instruction
4706     B.buildInstr(AMDGPU::S_TRAP)
4707         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4708   }
4709 
4710   MI.eraseFromParent();
4711   return true;
4712 }
4713 
4714 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4715                                                MachineIRBuilder &B) const {
4716   MachineRegisterInfo &MRI = *B.getMRI();
4717   const LLT S16 = LLT::scalar(16);
4718   const LLT S32 = LLT::scalar(32);
4719 
4720   Register DstReg = MI.getOperand(0).getReg();
4721   Register NodePtr = MI.getOperand(2).getReg();
4722   Register RayExtent = MI.getOperand(3).getReg();
4723   Register RayOrigin = MI.getOperand(4).getReg();
4724   Register RayDir = MI.getOperand(5).getReg();
4725   Register RayInvDir = MI.getOperand(6).getReg();
4726   Register TDescr = MI.getOperand(7).getReg();
4727 
4728   if (!ST.hasGFX10_AEncoding()) {
4729     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4730                                         "intrinsic not supported on subtarget",
4731                                         MI.getDebugLoc());
4732     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4733     return false;
4734   }
4735 
4736   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4737   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4738   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4739                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4740                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4741                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4742 
4743   SmallVector<Register, 12> Ops;
4744   if (Is64) {
4745     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4746     Ops.push_back(Unmerge.getReg(0));
4747     Ops.push_back(Unmerge.getReg(1));
4748   } else {
4749     Ops.push_back(NodePtr);
4750   }
4751   Ops.push_back(RayExtent);
4752 
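       // Push the first three 32-bit lanes of a vector source as separate
       // scalar operands; the fourth unmerged lane is unused.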
4753   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4754     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4755     Ops.push_back(Unmerge.getReg(0));
4756     Ops.push_back(Unmerge.getReg(1));
4757     Ops.push_back(Unmerge.getReg(2));
4758   };
4759 
4760   packLanes(RayOrigin);
4761   if (IsA16) {
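         // With 16-bit components, pack the ray direction and inverse direction
         // into three dwords: (dir.x, dir.y), (dir.z, invdir.x), (invdir.y,
         // invdir.z).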
4762     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4763     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4764     Register R1 = MRI.createGenericVirtualRegister(S32);
4765     Register R2 = MRI.createGenericVirtualRegister(S32);
4766     Register R3 = MRI.createGenericVirtualRegister(S32);
4767     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4768     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4769     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4770     Ops.push_back(R1);
4771     Ops.push_back(R2);
4772     Ops.push_back(R3);
4773   } else {
4774     packLanes(RayDir);
4775     packLanes(RayInvDir);
4776   }
4777 
4778   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4779     .addDef(DstReg)
4780     .addImm(Opcode);
4781 
4782   for (Register R : Ops) {
4783     MIB.addUse(R);
4784   }
4785 
4786   MIB.addUse(TDescr)
4787      .addImm(IsA16 ? 1 : 0)
4788      .cloneMemRefs(MI);
4789 
4790   MI.eraseFromParent();
4791   return true;
4792 }
4793 
4794 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4795                                             MachineInstr &MI) const {
4796   MachineIRBuilder &B = Helper.MIRBuilder;
4797   MachineRegisterInfo &MRI = *B.getMRI();
4798 
4799   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4800   auto IntrID = MI.getIntrinsicID();
4801   switch (IntrID) {
4802   case Intrinsic::amdgcn_if:
4803   case Intrinsic::amdgcn_else: {
4804     MachineInstr *Br = nullptr;
4805     MachineBasicBlock *UncondBrTarget = nullptr;
4806     bool Negated = false;
4807     if (MachineInstr *BrCond =
4808             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4809       const SIRegisterInfo *TRI
4810         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4811 
4812       Register Def = MI.getOperand(1).getReg();
4813       Register Use = MI.getOperand(3).getReg();
4814 
4815       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4816 
4817       if (Negated)
4818         std::swap(CondBrTarget, UncondBrTarget);
4819 
4820       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4821       if (IntrID == Intrinsic::amdgcn_if) {
4822         B.buildInstr(AMDGPU::SI_IF)
4823           .addDef(Def)
4824           .addUse(Use)
4825           .addMBB(UncondBrTarget);
4826       } else {
4827         B.buildInstr(AMDGPU::SI_ELSE)
4828             .addDef(Def)
4829             .addUse(Use)
4830             .addMBB(UncondBrTarget);
4831       }
4832 
4833       if (Br) {
4834         Br->getOperand(0).setMBB(CondBrTarget);
4835       } else {
4836         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4837         // since we're swapping branch targets it needs to be reinserted.
4838         // FIXME: IRTranslator should probably not do this
4839         B.buildBr(*CondBrTarget);
4840       }
4841 
4842       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4843       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4844       MI.eraseFromParent();
4845       BrCond->eraseFromParent();
4846       return true;
4847     }
4848 
4849     return false;
4850   }
4851   case Intrinsic::amdgcn_loop: {
4852     MachineInstr *Br = nullptr;
4853     MachineBasicBlock *UncondBrTarget = nullptr;
4854     bool Negated = false;
4855     if (MachineInstr *BrCond =
4856             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4857       const SIRegisterInfo *TRI
4858         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4859 
4860       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4861       Register Reg = MI.getOperand(2).getReg();
4862 
4863       if (Negated)
4864         std::swap(CondBrTarget, UncondBrTarget);
4865 
4866       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4867       B.buildInstr(AMDGPU::SI_LOOP)
4868         .addUse(Reg)
4869         .addMBB(UncondBrTarget);
4870 
4871       if (Br)
4872         Br->getOperand(0).setMBB(CondBrTarget);
4873       else
4874         B.buildBr(*CondBrTarget);
4875 
4876       MI.eraseFromParent();
4877       BrCond->eraseFromParent();
4878       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4879       return true;
4880     }
4881 
4882     return false;
4883   }
4884   case Intrinsic::amdgcn_kernarg_segment_ptr:
4885     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4886       // This only makes sense to call in a kernel, so just lower to null.
4887       B.buildConstant(MI.getOperand(0).getReg(), 0);
4888       MI.eraseFromParent();
4889       return true;
4890     }
4891 
4892     return legalizePreloadedArgIntrin(
4893       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4894   case Intrinsic::amdgcn_implicitarg_ptr:
4895     return legalizeImplicitArgPtr(MI, MRI, B);
4896   case Intrinsic::amdgcn_workitem_id_x:
4897     return legalizePreloadedArgIntrin(MI, MRI, B,
4898                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4899   case Intrinsic::amdgcn_workitem_id_y:
4900     return legalizePreloadedArgIntrin(MI, MRI, B,
4901                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4902   case Intrinsic::amdgcn_workitem_id_z:
4903     return legalizePreloadedArgIntrin(MI, MRI, B,
4904                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4905   case Intrinsic::amdgcn_workgroup_id_x:
4906     return legalizePreloadedArgIntrin(MI, MRI, B,
4907                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4908   case Intrinsic::amdgcn_workgroup_id_y:
4909     return legalizePreloadedArgIntrin(MI, MRI, B,
4910                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4911   case Intrinsic::amdgcn_workgroup_id_z:
4912     return legalizePreloadedArgIntrin(MI, MRI, B,
4913                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4914   case Intrinsic::amdgcn_dispatch_ptr:
4915     return legalizePreloadedArgIntrin(MI, MRI, B,
4916                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4917   case Intrinsic::amdgcn_queue_ptr:
4918     return legalizePreloadedArgIntrin(MI, MRI, B,
4919                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4920   case Intrinsic::amdgcn_implicit_buffer_ptr:
4921     return legalizePreloadedArgIntrin(
4922       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4923   case Intrinsic::amdgcn_dispatch_id:
4924     return legalizePreloadedArgIntrin(MI, MRI, B,
4925                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4926   case Intrinsic::amdgcn_fdiv_fast:
4927     return legalizeFDIVFastIntrin(MI, MRI, B);
4928   case Intrinsic::amdgcn_is_shared:
4929     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4930   case Intrinsic::amdgcn_is_private:
4931     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4932   case Intrinsic::amdgcn_wavefrontsize: {
4933     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4934     MI.eraseFromParent();
4935     return true;
4936   }
4937   case Intrinsic::amdgcn_s_buffer_load:
4938     return legalizeSBufferLoad(Helper, MI);
4939   case Intrinsic::amdgcn_raw_buffer_store:
4940   case Intrinsic::amdgcn_struct_buffer_store:
4941     return legalizeBufferStore(MI, MRI, B, false, false);
4942   case Intrinsic::amdgcn_raw_buffer_store_format:
4943   case Intrinsic::amdgcn_struct_buffer_store_format:
4944     return legalizeBufferStore(MI, MRI, B, false, true);
4945   case Intrinsic::amdgcn_raw_tbuffer_store:
4946   case Intrinsic::amdgcn_struct_tbuffer_store:
4947     return legalizeBufferStore(MI, MRI, B, true, true);
4948   case Intrinsic::amdgcn_raw_buffer_load:
4949   case Intrinsic::amdgcn_struct_buffer_load:
4950     return legalizeBufferLoad(MI, MRI, B, false, false);
4951   case Intrinsic::amdgcn_raw_buffer_load_format:
4952   case Intrinsic::amdgcn_struct_buffer_load_format:
4953     return legalizeBufferLoad(MI, MRI, B, true, false);
4954   case Intrinsic::amdgcn_raw_tbuffer_load:
4955   case Intrinsic::amdgcn_struct_tbuffer_load:
4956     return legalizeBufferLoad(MI, MRI, B, true, true);
4957   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4958   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4959   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4960   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4961   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4962   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4963   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4964   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4965   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4966   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4967   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4968   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4969   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4970   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4971   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4972   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4973   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4974   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4975   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4976   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4977   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4978   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4979   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4980   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4981   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4982   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4985   case Intrinsic::amdgcn_buffer_atomic_fadd:
4986   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4987   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4988   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4989   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4990     return legalizeBufferAtomic(MI, B, IntrID);
4991   case Intrinsic::amdgcn_atomic_inc:
4992     return legalizeAtomicIncDec(MI, B, true);
4993   case Intrinsic::amdgcn_atomic_dec:
4994     return legalizeAtomicIncDec(MI, B, false);
4995   case Intrinsic::trap:
4996     return legalizeTrapIntrinsic(MI, MRI, B);
4997   case Intrinsic::debugtrap:
4998     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4999   case Intrinsic::amdgcn_rsq_clamp:
5000     return legalizeRsqClampIntrinsic(MI, MRI, B);
5001   case Intrinsic::amdgcn_ds_fadd:
5002   case Intrinsic::amdgcn_ds_fmin:
5003   case Intrinsic::amdgcn_ds_fmax:
5004     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5005   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5006     return legalizeBVHIntrinsic(MI, B);
5007   default: {
5008     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5009             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5010       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5011     return true;
5012   }
5013   }
5014 
5015   return true;
5016 }
5017