1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
54 
// Round the scalar bit width up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
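/// For example, <3 x s16> (48 bits) satisfies all of these conditions, while
/// <3 x s32> (a multiple of 32 bits) and <4 x s16> (even element count) do
/// not.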
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
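// Reduce a vector to roughly 64-bit pieces, e.g. <4 x s32> (128 bits) becomes
// <2 x s32>. This is applied (via fewerElementsIf) to vectors wider than 64
// bits.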
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
113 // Increase the number of vector elements to reach the next multiple of 32-bit
114 // type.
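// For example, <3 x s16> (48 bits) is rounded up to 64 bits and becomes
// <4 x s16>. Note that NextMul32 below is the number of 32-bit words, not the
// rounded bit size.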
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
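// Pick the register-sized type to bitcast a load/store value to: anything up
// to 32 bits becomes a single scalar, larger types become a scalar or vector
// of 32-bit elements (e.g. <6 x s16> -> <3 x s32>, s96 -> <3 x s32>).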
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
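// For example, s32, s64, <2 x s16>, <4 x s16>, <2 x s32> and <16 x s32>
// (512 bits) are register types, while s48, <3 x s16> and <3 x s8> are not.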
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
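// For example, private (scratch) accesses without flat scratch enabled are
// limited to 32 bits, while global/constant loads may be up to 512 bits and
// global/constant stores up to 128 bits.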
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247     // Treat constant and global as identical. SMRD loads are sometimes usable for
248     // global loads (ideally constant address space should be eliminated)
249     // depending on the context. Legality cannot be context dependent, but
250     // RegBankSelect can split the load as necessary depending on the pointer
251     // register bank/uniformity and if the memory is invariant or not written in a
252     // kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
323 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
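// For example, <6 x s16> and s96 loads are not selectable today, so they are
// bitcast to <3 x s32> instead; <4 x s32> and <2 x s64> are left as-is.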
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
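/// For example, <4 x s8> (32 bits) is bitcast to s32 and <8 x s8> (64 bits)
/// to <2 x s32>, since s8 is not a valid register element type.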
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case is when the memory access itself
/// changes, not the size of the result register.
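/// For example, a 96-bit load with 128-bit alignment can be widened to a
/// 128-bit load on subtargets without 96-bit load/store instructions, since
/// the memory is known dereferenceable up to the alignment.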
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
  : ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541     // Technically the saturating operations require clamp bit support, but this
542     // was introduced at the same time as 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585     .customFor({S32, S64})
586     .clampScalar(0, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .scalarize(0);
589 
590   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591                    .legalFor({S32})
592                    .maxScalarOrElt(0, S32);
593 
594   if (ST.hasVOP3PInsts()) {
595     Mulh
596       .clampMaxNumElements(0, S8, 2)
597       .lowerFor({V2S8});
598   }
599 
600   Mulh
601     .scalarize(0)
602     .lower();
603 
604   // Report legal for any types we can handle anywhere. For the cases only legal
605   // on the SALU, RegBankSelect will be able to re-legalize.
606   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608     .clampScalar(0, S32, S64)
609     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611     .widenScalarToNextPow2(0)
612     .scalarize(0);
613 
614   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616     .legalFor({{S32, S1}, {S32, S32}})
617     .minScalar(0, S32)
618     // TODO: .scalarize(0)
619     .lower();
620 
621   getActionDefinitionsBuilder(G_BITCAST)
622     // Don't worry about the size constraint.
623     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624     .lower();
625 
626 
627   getActionDefinitionsBuilder(G_CONSTANT)
628     .legalFor({S1, S32, S64, S16, GlobalPtr,
629                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630     .legalIf(isPointer(0))
631     .clampScalar(0, S32, S64)
632     .widenScalarToNextPow2(0);
633 
634   getActionDefinitionsBuilder(G_FCONSTANT)
635     .legalFor({S32, S64, S16})
636     .clampScalar(0, S16, S64);
637 
638   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639       .legalIf(isRegisterType(0))
640       // s1 and s16 are special cases because they have legal operations on
641       // them, but don't really occupy registers in the normal way.
642       .legalFor({S1, S16})
643       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644       .clampScalarOrElt(0, S32, MaxScalar)
645       .widenScalarToNextPow2(0, 32)
646       .clampMaxNumElements(0, S32, 16);
647 
648   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650   // If the amount is divergent, we have to do a wave reduction to get the
651   // maximum value, so this is expanded during RegBankSelect.
652   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653     .legalFor({{PrivatePtr, S32}});
654 
655   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656     .customIf(typeIsNot(0, PrivatePtr));
657 
658   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660   auto &FPOpActions = getActionDefinitionsBuilder(
661     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662     .legalFor({S32, S64});
663   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664     .customFor({S32, S64});
665   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666     .customFor({S32, S64});
667 
668   if (ST.has16BitInsts()) {
669     if (ST.hasVOP3PInsts())
670       FPOpActions.legalFor({S16, V2S16});
671     else
672       FPOpActions.legalFor({S16});
673 
674     TrigActions.customFor({S16});
675     FDIVActions.customFor({S16});
676   }
677 
678   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681   if (ST.hasVOP3PInsts()) {
682     MinNumMaxNum.customFor(FPTypesPK16)
683       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684       .clampMaxNumElements(0, S16, 2)
685       .clampScalar(0, S16, S64)
686       .scalarize(0);
687   } else if (ST.has16BitInsts()) {
688     MinNumMaxNum.customFor(FPTypes16)
689       .clampScalar(0, S16, S64)
690       .scalarize(0);
691   } else {
692     MinNumMaxNum.customFor(FPTypesBase)
693       .clampScalar(0, S32, S64)
694       .scalarize(0);
695   }
696 
697   if (ST.hasVOP3PInsts())
698     FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700   FPOpActions
701     .scalarize(0)
702     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704   TrigActions
705     .scalarize(0)
706     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708   FDIVActions
709     .scalarize(0)
710     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713     .legalFor(FPTypesPK16)
714     .clampMaxNumElements(0, S16, 2)
715     .scalarize(0)
716     .clampScalar(0, S16, S64);
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720       .legalFor({S32, S64, S16})
721       .scalarize(0)
722       .clampScalar(0, S16, S64);
723   } else {
724     getActionDefinitionsBuilder(G_FSQRT)
725       .legalFor({S32, S64})
726       .scalarize(0)
727       .clampScalar(0, S32, S64);
728 
729     if (ST.hasFractBug()) {
730       getActionDefinitionsBuilder(G_FFLOOR)
731         .customFor({S64})
732         .legalFor({S32, S64})
733         .scalarize(0)
734         .clampScalar(0, S32, S64);
735     } else {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     }
741   }
742 
743   getActionDefinitionsBuilder(G_FPTRUNC)
744     .legalFor({{S32, S64}, {S16, S32}})
745     .scalarize(0)
746     .lower();
747 
748   getActionDefinitionsBuilder(G_FPEXT)
749     .legalFor({{S64, S32}, {S32, S16}})
750     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751     .scalarize(0);
752 
753   getActionDefinitionsBuilder(G_FSUB)
754       // Use actual fsub instruction
755       .legalFor({S32})
756       // Must use fadd + fneg
757       .lowerFor({S64, S16, V2S16})
758       .scalarize(0)
759       .clampScalar(0, S32, S64);
760 
761   // Whether this is legal depends on the floating point mode for the function.
762   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764     FMad.customFor({S32, S16});
765   else if (ST.hasMadMacF32Insts())
766     FMad.customFor({S32});
767   else if (ST.hasMadF16())
768     FMad.customFor({S16});
769   FMad.scalarize(0)
770       .lower();
771 
772   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773   if (ST.has16BitInsts()) {
774     FRem.customFor({S16, S32, S64});
775   } else {
776     FRem.minScalar(0, S32)
777         .customFor({S32, S64});
778   }
779   FRem.scalarize(0);
780 
781   // TODO: Do we need to clamp maximum bitwidth?
782   getActionDefinitionsBuilder(G_TRUNC)
783     .legalIf(isScalar(0))
784     .legalFor({{V2S16, V2S32}})
785     .clampMaxNumElements(0, S16, 2)
786     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787     // situations (like an invalid implicit use), we don't want to infinite loop
788     // in the legalizer.
789     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790     .alwaysLegal();
791 
792   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794                {S32, S1}, {S64, S1}, {S16, S1}})
795     .scalarize(0)
796     .clampScalar(0, S32, S64)
797     .widenScalarToNextPow2(1, 32);
798 
799   // TODO: Split s1->s64 during regbankselect for VALU.
800   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802     .lowerFor({{S32, S64}})
803     .lowerIf(typeIs(1, S1))
804     .customFor({{S64, S64}});
805   if (ST.has16BitInsts())
806     IToFP.legalFor({{S16, S16}});
807   IToFP.clampScalar(1, S32, S64)
808        .minScalar(0, S32)
809        .scalarize(0)
810        .widenScalarToNextPow2(1);
811 
812   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814     .customFor({{S64, S64}})
815     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816   if (ST.has16BitInsts())
817     FPToI.legalFor({{S16, S16}});
818   else
819     FPToI.minScalar(1, S32);
820 
821   FPToI.minScalar(0, S32)
822        .scalarize(0)
823        .lower();
824 
825   // Lower roundeven into G_FRINT
826   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
827     .scalarize(0)
828     .lower();
829 
830   if (ST.has16BitInsts()) {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S16, S32, S64})
833       .clampScalar(0, S16, S64)
834       .scalarize(0);
835   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
836     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
837       .legalFor({S32, S64})
838       .clampScalar(0, S32, S64)
839       .scalarize(0);
840   } else {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S32})
843       .customFor({S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   }
847 
848   getActionDefinitionsBuilder(G_PTR_ADD)
849     .legalIf(all(isPointer(0), sameSize(0, 1)))
850     .scalarize(0)
851     .scalarSameSizeAs(1, 0);
852 
853   getActionDefinitionsBuilder(G_PTRMASK)
854     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
855     .scalarSameSizeAs(1, 0)
856     .scalarize(0);
857 
858   auto &CmpBuilder =
859     getActionDefinitionsBuilder(G_ICMP)
860     // The compare output type differs based on the register bank of the output,
861     // so make both s1 and s32 legal.
862     //
863     // Scalar compares producing output in scc will be promoted to s32, as that
864     // is the allocatable register type that will be needed for the copy from
865     // scc. This will be promoted during RegBankSelect, and we assume something
866     // before that won't try to use s32 result types.
867     //
868     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
869     // bank.
870     .legalForCartesianProduct(
871       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
872     .legalForCartesianProduct(
873       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
874   if (ST.has16BitInsts()) {
875     CmpBuilder.legalFor({{S1, S16}});
876   }
877 
878   CmpBuilder
879     .widenScalarToNextPow2(1)
880     .clampScalar(1, S32, S64)
881     .scalarize(0)
882     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
883 
884   getActionDefinitionsBuilder(G_FCMP)
885     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
886     .widenScalarToNextPow2(1)
887     .clampScalar(1, S32, S64)
888     .scalarize(0);
889 
890   // FIXME: fpow has a selection pattern that should move to custom lowering.
891   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
892   if (ST.has16BitInsts())
893     Exp2Ops.legalFor({S32, S16});
894   else
895     Exp2Ops.legalFor({S32});
896   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
897   Exp2Ops.scalarize(0);
898 
899   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
900   if (ST.has16BitInsts())
901     ExpOps.customFor({{S32}, {S16}});
902   else
903     ExpOps.customFor({S32});
904   ExpOps.clampScalar(0, MinScalarFPTy, S32)
905         .scalarize(0);
906 
907   getActionDefinitionsBuilder(G_FPOWI)
908     .clampScalar(0, MinScalarFPTy, S32)
909     .lower();
910 
911   // The 64-bit versions produce 32-bit results, but only on the SALU.
912   getActionDefinitionsBuilder(G_CTPOP)
913     .legalFor({{S32, S32}, {S32, S64}})
914     .clampScalar(0, S32, S32)
915     .clampScalar(1, S32, S64)
916     .scalarize(0)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32);
919 
920   // The hardware instructions return a different result on 0 than the generic
921   // instructions expect. The hardware produces -1, but these produce the
922   // bitwidth.
923   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
924     .scalarize(0)
925     .clampScalar(0, S32, S32)
926     .clampScalar(1, S32, S64)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32)
929     .lower();
930 
931   // The 64-bit versions produce 32-bit results, but only on the SALU.
932   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
933     .legalFor({{S32, S32}, {S32, S64}})
934     .clampScalar(0, S32, S32)
935     .clampScalar(1, S32, S64)
936     .scalarize(0)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32);
939 
940   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
941   // RegBankSelect.
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32, S64})
944     .clampScalar(0, S32, S64)
945     .scalarize(0)
946     .widenScalarToNextPow2(0);
947 
948   if (ST.has16BitInsts()) {
949     getActionDefinitionsBuilder(G_BSWAP)
950       .legalFor({S16, S32, V2S16})
951       .clampMaxNumElements(0, S16, 2)
952       // FIXME: Fixing non-power-of-2 before clamp is workaround for
953       // narrowScalar limitation.
954       .widenScalarToNextPow2(0)
955       .clampScalar(0, S16, S32)
956       .scalarize(0);
957 
958     if (ST.hasVOP3PInsts()) {
959       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
960         .legalFor({S32, S16, V2S16})
961         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
962         .clampMaxNumElements(0, S16, 2)
963         .minScalar(0, S16)
964         .widenScalarToNextPow2(0)
965         .scalarize(0)
966         .lower();
967     } else {
968       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969         .legalFor({S32, S16})
970         .widenScalarToNextPow2(0)
971         .minScalar(0, S16)
972         .scalarize(0)
973         .lower();
974     }
975   } else {
976     // TODO: Should have same legality without v_perm_b32
977     getActionDefinitionsBuilder(G_BSWAP)
978       .legalFor({S32})
979       .lowerIf(scalarNarrowerThan(0, 32))
980       // FIXME: Fixing non-power-of-2 before clamp is workaround for
981       // narrowScalar limitation.
982       .widenScalarToNextPow2(0)
983       .maxScalar(0, S32)
984       .scalarize(0)
985       .lower();
986 
987     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
988       .legalFor({S32})
989       .minScalar(0, S32)
990       .widenScalarToNextPow2(0)
991       .scalarize(0)
992       .lower();
993   }
994 
995   getActionDefinitionsBuilder(G_INTTOPTR)
996     // List the common cases
997     .legalForCartesianProduct(AddrSpaces64, {S64})
998     .legalForCartesianProduct(AddrSpaces32, {S32})
999     .scalarize(0)
1000     // Accept any address space as long as the size matches
1001     .legalIf(sameSize(0, 1))
1002     .widenScalarIf(smallerThan(1, 0),
1003       [](const LegalityQuery &Query) {
1004         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1005       })
1006     .narrowScalarIf(largerThan(1, 0),
1007       [](const LegalityQuery &Query) {
1008         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1009       });
1010 
1011   getActionDefinitionsBuilder(G_PTRTOINT)
1012     // List the common cases
1013     .legalForCartesianProduct(AddrSpaces64, {S64})
1014     .legalForCartesianProduct(AddrSpaces32, {S32})
1015     .scalarize(0)
1016     // Accept any address space as long as the size matches
1017     .legalIf(sameSize(0, 1))
1018     .widenScalarIf(smallerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       })
1022     .narrowScalarIf(
1023       largerThan(0, 1),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1029     .scalarize(0)
1030     .custom();
1031 
1032   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1033                                     bool IsLoad) -> bool {
1034     const LLT DstTy = Query.Types[0];
1035 
1036     // Split vector extloads.
1037     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1038     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1039 
1040     if (MemSize < DstTy.getSizeInBits())
1041       MemSize = std::max(MemSize, AlignBits);
1042 
1043     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1044       return true;
1045 
1046     const LLT PtrTy = Query.Types[1];
1047     unsigned AS = PtrTy.getAddressSpace();
1048     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1049       return true;
1050 
1051     // Catch weird sized loads that don't evenly divide into the access sizes
1052     // TODO: May be able to widen depending on alignment etc.
1053     unsigned NumRegs = (MemSize + 31) / 32;
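    // For example, a 160-bit access needs 5 dwords, which is neither the
    // special dwordx3 case nor a power-of-2 register count, so it has to be
    // split.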
1054     if (NumRegs == 3) {
1055       if (!ST.hasDwordx3LoadStores())
1056         return true;
1057     } else {
1058       // If the alignment allows, these should have been widened.
1059       if (!isPowerOf2_32(NumRegs))
1060         return true;
1061     }
1062 
1063     if (AlignBits < MemSize) {
1064       const SITargetLowering *TLI = ST.getTargetLowering();
1065       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1066                                                       Align(AlignBits / 8));
1067     }
1068 
1069     return false;
1070   };
1071 
1072   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1073   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1074   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
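  // These feed the minimum-alignment field of the mem-desc rules below; a
  // value of 0 means any alignment is accepted when the subtarget supports
  // unaligned buffer access.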
1075 
1076   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1077   // LDS
1078   // TODO: Unsupported flat for SI.
1079 
1080   for (unsigned Op : {G_LOAD, G_STORE}) {
1081     const bool IsStore = Op == G_STORE;
1082 
1083     auto &Actions = getActionDefinitionsBuilder(Op);
1084     // Explicitly list some common cases.
1085     // TODO: Does this help compile time at all?
1086     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1087                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1088                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1089                                       {S64, GlobalPtr, 64, GlobalAlign32},
1090                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1091                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1092                                       {S32, GlobalPtr, 8, GlobalAlign8},
1093                                       {S32, GlobalPtr, 16, GlobalAlign16},
1094 
1095                                       {S32, LocalPtr, 32, 32},
1096                                       {S64, LocalPtr, 64, 32},
1097                                       {V2S32, LocalPtr, 64, 32},
1098                                       {S32, LocalPtr, 8, 8},
1099                                       {S32, LocalPtr, 16, 16},
1100                                       {V2S16, LocalPtr, 32, 32},
1101 
1102                                       {S32, PrivatePtr, 32, 32},
1103                                       {S32, PrivatePtr, 8, 8},
1104                                       {S32, PrivatePtr, 16, 16},
1105                                       {V2S16, PrivatePtr, 32, 32},
1106 
1107                                       {S32, ConstantPtr, 32, GlobalAlign32},
1108                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1109                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1110                                       {S64, ConstantPtr, 64, GlobalAlign32},
1111                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1112     Actions.legalIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return isLoadStoreLegal(ST, Query, Op);
1115       });
1116 
1117     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1118     // 64-bits.
1119     //
1120     // TODO: Should generalize bitcast action into coerce, which will also cover
1121     // inserting addrspacecasts.
1122     Actions.customIf(typeIs(1, Constant32Ptr));
1123 
1124     // Turn any illegal element vectors into something easier to deal
1125     // with. These will ultimately produce 32-bit scalar shifts to extract the
1126     // parts anyway.
1127     //
1128     // For odd 16-bit element vectors, prefer to split those into pieces with
1129     // 16-bit vector parts.
1130     Actions.bitcastIf(
1131       [=](const LegalityQuery &Query) -> bool {
1132         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1133                                           Query.MMODescrs[0].SizeInBits);
1134       }, bitcastToRegisterType(0));
1135 
1136     if (!IsStore) {
1137       // Widen suitably aligned loads by loading extra bytes. The standard
1138       // legalization actions can't properly express widening memory operands.
1139       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1140         return shouldWidenLoad(ST, Query, G_LOAD);
1141       });
1142     }
1143 
1144     // FIXME: load/store narrowing should be moved to lower action
1145     Actions
1146         .narrowScalarIf(
1147             [=](const LegalityQuery &Query) -> bool {
1148               return !Query.Types[0].isVector() &&
1149                      needToSplitMemOp(Query, Op == G_LOAD);
1150             },
1151             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1152               const LLT DstTy = Query.Types[0];
1153               const LLT PtrTy = Query.Types[1];
1154 
1155               const unsigned DstSize = DstTy.getSizeInBits();
1156               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1157 
1158               // Split extloads.
1159               if (DstSize > MemSize)
1160                 return std::make_pair(0, LLT::scalar(MemSize));
1161 
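              // e.g. an s96 store on subtargets without dwordx3 load/stores
              // is narrowed to s64 here; the remaining piece is legalized
              // further.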
1162               if (!isPowerOf2_32(DstSize)) {
1163                 // We're probably decomposing an odd sized store. Try to split
1164                 // to the widest type. TODO: Account for alignment. As-is it
1165                 // should be OK, since the new parts will be further legalized.
1166                 unsigned FloorSize = PowerOf2Floor(DstSize);
1167                 return std::make_pair(0, LLT::scalar(FloorSize));
1168               }
1169 
1170               if (DstSize > 32 && (DstSize % 32 != 0)) {
1171                 // FIXME: Need a way to specify non-extload of larger size if
1172                 // suitably aligned.
1173                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174               }
1175 
1176               unsigned MaxSize = maxSizeForAddrSpace(ST,
1177                                                      PtrTy.getAddressSpace(),
1178                                                      Op == G_LOAD);
1179               if (MemSize > MaxSize)
1180                 return std::make_pair(0, LLT::scalar(MaxSize));
1181 
1182               unsigned Align = Query.MMODescrs[0].AlignInBits;
1183               return std::make_pair(0, LLT::scalar(Align));
1184             })
1185         .fewerElementsIf(
1186             [=](const LegalityQuery &Query) -> bool {
1187               return Query.Types[0].isVector() &&
1188                      needToSplitMemOp(Query, Op == G_LOAD);
1189             },
1190             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1191               const LLT DstTy = Query.Types[0];
1192               const LLT PtrTy = Query.Types[1];
1193 
1194               LLT EltTy = DstTy.getElementType();
1195               unsigned MaxSize = maxSizeForAddrSpace(ST,
1196                                                      PtrTy.getAddressSpace(),
1197                                                      Op == G_LOAD);
1198 
1199               // FIXME: Handle widened to power of 2 results better. This ends
1200               // up scalarizing.
1201               // FIXME: 3 element stores scalarized on SI
1202 
1203               // Split if it's too large for the address space.
1204               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
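                // e.g. a 512-bit <16 x s32> global store is limited to 128
                // bits and is split into <4 x s32> pieces here.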
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1211                 }
1212 
1213                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1214 
1215                 // FIXME: Refine when odd breakdowns handled
1216                 // The scalars will need to be re-legalized.
1217                 if (NumPieces == 1 || NumPieces >= NumElts ||
1218                     NumElts % NumPieces != 0)
1219                   return std::make_pair(0, EltTy);
1220 
1221                 return std::make_pair(0,
1222                                       LLT::vector(NumElts / NumPieces, EltTy));
1223               }
1224 
1225               // FIXME: We could probably handle weird extending loads better.
1226               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1239               }
1240 
1241               // Need to split because of alignment.
1242               unsigned Align = Query.MMODescrs[0].AlignInBits;
1243               if (EltSize > Align &&
1244                   (EltSize / Align < DstTy.getNumElements())) {
1245                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1246               }
1247 
1248               // May need relegalization for the scalars.
1249               return std::make_pair(0, EltTy);
1250             })
1251     .lowerIfMemSizeNotPow2()
1252     .minScalar(0, S32);
1253 
1254     if (IsStore)
1255       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1256 
1257     Actions
1258         .widenScalarToNextPow2(0)
1259         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1260         .lower();
1261   }
1262 
1263   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1264                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1265                                                   {S32, GlobalPtr, 16, 2 * 8},
1266                                                   {S32, LocalPtr, 8, 8},
1267                                                   {S32, LocalPtr, 16, 16},
1268                                                   {S32, PrivatePtr, 8, 8},
1269                                                   {S32, PrivatePtr, 16, 16},
1270                                                   {S32, ConstantPtr, 8, 8},
1271                                                   {S32, ConstantPtr, 16, 2 * 8}});
1272   if (ST.hasFlatAddressSpace()) {
1273     ExtLoads.legalForTypesWithMemDesc(
1274         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1275   }
1276 
1277   ExtLoads.clampScalar(0, S32, S32)
1278           .widenScalarToNextPow2(0)
1279           .unsupportedIfMemSizeNotPow2()
1280           .lower();
1281 
1282   auto &Atomics = getActionDefinitionsBuilder(
1283     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1284      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1285      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1286      G_ATOMICRMW_UMIN})
1287     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1288                {S64, GlobalPtr}, {S64, LocalPtr},
1289                {S32, RegionPtr}, {S64, RegionPtr}});
1290   if (ST.hasFlatAddressSpace()) {
1291     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1292   }
1293 
1294   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1295   if (ST.hasLDSFPAtomics()) {
1296     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1297     if (ST.hasGFX90AInsts())
1298       Atomic.legalFor({{S64, LocalPtr}});
1299   }
1300   if (ST.hasAtomicFaddInsts())
1301     Atomic.legalFor({{S32, GlobalPtr}});
1302 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1305   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1306     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1307                 {S32, FlatPtr}, {S64, FlatPtr}})
1308     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1309                {S32, RegionPtr}, {S64, RegionPtr}});
1310   // TODO: Pointer types, any 32-bit or 64-bit vector
1311 
1312   // Condition should be s32 for scalar, s1 for vector.
1313   getActionDefinitionsBuilder(G_SELECT)
1314     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1315           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1316           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1317     .clampScalar(0, S16, S64)
1318     .scalarize(1)
1319     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1320     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1321     .clampMaxNumElements(0, S32, 2)
1322     .clampMaxNumElements(0, LocalPtr, 2)
1323     .clampMaxNumElements(0, PrivatePtr, 2)
1324     .scalarize(0)
1325     .widenScalarToNextPow2(0)
1326     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1327 
1328   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1329   // be more flexible with the shift amount type.
1330   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1331     .legalFor({{S32, S32}, {S64, S32}});
1332   if (ST.has16BitInsts()) {
1333     if (ST.hasVOP3PInsts()) {
1334       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1335             .clampMaxNumElements(0, S16, 2);
1336     } else
1337       Shifts.legalFor({{S16, S16}});
1338 
1339     // TODO: Support 16-bit shift amounts for all types
1340     Shifts.widenScalarIf(
1341       [=](const LegalityQuery &Query) {
1342         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1343         // 32-bit amount.
1344         const LLT ValTy = Query.Types[0];
1345         const LLT AmountTy = Query.Types[1];
1346         return ValTy.getSizeInBits() <= 16 &&
1347                AmountTy.getSizeInBits() < 16;
1348       }, changeTo(1, S16));
1349     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1350     Shifts.clampScalar(1, S32, S32);
1351     Shifts.clampScalar(0, S16, S64);
1352     Shifts.widenScalarToNextPow2(0, 16);
1353 
1354     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1355       .minScalar(0, S16)
1356       .scalarize(0)
1357       .lower();
1358   } else {
1359     // Make sure we legalize the shift amount type first, as the general
1360     // expansion for the shifted type will produce much worse code if it hasn't
1361     // been truncated already.
1362     Shifts.clampScalar(1, S32, S32);
1363     Shifts.clampScalar(0, S32, S64);
1364     Shifts.widenScalarToNextPow2(0, 32);
1365 
1366     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1367       .minScalar(0, S32)
1368       .scalarize(0)
1369       .lower();
1370   }
1371   Shifts.scalarize(0);
1372 
1373   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1374     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1375     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1376     unsigned IdxTypeIdx = 2;
1377 
1378     getActionDefinitionsBuilder(Op)
1379       .customIf([=](const LegalityQuery &Query) {
1380           const LLT EltTy = Query.Types[EltTypeIdx];
1381           const LLT VecTy = Query.Types[VecTypeIdx];
1382           const LLT IdxTy = Query.Types[IdxTypeIdx];
1383           const unsigned EltSize = EltTy.getSizeInBits();
1384           return (EltSize == 32 || EltSize == 64) &&
1385                   VecTy.getSizeInBits() % 32 == 0 &&
1386                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1387                   IdxTy.getSizeInBits() == 32;
1388         })
1389       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1390                  bitcastToVectorElement32(VecTypeIdx))
1391       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1392       .bitcastIf(
1393         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1394         [=](const LegalityQuery &Query) {
1395           // For > 64-bit element types, try to turn this into a 64-bit
1396           // element vector since we may be able to do better indexing
1397           // if this is scalar. If not, fall back to 32.
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const unsigned DstEltSize = EltTy.getSizeInBits();
1401           const unsigned VecSize = VecTy.getSizeInBits();
1402 
1403           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1404           return std::make_pair(
1405             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1406         })
1407       .clampScalar(EltTypeIdx, S32, S64)
1408       .clampScalar(VecTypeIdx, S32, S64)
1409       .clampScalar(IdxTypeIdx, S32, S32)
1410       .clampMaxNumElements(VecTypeIdx, S32, 32)
1411       // TODO: Clamp elements for 64-bit vectors?
1412       // It should only be necessary with variable indexes.
1413       // As a last resort, lower to the stack
1414       .lower();
1415   }
1416 
1417   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1418     .unsupportedIf([=](const LegalityQuery &Query) {
1419         const LLT &EltTy = Query.Types[1].getElementType();
1420         return Query.Types[0] != EltTy;
1421       });
1422 
1423   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1424     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1425     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1426 
1427     // FIXME: Doesn't handle extract of illegal sizes.
1428     getActionDefinitionsBuilder(Op)
1429       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1430       // FIXME: Multiples of 16 should not be legal.
1431       .legalIf([=](const LegalityQuery &Query) {
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           const LLT LitTy = Query.Types[LitTyIdx];
1434           return (BigTy.getSizeInBits() % 32 == 0) &&
1435                  (LitTy.getSizeInBits() % 16 == 0);
1436         })
1437       .widenScalarIf(
1438         [=](const LegalityQuery &Query) {
1439           const LLT BigTy = Query.Types[BigTyIdx];
1440           return (BigTy.getScalarSizeInBits() < 16);
1441         },
1442         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1443       .widenScalarIf(
1444         [=](const LegalityQuery &Query) {
1445           const LLT LitTy = Query.Types[LitTyIdx];
1446           return (LitTy.getScalarSizeInBits() < 16);
1447         },
1448         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1449       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1450       .widenScalarToNextPow2(BigTyIdx, 32);
1451 
1452   }
1453 
1454   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1455     .legalForCartesianProduct(AllS32Vectors, {S32})
1456     .legalForCartesianProduct(AllS64Vectors, {S64})
1457     .clampNumElements(0, V16S32, V32S32)
1458     .clampNumElements(0, V2S64, V16S64)
1459     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1460 
1461   if (ST.hasScalarPackInsts()) {
1462     BuildVector
1463       // FIXME: Should probably widen s1 vectors straight to s32
1464       .minScalarOrElt(0, S16)
1465       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1466       .minScalar(1, S32);
1467 
1468     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1469       .legalFor({V2S16, S32})
1470       .lower();
1471     BuildVector.minScalarOrElt(0, S32);
1472   } else {
1473     BuildVector.customFor({V2S16, S16});
1474     BuildVector.minScalarOrElt(0, S32);
1475 
1476     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1477       .customFor({V2S16, S32})
1478       .lower();
1479   }
1480 
1481   BuildVector.legalIf(isRegisterType(0));
1482 
1483   // FIXME: Clamp maximum size
1484   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1485     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1486     .clampMaxNumElements(0, S32, 32)
1487     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1488     .clampMaxNumElements(0, S16, 64);
1489 
1490   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1491   // pre-legalize.
1492   if (ST.hasVOP3PInsts()) {
1493     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1494       .customFor({V2S16, V2S16})
1495       .lower();
1496   } else
1497     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1498 
1499   // Merge/Unmerge
1500   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1501     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1502     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1503 
1504     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1505       const LLT Ty = Query.Types[TypeIdx];
1506       if (Ty.isVector()) {
1507         const LLT &EltTy = Ty.getElementType();
1508         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1509           return true;
1510         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1511           return true;
1512       }
1513       return false;
1514     };
1515 
1516     auto &Builder = getActionDefinitionsBuilder(Op)
1517       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1518       .lowerFor({{S16, V2S16}})
1519       .lowerIf([=](const LegalityQuery &Query) {
1520           const LLT BigTy = Query.Types[BigTyIdx];
1521           return BigTy.getSizeInBits() == 32;
1522         })
1523       // Try to widen to s16 first for small types.
1524       // TODO: Only do this on targets with legal s16 shifts
1525       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1526       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1527       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1528       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1529                            elementTypeIs(1, S16)),
1530                        changeTo(1, V2S16))
1531       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1532       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1533       // valid.
1534       .clampScalar(LitTyIdx, S32, S512)
1535       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1536       // Break up vectors with weird elements into scalars
1537       .fewerElementsIf(
1538         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1539         scalarize(0))
1540       .fewerElementsIf(
1541         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1542         scalarize(1))
1543       .clampScalar(BigTyIdx, S32, MaxScalar);
1544 
1545     if (Op == G_MERGE_VALUES) {
1546       Builder.widenScalarIf(
1547         // TODO: Use 16-bit shifts if legal for 8-bit values?
1548         [=](const LegalityQuery &Query) {
1549           const LLT Ty = Query.Types[LitTyIdx];
1550           return Ty.getSizeInBits() < 32;
1551         },
1552         changeTo(LitTyIdx, S32));
1553     }
1554 
1555     Builder.widenScalarIf(
1556       [=](const LegalityQuery &Query) {
1557         const LLT Ty = Query.Types[BigTyIdx];
1558         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1559           Ty.getSizeInBits() % 16 != 0;
1560       },
1561       [=](const LegalityQuery &Query) {
1562         // Pick the next power of 2, or a multiple of 64 over 128,
1563         // whichever is smaller.
1564         const LLT &Ty = Query.Types[BigTyIdx];
1565         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1566         if (NewSizeInBits >= 256) {
1567           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1568           if (RoundedTo < NewSizeInBits)
1569             NewSizeInBits = RoundedTo;
1570         }
1571         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1572       })
1573       // Any vectors left are the wrong size. Scalarize them.
1574       .scalarize(0)
1575       .scalarize(1);
1576   }
1577 
1578   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1579   // RegBankSelect.
1580   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1581     .legalFor({{S32}, {S64}});
1582 
1583   if (ST.hasVOP3PInsts()) {
1584     SextInReg.lowerFor({{V2S16}})
1585       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1586       // get more vector shift opportunities, since we'll get those when
1587       // expanded.
1588       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1589   } else if (ST.has16BitInsts()) {
1590     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1591   } else {
1592     // Prefer to promote to s32 before lowering if we don't have 16-bit
1593     // shifts. This avoids a lot of intermediate truncate and extend operations.
1594     SextInReg.lowerFor({{S32}, {S64}});
1595   }
1596 
1597   SextInReg
1598     .scalarize(0)
1599     .clampScalar(0, S32, S64)
1600     .lower();
1601 
1602   // TODO: Only try to form v2s16 with legal packed instructions.
1603   getActionDefinitionsBuilder(G_FSHR)
1604     .legalFor({{S32, S32}})
1605     .lowerFor({{V2S16, V2S16}})
1606     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1607     .scalarize(0)
1608     .lower();
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     getActionDefinitionsBuilder(G_FSHL)
1612       .lowerFor({{V2S16, V2S16}})
1613       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1614       .scalarize(0)
1615       .lower();
1616   } else {
1617     getActionDefinitionsBuilder(G_FSHL)
1618       .scalarize(0)
1619       .lower();
1620   }
1621 
1622   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1623     .legalFor({S64});
1624 
1625   getActionDefinitionsBuilder(G_FENCE)
1626     .alwaysLegal();
1627 
1628   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1629       .scalarize(0)
1630       .minScalar(0, S32)
1631       .lower();
1632 
1633   getActionDefinitionsBuilder({
1634       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1635       G_FCOPYSIGN,
1636 
1637       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1638       G_ATOMICRMW_NAND,
1639       G_ATOMICRMW_FSUB,
1640       G_READ_REGISTER,
1641       G_WRITE_REGISTER,
1642 
1643       G_SADDO, G_SSUBO,
1644 
1645        // TODO: Implement
1646       G_FMINIMUM, G_FMAXIMUM}).lower();
1647 
1648   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1649         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1650         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1651     .unsupported();
1652 
1653   computeTables();
1654   verify(*ST.getInstrInfo());
1655 }
1656 
1657 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1658                                          MachineInstr &MI) const {
1659   MachineIRBuilder &B = Helper.MIRBuilder;
1660   MachineRegisterInfo &MRI = *B.getMRI();
1661 
1662   switch (MI.getOpcode()) {
1663   case TargetOpcode::G_ADDRSPACE_CAST:
1664     return legalizeAddrSpaceCast(MI, MRI, B);
1665   case TargetOpcode::G_FRINT:
1666     return legalizeFrint(MI, MRI, B);
1667   case TargetOpcode::G_FCEIL:
1668     return legalizeFceil(MI, MRI, B);
1669   case TargetOpcode::G_FREM:
1670     return legalizeFrem(MI, MRI, B);
1671   case TargetOpcode::G_INTRINSIC_TRUNC:
1672     return legalizeIntrinsicTrunc(MI, MRI, B);
1673   case TargetOpcode::G_SITOFP:
1674     return legalizeITOFP(MI, MRI, B, true);
1675   case TargetOpcode::G_UITOFP:
1676     return legalizeITOFP(MI, MRI, B, false);
1677   case TargetOpcode::G_FPTOSI:
1678     return legalizeFPTOI(MI, MRI, B, true);
1679   case TargetOpcode::G_FPTOUI:
1680     return legalizeFPTOI(MI, MRI, B, false);
1681   case TargetOpcode::G_FMINNUM:
1682   case TargetOpcode::G_FMAXNUM:
1683   case TargetOpcode::G_FMINNUM_IEEE:
1684   case TargetOpcode::G_FMAXNUM_IEEE:
1685     return legalizeMinNumMaxNum(Helper, MI);
1686   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1687     return legalizeExtractVectorElt(MI, MRI, B);
1688   case TargetOpcode::G_INSERT_VECTOR_ELT:
1689     return legalizeInsertVectorElt(MI, MRI, B);
1690   case TargetOpcode::G_SHUFFLE_VECTOR:
1691     return legalizeShuffleVector(MI, MRI, B);
1692   case TargetOpcode::G_FSIN:
1693   case TargetOpcode::G_FCOS:
1694     return legalizeSinCos(MI, MRI, B);
1695   case TargetOpcode::G_GLOBAL_VALUE:
1696     return legalizeGlobalValue(MI, MRI, B);
1697   case TargetOpcode::G_LOAD:
1698     return legalizeLoad(Helper, MI);
1699   case TargetOpcode::G_FMAD:
1700     return legalizeFMad(MI, MRI, B);
1701   case TargetOpcode::G_FDIV:
1702     return legalizeFDIV(MI, MRI, B);
1703   case TargetOpcode::G_UDIV:
1704   case TargetOpcode::G_UREM:
1705     return legalizeUDIV_UREM(MI, MRI, B);
1706   case TargetOpcode::G_SDIV:
1707   case TargetOpcode::G_SREM:
1708     return legalizeSDIV_SREM(MI, MRI, B);
1709   case TargetOpcode::G_ATOMIC_CMPXCHG:
1710     return legalizeAtomicCmpXChg(MI, MRI, B);
1711   case TargetOpcode::G_FLOG:
1712     return legalizeFlog(MI, B, numbers::ln2f);
1713   case TargetOpcode::G_FLOG10:
1714     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1715   case TargetOpcode::G_FEXP:
1716     return legalizeFExp(MI, B);
1717   case TargetOpcode::G_FPOW:
1718     return legalizeFPow(MI, B);
1719   case TargetOpcode::G_FFLOOR:
1720     return legalizeFFloor(MI, MRI, B);
1721   case TargetOpcode::G_BUILD_VECTOR:
1722     return legalizeBuildVector(MI, MRI, B);
1723   default:
1724     return false;
1725   }
1726 
1727   llvm_unreachable("expected switch to return");
1728 }
1729 
1730 Register AMDGPULegalizerInfo::getSegmentAperture(
1731   unsigned AS,
1732   MachineRegisterInfo &MRI,
1733   MachineIRBuilder &B) const {
1734   MachineFunction &MF = B.getMF();
1735   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1736   const LLT S32 = LLT::scalar(32);
1737 
1738   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1739 
1740   if (ST.hasApertureRegs()) {
1741     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1742     // getreg.
1743     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1744         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1745         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1746     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1747         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1748         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1749     unsigned Encoding =
1750         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1751         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1752         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1753 
1754     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1755 
1756     B.buildInstr(AMDGPU::S_GETREG_B32)
1757       .addDef(GetReg)
1758       .addImm(Encoding);
1759     MRI.setType(GetReg, S32);
1760 
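    // The hardware register appears to hold the aperture base shifted down by
    // WidthM1 + 1 (i.e. 16) bits, so shift it back up to reconstruct the high
    // 32 bits of the 64-bit segment base address.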
1761     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1762     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1763   }
1764 
1765   Register QueuePtr = MRI.createGenericVirtualRegister(
1766     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1767 
1768   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1769     return Register();
1770 
1771   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1772   // private_segment_aperture_base_hi.
1773   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1774 
1775   // TODO: can we be smarter about machine pointer info?
1776   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1777   MachineMemOperand *MMO = MF.getMachineMemOperand(
1778       PtrInfo,
1779       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1780           MachineMemOperand::MOInvariant,
1781       4, commonAlignment(Align(64), StructOffset));
1782 
1783   Register LoadAddr;
1784 
1785   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1786   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1787 }
1788 
1789 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1790   MachineInstr &MI, MachineRegisterInfo &MRI,
1791   MachineIRBuilder &B) const {
1792   MachineFunction &MF = B.getMF();
1793 
1794   const LLT S32 = LLT::scalar(32);
1795   Register Dst = MI.getOperand(0).getReg();
1796   Register Src = MI.getOperand(1).getReg();
1797 
1798   LLT DstTy = MRI.getType(Dst);
1799   LLT SrcTy = MRI.getType(Src);
1800   unsigned DestAS = DstTy.getAddressSpace();
1801   unsigned SrcAS = SrcTy.getAddressSpace();
1802 
1803   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1804   // vector element.
1805   assert(!DstTy.isVector());
1806 
1807   const AMDGPUTargetMachine &TM
1808     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1809 
1810   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1811     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1812     return true;
1813   }
1814 
1815   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1816     // Truncate.
1817     B.buildExtract(Dst, Src, 0);
1818     MI.eraseFromParent();
1819     return true;
1820   }
1821 
1822   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1823     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1824     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1825 
1826     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1827     // another pointer. Merge operands are required to be the same type, but
1828     // creating an extra ptrtoint would be kind of pointless.
1829     auto HighAddr = B.buildConstant(
1830       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1831     B.buildMerge(Dst, {Src, HighAddr});
1832     MI.eraseFromParent();
1833     return true;
1834   }
1835 
1836   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1837     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1838            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1839     unsigned NullVal = TM.getNullPointerValue(DestAS);
1840 
1841     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1842     auto FlatNull = B.buildConstant(SrcTy, 0);
1843 
1844     // Extract low 32-bits of the pointer.
1845     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1846 
1847     auto CmpRes =
1848         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1849     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1850 
1851     MI.eraseFromParent();
1852     return true;
1853   }
1854 
1855   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1856     return false;
1857 
1858   if (!ST.hasFlatAddressSpace())
1859     return false;
1860 
1861   auto SegmentNull =
1862       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1863   auto FlatNull =
1864       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1865 
1866   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1867   if (!ApertureReg.isValid())
1868     return false;
1869 
1870   auto CmpRes =
1871       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1872 
1873   // Coerce the type of the low half of the result so we can use merge_values.
1874   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1875 
1876   // TODO: Should we allow mismatched types but matching sizes in merges to
1877   // avoid the ptrtoint?
1878   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1879   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1880 
1881   MI.eraseFromParent();
1882   return true;
1883 }
1884 
1885 bool AMDGPULegalizerInfo::legalizeFrint(
1886   MachineInstr &MI, MachineRegisterInfo &MRI,
1887   MachineIRBuilder &B) const {
1888   Register Src = MI.getOperand(1).getReg();
1889   LLT Ty = MRI.getType(Src);
1890   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1891 
1892   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1893   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1894 
1895   auto C1 = B.buildFConstant(Ty, C1Val);
1896   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1897 
1898   // TODO: Should this propagate fast-math-flags?
1899   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1900   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1901 
1902   auto C2 = B.buildFConstant(Ty, C2Val);
1903   auto Fabs = B.buildFAbs(Ty, Src);
1904 
1905   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1906   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFceil(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914 
1915   const LLT S1 = LLT::scalar(1);
1916   const LLT S64 = LLT::scalar(64);
1917 
1918   Register Src = MI.getOperand(1).getReg();
1919   assert(MRI.getType(Src) == S64);
1920 
1921   // result = trunc(src)
1922   // if (src > 0.0 && src != result)
1923   //   result += 1.0
1924 
1925   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1926 
1927   const auto Zero = B.buildFConstant(S64, 0.0);
1928   const auto One = B.buildFConstant(S64, 1.0);
1929   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1930   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1931   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1932   auto Add = B.buildSelect(S64, And, One, Zero);
1933 
1934   // TODO: Should this propagate fast-math-flags?
1935   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1936   return true;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeFrem(
1940   MachineInstr &MI, MachineRegisterInfo &MRI,
1941   MachineIRBuilder &B) const {
1942     Register DstReg = MI.getOperand(0).getReg();
1943     Register Src0Reg = MI.getOperand(1).getReg();
1944     Register Src1Reg = MI.getOperand(2).getReg();
1945     auto Flags = MI.getFlags();
1946     LLT Ty = MRI.getType(DstReg);
1947 
1948     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1949     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1950     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1951     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1952     MI.eraseFromParent();
1953     return true;
1954 }
1955 
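// Return the unbiased exponent of the f64 whose high 32 bits are in \p Hi. The
// 11-bit exponent field starts at bit FractBits - 32 == 20 within the high
// word, and the IEEE-754 exponent bias of 1023 is subtracted from it.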
1956 static MachineInstrBuilder extractF64Exponent(Register Hi,
1957                                               MachineIRBuilder &B) {
1958   const unsigned FractBits = 52;
1959   const unsigned ExpBits = 11;
1960   LLT S32 = LLT::scalar(32);
1961 
1962   auto Const0 = B.buildConstant(S32, FractBits - 32);
1963   auto Const1 = B.buildConstant(S32, ExpBits);
1964 
1965   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1966     .addUse(Hi)
1967     .addUse(Const0.getReg(0))
1968     .addUse(Const1.getReg(0));
1969 
1970   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1971 }
1972 
1973 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1974   MachineInstr &MI, MachineRegisterInfo &MRI,
1975   MachineIRBuilder &B) const {
1976   const LLT S1 = LLT::scalar(1);
1977   const LLT S32 = LLT::scalar(32);
1978   const LLT S64 = LLT::scalar(64);
1979 
1980   Register Src = MI.getOperand(1).getReg();
1981   assert(MRI.getType(Src) == S64);
1982 
1983   // TODO: Should this use extract since the low half is unused?
1984   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1985   Register Hi = Unmerge.getReg(1);
1986 
1987   // Extract the upper half, since this is where we will find the sign and
1988   // exponent.
1989   auto Exp = extractF64Exponent(Hi, B);
1990 
1991   const unsigned FractBits = 52;
1992 
1993   // Extract the sign bit.
1994   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1995   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1996 
1997   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1998 
1999   const auto Zero32 = B.buildConstant(S32, 0);
2000 
2001   // Extend back to 64-bits.
2002   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2003 
2004   auto Shr = B.buildAShr(S64, FractMask, Exp);
2005   auto Not = B.buildNot(S64, Shr);
2006   auto Tmp0 = B.buildAnd(S64, Src, Not);
2007   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
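  // (FractMask >> Exp) covers the 52 - Exp fractional bits of Src, so clearing
  // them in Tmp0 truncates toward zero. Exp < 0 means |Src| < 1 and the result
  // is a signed zero; Exp > 51 means Src is already an integer.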
2008 
2009   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2010   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2011 
2012   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2013   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2014   MI.eraseFromParent();
2015   return true;
2016 }
2017 
2018 bool AMDGPULegalizerInfo::legalizeITOFP(
2019   MachineInstr &MI, MachineRegisterInfo &MRI,
2020   MachineIRBuilder &B, bool Signed) const {
2021 
2022   Register Dst = MI.getOperand(0).getReg();
2023   Register Src = MI.getOperand(1).getReg();
2024 
2025   const LLT S64 = LLT::scalar(64);
2026   const LLT S32 = LLT::scalar(32);
2027 
2028   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2029 
2030   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2031 
2032   auto CvtHi = Signed ?
2033     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2034     B.buildUITOFP(S64, Unmerge.getReg(1));
2035 
2036   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
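  // The result is assembled as convert(Hi) * 2^32 + convert(Lo). The low half
  // is always converted as unsigned, and the scaling by 2^32 is done with
  // ldexp.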
2037 
2038   auto ThirtyTwo = B.buildConstant(S32, 32);
2039   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2040     .addUse(CvtHi.getReg(0))
2041     .addUse(ThirtyTwo.getReg(0));
2042 
2043   // TODO: Should this propagate fast-math-flags?
2044   B.buildFAdd(Dst, LdExp, CvtLo);
2045   MI.eraseFromParent();
2046   return true;
2047 }
2048 
2049 // TODO: Copied from DAG implementation. Verify logic and document how this
2050 // actually works.
2051 bool AMDGPULegalizerInfo::legalizeFPTOI(
2052   MachineInstr &MI, MachineRegisterInfo &MRI,
2053   MachineIRBuilder &B, bool Signed) const {
2054 
2055   Register Dst = MI.getOperand(0).getReg();
2056   Register Src = MI.getOperand(1).getReg();
2057 
2058   const LLT S64 = LLT::scalar(64);
2059   const LLT S32 = LLT::scalar(32);
2060 
2061   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2062 
2063   unsigned Flags = MI.getFlags();
2064 
2065   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2066   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2067   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
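  // K0 is 2^-32 and K1 is -(2^32) as f64 constants. FloorMul is the high half
  // of the result, floor(Trunc * 2^-32), and Fma recovers the low half as
  // Trunc - FloorMul * 2^32, which always converts as unsigned.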
2068 
2069   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2070   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2071   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2072 
2073   auto Hi = Signed ?
2074     B.buildFPTOSI(S32, FloorMul) :
2075     B.buildFPTOUI(S32, FloorMul);
2076   auto Lo = B.buildFPTOUI(S32, Fma);
2077 
2078   B.buildMerge(Dst, { Lo, Hi });
2079   MI.eraseFromParent();
2080 
2081   return true;
2082 }
2083 
2084 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2085                                                MachineInstr &MI) const {
2086   MachineFunction &MF = Helper.MIRBuilder.getMF();
2087   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2088 
2089   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2090                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2091 
2092   // With ieee_mode disabled, the instructions have the correct behavior
2093   // already for G_FMINNUM/G_FMAXNUM
2094   if (!MFI->getMode().IEEE)
2095     return !IsIEEEOp;
2096 
2097   if (IsIEEEOp)
2098     return true;
2099 
2100   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2104   MachineInstr &MI, MachineRegisterInfo &MRI,
2105   MachineIRBuilder &B) const {
2106   // TODO: Should move some of this into LegalizerHelper.
2107 
2108   // TODO: Promote dynamic indexing of s16 to s32
2109 
2110   // FIXME: Artifact combiner probably should have replaced the truncated
2111   // constant before this, so we shouldn't need
2112   // getConstantVRegValWithLookThrough.
2113   Optional<ValueAndVReg> MaybeIdxVal =
2114       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2115   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2116     return true;
2117   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2118 
2119   Register Dst = MI.getOperand(0).getReg();
2120   Register Vec = MI.getOperand(1).getReg();
2121 
2122   LLT VecTy = MRI.getType(Vec);
2123   LLT EltTy = VecTy.getElementType();
2124   assert(EltTy == MRI.getType(Dst));
2125 
2126   if (IdxVal < VecTy.getNumElements())
2127     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2128   else
2129     B.buildUndef(Dst);
2130 
2131   MI.eraseFromParent();
2132   return true;
2133 }
2134 
2135 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2136   MachineInstr &MI, MachineRegisterInfo &MRI,
2137   MachineIRBuilder &B) const {
2138   // TODO: Should move some of this into LegalizerHelper.
2139 
2140   // TODO: Promote dynamic indexing of s16 to s32
2141 
2142   // FIXME: Artifact combiner probably should have replaced the truncated
2143   // constant before this, so we shouldn't need
2144   // getConstantVRegValWithLookThrough.
2145   Optional<ValueAndVReg> MaybeIdxVal =
2146       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2147   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2148     return true;
2149 
2150   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2151   Register Dst = MI.getOperand(0).getReg();
2152   Register Vec = MI.getOperand(1).getReg();
2153   Register Ins = MI.getOperand(2).getReg();
2154 
2155   LLT VecTy = MRI.getType(Vec);
2156   LLT EltTy = VecTy.getElementType();
2157   assert(EltTy == MRI.getType(Ins));
2158 
2159   if (IdxVal < VecTy.getNumElements())
2160     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2161   else
2162     B.buildUndef(Dst);
2163 
2164   MI.eraseFromParent();
2165   return true;
2166 }
2167 
2168 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   const LLT V2S16 = LLT::vector(2, 16);
2172 
2173   Register Dst = MI.getOperand(0).getReg();
2174   Register Src0 = MI.getOperand(1).getReg();
2175   LLT DstTy = MRI.getType(Dst);
2176   LLT SrcTy = MRI.getType(Src0);
2177 
2178   if (SrcTy == V2S16 && DstTy == V2S16 &&
2179       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2180     return true;
2181 
2182   MachineIRBuilder HelperBuilder(MI);
2183   GISelObserverWrapper DummyObserver;
2184   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2185   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2186 }
2187 
2188 bool AMDGPULegalizerInfo::legalizeSinCos(
2189   MachineInstr &MI, MachineRegisterInfo &MRI,
2190   MachineIRBuilder &B) const {
2191 
2192   Register DstReg = MI.getOperand(0).getReg();
2193   Register SrcReg = MI.getOperand(1).getReg();
2194   LLT Ty = MRI.getType(DstReg);
2195   unsigned Flags = MI.getFlags();
2196 
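  // The hardware sin/cos intrinsics expect the input pre-scaled by 1/(2*pi).
  // On subtargets where v_sin/v_cos have a reduced valid input range, an
  // explicit fract is presumably needed to bring the scaled value into range.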
2197   Register TrigVal;
2198   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2199   if (ST.hasTrigReducedRange()) {
2200     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2201     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2202       .addUse(MulVal.getReg(0))
2203       .setMIFlags(Flags).getReg(0);
2204   } else
2205     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2206 
2207   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2208     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2209   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2210     .addUse(TrigVal)
2211     .setMIFlags(Flags);
2212   MI.eraseFromParent();
2213   return true;
2214 }
2215 
2216 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2217                                                   MachineIRBuilder &B,
2218                                                   const GlobalValue *GV,
2219                                                   int64_t Offset,
2220                                                   unsigned GAFlags) const {
2221   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2222   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2223   // to the following code sequence:
2224   //
2225   // For constant address space:
2226   //   s_getpc_b64 s[0:1]
2227   //   s_add_u32 s0, s0, $symbol
2228   //   s_addc_u32 s1, s1, 0
2229   //
2230   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2231   //   a fixup or relocation is emitted to replace $symbol with a literal
2232   //   constant, which is a pc-relative offset from the encoding of the $symbol
2233   //   operand to the global variable.
2234   //
2235   // For global address space:
2236   //   s_getpc_b64 s[0:1]
2237   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2238   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2239   //
2240   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2241   //   fixups or relocations are emitted to replace $symbol@*@lo and
2242   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2243   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2244   //   operand to the global variable.
2245   //
2246   // What we want here is an offset from the value returned by s_getpc
2247   // (which is the address of the s_add_u32 instruction) to the global
2248   // variable, but since the encoding of $symbol starts 4 bytes after the start
2249   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2250   // small. This requires us to add 4 to the global variable offset in order to
2251   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2252   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2253   // instruction.
2254 
2255   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2256 
2257   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2258     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2259 
2260   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2261     .addDef(PCReg);
2262 
2263   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2264   if (GAFlags == SIInstrInfo::MO_NONE)
2265     MIB.addImm(0);
2266   else
2267     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2268 
2269   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2270 
2271   if (PtrTy.getSizeInBits() == 32)
2272     B.buildExtract(DstReg, PCReg, 0);
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2277   MachineInstr &MI, MachineRegisterInfo &MRI,
2278   MachineIRBuilder &B) const {
2279   Register DstReg = MI.getOperand(0).getReg();
2280   LLT Ty = MRI.getType(DstReg);
2281   unsigned AS = Ty.getAddressSpace();
2282 
2283   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2284   MachineFunction &MF = B.getMF();
2285   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2286 
2287   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2288     if (!MFI->isModuleEntryFunction()) {
2289       const Function &Fn = MF.getFunction();
2290       DiagnosticInfoUnsupported BadLDSDecl(
2291         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2292         DS_Warning);
2293       Fn.getContext().diagnose(BadLDSDecl);
2294 
2295       // We currently don't have a way to correctly allocate LDS objects that
2296       // aren't directly associated with a kernel. We do force inlining of
2297       // functions that use local objects. However, if these dead functions are
2298       // not eliminated, we don't want a compile time error. Just emit a warning
2299       // and a trap, since there should be no callable path here.
2300       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2301       B.buildUndef(DstReg);
2302       MI.eraseFromParent();
2303       return true;
2304     }
2305 
2306     // TODO: We could emit code to handle the initialization somewhere.
2307     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2308       const SITargetLowering *TLI = ST.getTargetLowering();
2309       if (!TLI->shouldUseLDSConstAddress(GV)) {
2310         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2311         return true; // Leave in place;
2312       }
2313 
2314       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2315         Type *Ty = GV->getValueType();
2316         // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2317         // zero-sized type in other languages, to declare dynamic shared memory
2318         // whose size is not known at compile time. It is allocated by the
2319         // runtime and placed directly after the statically allocated objects,
2320         // so all such variables share the same offset.
2321         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2322           // Adjust alignment for that dynamic shared memory array.
2323           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2324           LLT S32 = LLT::scalar(32);
2325           auto Sz =
2326               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2327           B.buildIntToPtr(DstReg, Sz);
2328           MI.eraseFromParent();
2329           return true;
2330         }
2331       }
2332 
2333       B.buildConstant(
2334           DstReg,
2335           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2336       MI.eraseFromParent();
2337       return true;
2338     }
2339 
2340     const Function &Fn = MF.getFunction();
2341     DiagnosticInfoUnsupported BadInit(
2342       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2343     Fn.getContext().diagnose(BadInit);
2344     return true;
2345   }
2346 
2347   const SITargetLowering *TLI = ST.getTargetLowering();
2348 
2349   if (TLI->shouldEmitFixup(GV)) {
2350     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2351     MI.eraseFromParent();
2352     return true;
2353   }
2354 
2355   if (TLI->shouldEmitPCReloc(GV)) {
2356     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2357     MI.eraseFromParent();
2358     return true;
2359   }
2360 
2361   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2363 
2364   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2365       MachinePointerInfo::getGOT(MF),
2366       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2367           MachineMemOperand::MOInvariant,
2368       8 /*Size*/, Align(8));
2369 
2370   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2371 
2372   if (Ty.getSizeInBits() == 32) {
2373     // Truncate if this is a 32-bit constant address.
2374     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2375     B.buildExtract(DstReg, Load, 0);
2376   } else
2377     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2378 
2379   MI.eraseFromParent();
2380   return true;
2381 }
2382 
2383 static LLT widenToNextPowerOf2(LLT Ty) {
2384   if (Ty.isVector())
2385     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2386   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2387 }
2388 
2389 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2390                                        MachineInstr &MI) const {
2391   MachineIRBuilder &B = Helper.MIRBuilder;
2392   MachineRegisterInfo &MRI = *B.getMRI();
2393   GISelChangeObserver &Observer = Helper.Observer;
2394 
2395   Register PtrReg = MI.getOperand(1).getReg();
2396   LLT PtrTy = MRI.getType(PtrReg);
2397   unsigned AddrSpace = PtrTy.getAddressSpace();
2398 
2399   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2400     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2401     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2402     Observer.changingInstr(MI);
2403     MI.getOperand(1).setReg(Cast.getReg(0));
2404     Observer.changedInstr(MI);
2405     return true;
2406   }
2407 
2408   Register ValReg = MI.getOperand(0).getReg();
2409   LLT ValTy = MRI.getType(ValReg);
2410 
2411   MachineMemOperand *MMO = *MI.memoperands_begin();
2412   const unsigned ValSize = ValTy.getSizeInBits();
2413   const unsigned MemSize = 8 * MMO->getSize();
2414   const Align MemAlign = MMO->getAlign();
2415   const unsigned AlignInBits = 8 * MemAlign.value();
2416 
2417   // Widen non-power-of-2 loads to the alignment if needed
2418   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2419     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2420 
2421     // This was already the correct extending load result type, so just adjust
2422     // the memory type.
2423     if (WideMemSize == ValSize) {
2424       MachineFunction &MF = B.getMF();
2425 
2426       MachineMemOperand *WideMMO =
2427           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2428       Observer.changingInstr(MI);
2429       MI.setMemRefs(MF, {WideMMO});
2430       Observer.changedInstr(MI);
2431       return true;
2432     }
2433 
2434     // Don't bother handling an edge case that should probably never be produced.
2435     if (ValSize > WideMemSize)
2436       return false;
2437 
2438     LLT WideTy = widenToNextPowerOf2(ValTy);
2439 
2440     Register WideLoad;
2441     if (!WideTy.isVector()) {
2442       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2443       B.buildTrunc(ValReg, WideLoad).getReg(0);
2444     } else {
2445       // Extract the subvector.
2446 
2447       if (isRegisterType(ValTy)) {
2448         // If this a case where G_EXTRACT is legal, use it.
2449         // (e.g. <3 x s32> -> <4 x s32>)
2450         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2451         B.buildExtract(ValReg, WideLoad, 0);
2452       } else {
2453         // For cases where the widened type isn't a nice register value, unmerge
2454         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2455         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2456         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2457         B.setInsertPt(B.getMBB(), MI.getIterator());
2458         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2459       }
2460     }
2461 
2462     MI.eraseFromParent();
2463     return true;
2464   }
2465 
2466   return false;
2467 }
2468 
2469 bool AMDGPULegalizerInfo::legalizeFMad(
2470   MachineInstr &MI, MachineRegisterInfo &MRI,
2471   MachineIRBuilder &B) const {
2472   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2473   assert(Ty.isScalar());
2474 
2475   MachineFunction &MF = B.getMF();
2476   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2477 
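  // The hardware mad/mac instructions flush denormal results, so G_FMAD can
  // only be kept as-is when the relevant denormal mode already flushes;
  // otherwise it is expanded to a separate mul + add by the helper below.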
2478   // TODO: Always legal with future ftz flag.
2479   // FIXME: Do we need just output?
2480   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2481     return true;
2482   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2483     return true;
2484 
2485   MachineIRBuilder HelperBuilder(MI);
2486   GISelObserverWrapper DummyObserver;
2487   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2488   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2489 }
2490 
2491 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2492   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2493   Register DstReg = MI.getOperand(0).getReg();
2494   Register PtrReg = MI.getOperand(1).getReg();
2495   Register CmpVal = MI.getOperand(2).getReg();
2496   Register NewVal = MI.getOperand(3).getReg();
2497 
2498   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2499          "this should not have been custom lowered");
2500 
2501   LLT ValTy = MRI.getType(CmpVal);
2502   LLT VecTy = LLT::vector(2, ValTy);
2503 
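  // The target's cmpxchg pseudo takes the new value and the compare value
  // packed together as a single 2-element vector operand.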
2504   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2505 
2506   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2507     .addDef(DstReg)
2508     .addUse(PtrReg)
2509     .addUse(PackedVal)
2510     .setMemRefs(MI.memoperands());
2511 
2512   MI.eraseFromParent();
2513   return true;
2514 }
2515 
2516 bool AMDGPULegalizerInfo::legalizeFlog(
2517   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2518   Register Dst = MI.getOperand(0).getReg();
2519   Register Src = MI.getOperand(1).getReg();
2520   LLT Ty = B.getMRI()->getType(Dst);
2521   unsigned Flags = MI.getFlags();
2522 
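  // log_b(x) == log2(x) * (1 / log2(b)). The callers pass ln(2) for G_FLOG and
  // ln(2) / ln(10) for G_FLOG10 as Log2BaseInverted.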
2523   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2524   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2525 
2526   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2527   MI.eraseFromParent();
2528   return true;
2529 }
2530 
2531 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2532                                        MachineIRBuilder &B) const {
2533   Register Dst = MI.getOperand(0).getReg();
2534   Register Src = MI.getOperand(1).getReg();
2535   unsigned Flags = MI.getFlags();
2536   LLT Ty = B.getMRI()->getType(Dst);
2537 
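  // exp(x) == exp2(x * log2(e)), so scale the input by log2(e) and use the
  // legal exp2.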
2538   auto K = B.buildFConstant(Ty, numbers::log2e);
2539   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2540   B.buildFExp2(Dst, Mul, Flags);
2541   MI.eraseFromParent();
2542   return true;
2543 }
2544 
2545 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2546                                        MachineIRBuilder &B) const {
2547   Register Dst = MI.getOperand(0).getReg();
2548   Register Src0 = MI.getOperand(1).getReg();
2549   Register Src1 = MI.getOperand(2).getReg();
2550   unsigned Flags = MI.getFlags();
2551   LLT Ty = B.getMRI()->getType(Dst);
2552   const LLT S16 = LLT::scalar(16);
2553   const LLT S32 = LLT::scalar(32);
2554 
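  // pow(x, y) == exp2(y * log2(x)). The multiply uses fmul_legacy, where
  // 0 * anything == 0, which presumably gives saner results than an IEEE
  // multiply when log2(x) is an infinity or y is zero.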
2555   if (Ty == S32) {
2556     auto Log = B.buildFLog2(S32, Src0, Flags);
2557     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2558       .addUse(Log.getReg(0))
2559       .addUse(Src1)
2560       .setMIFlags(Flags);
2561     B.buildFExp2(Dst, Mul, Flags);
2562   } else if (Ty == S16) {
2563     // There's no f16 fmul_legacy, so we need to convert for it.
2564     auto Log = B.buildFLog2(S16, Src0, Flags);
2565     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2566     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2567     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2568       .addUse(Ext0.getReg(0))
2569       .addUse(Ext1.getReg(0))
2570       .setMIFlags(Flags);
2571 
2572     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2573   } else
2574     return false;
2575 
2576   MI.eraseFromParent();
2577   return true;
2578 }
2579 
2580 // Find a source register, ignoring any possible source modifiers.
2581 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2582   Register ModSrc = OrigSrc;
2583   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2584     ModSrc = SrcFNeg->getOperand(1).getReg();
2585     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2586       ModSrc = SrcFAbs->getOperand(1).getReg();
2587   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2588     ModSrc = SrcFAbs->getOperand(1).getReg();
2589   return ModSrc;
2590 }
2591 
2592 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2593                                          MachineRegisterInfo &MRI,
2594                                          MachineIRBuilder &B) const {
2595 
2596   const LLT S1 = LLT::scalar(1);
2597   const LLT S64 = LLT::scalar(64);
2598   Register Dst = MI.getOperand(0).getReg();
2599   Register OrigSrc = MI.getOperand(1).getReg();
2600   unsigned Flags = MI.getFlags();
2601   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2602          "this should not have been custom lowered");
2603 
2604   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2605   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2606   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2607   // V_FRACT bug is:
2608   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2609   //
2610   // Convert floor(x) to (x - fract(x))
2611 
2612   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2613     .addUse(OrigSrc)
2614     .setMIFlags(Flags);
2615 
2616   // Give source modifier matching some assistance before obscuring a foldable
2617   // pattern.
2618 
2619   // TODO: We can avoid the neg on the fract? The input sign to fract
2620   // shouldn't matter?
2621   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2622 
2623   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
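  // 0x3fefffffffffffff is the largest f64 strictly less than 1.0, matching the
  // 0.99999999999999999 clamp described above.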
2624 
2625   Register Min = MRI.createGenericVirtualRegister(S64);
2626 
2627   // We don't need to concern ourselves with the snan handling difference, so
2628   // use the one which will directly select.
2629   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2630   if (MFI->getMode().IEEE)
2631     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2632   else
2633     B.buildFMinNum(Min, Fract, Const, Flags);
2634 
2635   Register CorrectedFract = Min;
2636   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2637     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2638     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2639   }
2640 
2641   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2642   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2643 
2644   MI.eraseFromParent();
2645   return true;
2646 }
2647 
2648 // Turn an illegal packed v2s16 build vector into bit operations.
2649 // TODO: This should probably be a bitcast action in LegalizerHelper.
2650 bool AMDGPULegalizerInfo::legalizeBuildVector(
2651   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2652   Register Dst = MI.getOperand(0).getReg();
2653   const LLT S32 = LLT::scalar(32);
2654   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2655 
2656   Register Src0 = MI.getOperand(1).getReg();
2657   Register Src1 = MI.getOperand(2).getReg();
2658   assert(MRI.getType(Src0) == LLT::scalar(16));
2659 
2660   auto Merge = B.buildMerge(S32, {Src0, Src1});
2661   B.buildBitcast(Dst, Merge);
2662 
2663   MI.eraseFromParent();
2664   return true;
2665 }
2666 
2667 // Check that this is a G_XOR x, -1
2668 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2669   if (MI.getOpcode() != TargetOpcode::G_XOR)
2670     return false;
2671   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2672   return ConstVal && *ConstVal == -1;
2673 }
2674 
2675 // Return the use branch instruction, otherwise null if the usage is invalid.
2676 static MachineInstr *
2677 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2678                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2679   Register CondDef = MI.getOperand(0).getReg();
2680   if (!MRI.hasOneNonDBGUse(CondDef))
2681     return nullptr;
2682 
2683   MachineBasicBlock *Parent = MI.getParent();
2684   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2685 
2686   if (isNot(MRI, *UseMI)) {
2687     Register NegatedCond = UseMI->getOperand(0).getReg();
2688     if (!MRI.hasOneNonDBGUse(NegatedCond))
2689       return nullptr;
2690 
2691     // We're deleting the def of this value, so we need to remove it.
2692     UseMI->eraseFromParent();
2693 
2694     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2695     Negated = true;
2696   }
2697 
2698   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2699     return nullptr;
2700 
2701   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2702   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2703   if (Next == Parent->end()) {
2704     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2705     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2706       return nullptr;
2707     UncondBrTarget = &*NextMBB;
2708   } else {
2709     if (Next->getOpcode() != AMDGPU::G_BR)
2710       return nullptr;
2711     Br = &*Next;
2712     UncondBrTarget = Br->getOperand(0).getMBB();
2713   }
2714 
2715   return UseMI;
2716 }
2717 
2718 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2719                                          const ArgDescriptor *Arg,
2720                                          const TargetRegisterClass *ArgRC,
2721                                          LLT ArgTy) const {
2722   MCRegister SrcReg = Arg->getRegister();
2723   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2724   assert(DstReg.isVirtual() && "Virtual register expected");
2725 
2726   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2727                                              ArgTy);
2728   if (Arg->isMasked()) {
2729     // TODO: Should we try to emit this once in the entry block?
2730     const LLT S32 = LLT::scalar(32);
2731     const unsigned Mask = Arg->getMask();
2732     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2733 
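    // The value is a bitfield packed into SrcReg (e.g. packed work-item IDs),
    // so shift the field down and mask off everything above it.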
2734     Register AndMaskSrc = LiveIn;
2735 
2736     if (Shift != 0) {
2737       auto ShiftAmt = B.buildConstant(S32, Shift);
2738       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2739     }
2740 
2741     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2742   } else {
2743     B.buildCopy(DstReg, LiveIn);
2744   }
2745 
2746   return true;
2747 }
2748 
2749 bool AMDGPULegalizerInfo::loadInputValue(
2750     Register DstReg, MachineIRBuilder &B,
2751     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2752   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2753   const ArgDescriptor *Arg;
2754   const TargetRegisterClass *ArgRC;
2755   LLT ArgTy;
2756   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2757 
2758   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2759     return false; // TODO: Handle these
2760   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2761 }
2762 
2763 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2764     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2765     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2766   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2767     return false;
2768 
2769   MI.eraseFromParent();
2770   return true;
2771 }
2772 
2773 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2774                                        MachineRegisterInfo &MRI,
2775                                        MachineIRBuilder &B) const {
2776   Register Dst = MI.getOperand(0).getReg();
2777   LLT DstTy = MRI.getType(Dst);
2778   LLT S16 = LLT::scalar(16);
2779   LLT S32 = LLT::scalar(32);
2780   LLT S64 = LLT::scalar(64);
2781 
2782   if (DstTy == S16)
2783     return legalizeFDIV16(MI, MRI, B);
2784   if (DstTy == S32)
2785     return legalizeFDIV32(MI, MRI, B);
2786   if (DstTy == S64)
2787     return legalizeFDIV64(MI, MRI, B);
2788 
2789   return false;
2790 }
2791 
2792 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2793                                                   Register DstReg,
2794                                                   Register X,
2795                                                   Register Y,
2796                                                   bool IsDiv) const {
2797   const LLT S1 = LLT::scalar(1);
2798   const LLT S32 = LLT::scalar(32);
2799 
2800   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2801   // algorithm used here.
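  // In short: compute z ~= 2^32 / y from the float reciprocal, refine it with
  // one Newton-Raphson step, then take q = umulh(x, z) and r = x - q * y and
  // apply up to two conditional corrections to the estimates.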
2802 
2803   // Initial estimate of inv(y).
2804   auto FloatY = B.buildUITOFP(S32, Y);
2805   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2806   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
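  // 0x4f7ffffe is 4294966784.0f, just below 2^32; scaling the float reciprocal
  // by it gives a fixed-point estimate of 2^32 / y that still fits in 32 bits.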
2807   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2808   auto Z = B.buildFPTOUI(S32, ScaledY);
2809 
2810   // One round of UNR.
2811   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2812   auto NegYZ = B.buildMul(S32, NegY, Z);
2813   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2814 
2815   // Quotient/remainder estimate.
2816   auto Q = B.buildUMulH(S32, X, Z);
2817   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2818 
2819   // First quotient/remainder refinement.
2820   auto One = B.buildConstant(S32, 1);
2821   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2822   if (IsDiv)
2823     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2824   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2825 
2826   // Second quotient/remainder refinement.
2827   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2828   if (IsDiv)
2829     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2830   else
2831     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2832 }
2833 
2834 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2835                                               MachineRegisterInfo &MRI,
2836                                               MachineIRBuilder &B) const {
2837   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2838   Register DstReg = MI.getOperand(0).getReg();
2839   Register Num = MI.getOperand(1).getReg();
2840   Register Den = MI.getOperand(2).getReg();
2841   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2842   MI.eraseFromParent();
2843   return true;
2844 }
2845 
2846 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2847 //
2848 // Return lo, hi of result
2849 //
2850 // %cvt.lo = G_UITOFP Val.lo
2851 // %cvt.hi = G_UITOFP Val.hi
2852 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2853 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2854 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2855 // %mul2 = G_FMUL %mul1, 2**(-32)
2856 // %trunc = G_INTRINSIC_TRUNC %mul2
2857 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2858 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2859 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2860                                                        Register Val) {
2861   const LLT S32 = LLT::scalar(32);
2862   auto Unmerge = B.buildUnmerge(S32, Val);
2863 
2864   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2865   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2866 
2867   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2868                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2869 
2870   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2871   auto Mul1 =
2872       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2873 
2874   // 2**(-32)
2875   auto Mul2 =
2876       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2877   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2878 
2879   // -(2**32)
2880   auto Mad2 = B.buildFMAD(S32, Trunc,
2881                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2882 
2883   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2884   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2885 
2886   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2887 }
2888 
2889 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2890                                                   Register DstReg,
2891                                                   Register Numer,
2892                                                   Register Denom,
2893                                                   bool IsDiv) const {
2894   const LLT S32 = LLT::scalar(32);
2895   const LLT S64 = LLT::scalar(64);
2896   const LLT S1 = LLT::scalar(1);
2897   Register RcpLo, RcpHi;
2898 
2899   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2900 
2901   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2902 
2903   auto Zero64 = B.buildConstant(S64, 0);
2904   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2905 
2906   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2907   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2908 
2909   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2910   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2911   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2912 
2913   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2914   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2915   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2916   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2917 
2918   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2919   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2920   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2921   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2922   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2923 
2924   auto Zero32 = B.buildConstant(S32, 0);
2925   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2926   auto Add2_HiC =
2927       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2928   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2929   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2930 
2931   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2932   Register NumerLo = UnmergeNumer.getReg(0);
2933   Register NumerHi = UnmergeNumer.getReg(1);
2934 
2935   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2936   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2937   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2938   Register Mul3_Lo = UnmergeMul3.getReg(0);
2939   Register Mul3_Hi = UnmergeMul3.getReg(1);
2940   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2941   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2942   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2943   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2944 
2945   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2946   Register DenomLo = UnmergeDenom.getReg(0);
2947   Register DenomHi = UnmergeDenom.getReg(1);
2948 
2949   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2950   auto C1 = B.buildSExt(S32, CmpHi);
2951 
2952   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2953   auto C2 = B.buildSExt(S32, CmpLo);
2954 
2955   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2956   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2957 
  // TODO: Here and below, portions of the code could be enclosed in an
  // if/endif region. Control flow is currently unconditional, so we emit four
  // selects after the would-be endif to substitute for PHIs.
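  //
  // Illustrative sketch of what the straight-line code below computes (the
  // final quotient/remainder corrections; not the exact MIR):
  //   quot = mulhi3;         rem = sub1;
  //   if (sub1 >= denom)   { quot += 1; rem = sub2 = sub1 - denom;
  //     if (sub2 >= denom) { quot += 1; rem = sub3 = sub2 - denom; } }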
2961 
2962   // if C3 != 0 ...
2963   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2964   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2965   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2966   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2967 
2968   auto One64 = B.buildConstant(S64, 1);
2969   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2970 
2971   auto C4 =
2972       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2973   auto C5 =
2974       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2975   auto C6 = B.buildSelect(
2976       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2977 
2978   // if (C6 != 0)
2979   auto Add4 = B.buildAdd(S64, Add3, One64);
2980   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2981 
2982   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2983   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2984   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2985 
2986   // endif C6
2987   // endif C3
2988 
2989   if (IsDiv) {
2990     auto Sel1 = B.buildSelect(
2991         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2992     B.buildSelect(DstReg,
2993                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2994   } else {
2995     auto Sel2 = B.buildSelect(
2996         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2997     B.buildSelect(DstReg,
2998                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2999   }
3000 }
3001 
3002 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
3003                                             MachineRegisterInfo &MRI,
3004                                             MachineIRBuilder &B) const {
3005   const LLT S64 = LLT::scalar(64);
3006   const LLT S32 = LLT::scalar(32);
3007   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
3008   Register DstReg = MI.getOperand(0).getReg();
3009   Register Num = MI.getOperand(1).getReg();
3010   Register Den = MI.getOperand(2).getReg();
3011   LLT Ty = MRI.getType(DstReg);
3012 
3013   if (Ty == S32)
3014     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3015   else if (Ty == S64)
3016     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3017   else
3018     return false;
3019 
3020   MI.eraseFromParent();
  return true;
}
3024 
3025 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3026                                             MachineRegisterInfo &MRI,
3027                                             MachineIRBuilder &B) const {
3028   const LLT S64 = LLT::scalar(64);
3029   const LLT S32 = LLT::scalar(32);
3030 
3031   Register DstReg = MI.getOperand(0).getReg();
3032   const LLT Ty = MRI.getType(DstReg);
3033   if (Ty != S32 && Ty != S64)
3034     return false;
3035 
3036   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3037 
3038   Register LHS = MI.getOperand(1).getReg();
3039   Register RHS = MI.getOperand(2).getReg();
3040 
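  // Expand signed div/rem in terms of the unsigned expansion. Illustrative
  // sketch of the sequence below:
  //   sign(v) = v >> (bits - 1)               ; all-ones if negative, else 0
  //   |v|     = (v + sign(v)) ^ sign(v)
  //   s       = IsDiv ? sign(lhs) ^ sign(rhs) : sign(lhs)
  //   result  = (udivrem(|lhs|, |rhs|) ^ s) - s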
3041   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3042   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3043   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3044 
3045   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3046   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3047 
3048   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3049   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3050 
3051   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3052   if (Ty == S32)
3053     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3054   else
3055     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3056 
3057   Register Sign;
3058   if (IsDiv)
3059     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3060   else
3061     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3062 
3063   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3064   B.buildSub(DstReg, UDivRem, Sign);
3065 
3066   MI.eraseFromParent();
3067   return true;
3068 }
3069 
3070 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3071                                                  MachineRegisterInfo &MRI,
3072                                                  MachineIRBuilder &B) const {
3073   Register Res = MI.getOperand(0).getReg();
3074   Register LHS = MI.getOperand(1).getReg();
3075   Register RHS = MI.getOperand(2).getReg();
3076   uint16_t Flags = MI.getFlags();
3077   LLT ResTy = MRI.getType(Res);
3078 
3079   const MachineFunction &MF = B.getMF();
3080   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3081                             MI.getFlag(MachineInstr::FmAfn);
3082 
3083   if (!AllowInaccurateRcp)
3084     return false;
3085 
3086   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3087     // 1 / x -> RCP(x)
3088     if (CLHS->isExactlyValue(1.0)) {
3089       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3090         .addUse(RHS)
3091         .setMIFlags(Flags);
3092 
3093       MI.eraseFromParent();
3094       return true;
3095     }
3096 
3097     // -1 / x -> RCP( FNEG(x) )
3098     if (CLHS->isExactlyValue(-1.0)) {
3099       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3100       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3101         .addUse(FNeg.getReg(0))
3102         .setMIFlags(Flags);
3103 
3104       MI.eraseFromParent();
3105       return true;
3106     }
3107   }
3108 
3109   // x / y -> x * (1.0 / y)
3110   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3111     .addUse(RHS)
3112     .setMIFlags(Flags);
3113   B.buildFMul(Res, LHS, RCP, Flags);
3114 
3115   MI.eraseFromParent();
3116   return true;
3117 }
3118 
3119 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3120                                                    MachineRegisterInfo &MRI,
3121                                                    MachineIRBuilder &B) const {
3122   Register Res = MI.getOperand(0).getReg();
3123   Register X = MI.getOperand(1).getReg();
3124   Register Y = MI.getOperand(2).getReg();
3125   uint16_t Flags = MI.getFlags();
3126   LLT ResTy = MRI.getType(Res);
3127 
3128   const MachineFunction &MF = B.getMF();
3129   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3130                             MI.getFlag(MachineInstr::FmAfn);
3131 
3132   if (!AllowInaccurateRcp)
3133     return false;
3134 
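  // Illustrative sketch of the sequence below (not the exact MIR):
  //   r0 = rcp(y)
  //   r1 = fma(fma(-y, r0, 1.0), r0, r0)   ; first Newton-Raphson step
  //   r2 = fma(fma(-y, r1, 1.0), r1, r1)   ; second Newton-Raphson step
  //   q  = x * r2
  //   res = fma(fma(-y, q, x), r2, q)      ; refine the quotient residual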
3135   auto NegY = B.buildFNeg(ResTy, Y);
3136   auto One = B.buildFConstant(ResTy, 1.0);
3137 
3138   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3139     .addUse(Y)
3140     .setMIFlags(Flags);
3141 
3142   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3143   R = B.buildFMA(ResTy, Tmp0, R, R);
3144 
3145   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3146   R = B.buildFMA(ResTy, Tmp1, R, R);
3147 
3148   auto Ret = B.buildFMul(ResTy, X, R);
3149   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3150 
3151   B.buildFMA(Res, Tmp2, R, Ret);
3152   MI.eraseFromParent();
3153   return true;
3154 }
3155 
3156 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3157                                          MachineRegisterInfo &MRI,
3158                                          MachineIRBuilder &B) const {
3159   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3160     return true;
3161 
3162   Register Res = MI.getOperand(0).getReg();
3163   Register LHS = MI.getOperand(1).getReg();
3164   Register RHS = MI.getOperand(2).getReg();
3165 
3166   uint16_t Flags = MI.getFlags();
3167 
3168   LLT S16 = LLT::scalar(16);
3169   LLT S32 = LLT::scalar(32);
3170 
3171   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3172   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3173 
3174   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3175     .addUse(RHSExt.getReg(0))
3176     .setMIFlags(Flags);
3177 
3178   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3179   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3180 
3181   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3182     .addUse(RDst.getReg(0))
3183     .addUse(RHS)
3184     .addUse(LHS)
3185     .setMIFlags(Flags);
3186 
3187   MI.eraseFromParent();
3188   return true;
3189 }
3190 
// Enable or disable FP32 denormal mode. When 'Enable' is true, emit
// instructions that enable FP32 denormals; when false, restore the function's
// default FP32 denormal mode.
3193 static void toggleSPDenormMode(bool Enable,
3194                                MachineIRBuilder &B,
3195                                const GCNSubtarget &ST,
3196                                AMDGPU::SIModeRegisterDefaults Mode) {
3197   // Set SP denorm mode to this value.
3198   unsigned SPDenormMode =
3199     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3200 
3201   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denormal mode while updating FP32 mode.
3203     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3204 
3205     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3206     B.buildInstr(AMDGPU::S_DENORM_MODE)
3207       .addImm(NewDenormModeValue);
3208 
3209   } else {
3210     // Select FP32 bit field in mode register.
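    // That is, roughly hwreg(HW_REG_MODE, 4, 2): offset 4 / width 2 covers the
    // two FP32 denormal-control bits of the MODE register (illustrative
    // reading of the encoding built below).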
3211     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3212                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3213                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3214 
3215     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3216       .addImm(SPDenormMode)
3217       .addImm(SPDenormModeBitField);
3218   }
3219 }
3220 
3221 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3222                                          MachineRegisterInfo &MRI,
3223                                          MachineIRBuilder &B) const {
3224   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3225     return true;
3226 
3227   Register Res = MI.getOperand(0).getReg();
3228   Register LHS = MI.getOperand(1).getReg();
3229   Register RHS = MI.getOperand(2).getReg();
3230   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3231   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3232 
3233   uint16_t Flags = MI.getFlags();
3234 
3235   LLT S32 = LLT::scalar(32);
3236   LLT S1 = LLT::scalar(1);
3237 
3238   auto One = B.buildFConstant(S32, 1.0f);
3239 
3240   auto DenominatorScaled =
3241     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3242       .addUse(LHS)
3243       .addUse(RHS)
3244       .addImm(0)
3245       .setMIFlags(Flags);
3246   auto NumeratorScaled =
3247     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3248       .addUse(LHS)
3249       .addUse(RHS)
3250       .addImm(1)
3251       .setMIFlags(Flags);
3252 
3253   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3254     .addUse(DenominatorScaled.getReg(0))
3255     .setMIFlags(Flags);
3256   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3257 
3258   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3259   // aren't modeled as reading it.
3260   if (!Mode.allFP32Denormals())
3261     toggleSPDenormMode(true, B, ST, Mode);
3262 
3263   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3264   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3265   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3266   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3267   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3268   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3269 
3270   if (!Mode.allFP32Denormals())
3271     toggleSPDenormMode(false, B, ST, Mode);
3272 
3273   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3274     .addUse(Fma4.getReg(0))
3275     .addUse(Fma1.getReg(0))
3276     .addUse(Fma3.getReg(0))
3277     .addUse(NumeratorScaled.getReg(1))
3278     .setMIFlags(Flags);
3279 
3280   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3281     .addUse(Fmas.getReg(0))
3282     .addUse(RHS)
3283     .addUse(LHS)
3284     .setMIFlags(Flags);
3285 
3286   MI.eraseFromParent();
3287   return true;
3288 }
3289 
3290 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3291                                          MachineRegisterInfo &MRI,
3292                                          MachineIRBuilder &B) const {
3293   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3294     return true;
3295 
3296   Register Res = MI.getOperand(0).getReg();
3297   Register LHS = MI.getOperand(1).getReg();
3298   Register RHS = MI.getOperand(2).getReg();
3299 
3300   uint16_t Flags = MI.getFlags();
3301 
3302   LLT S64 = LLT::scalar(64);
3303   LLT S1 = LLT::scalar(1);
3304 
3305   auto One = B.buildFConstant(S64, 1.0);
3306 
3307   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3308     .addUse(LHS)
3309     .addUse(RHS)
3310     .addImm(0)
3311     .setMIFlags(Flags);
3312 
3313   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3314 
3315   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3316     .addUse(DivScale0.getReg(0))
3317     .setMIFlags(Flags);
3318 
3319   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3320   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3321   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3322 
3323   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3324     .addUse(LHS)
3325     .addUse(RHS)
3326     .addImm(1)
3327     .setMIFlags(Flags);
3328 
3329   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3330   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3331   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3332 
3333   Register Scale;
3334   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3337 
3338     LLT S32 = LLT::scalar(32);
3339 
3340     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3341     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3342     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3343     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3344 
3345     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3346                               Scale1Unmerge.getReg(1));
3347     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3348                               Scale0Unmerge.getReg(1));
3349     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3350   } else {
3351     Scale = DivScale1.getReg(1);
3352   }
3353 
3354   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3355     .addUse(Fma4.getReg(0))
3356     .addUse(Fma3.getReg(0))
3357     .addUse(Mul.getReg(0))
3358     .addUse(Scale)
3359     .setMIFlags(Flags);
3360 
3361   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3362     .addUse(Fmas.getReg(0))
3363     .addUse(RHS)
3364     .addUse(LHS)
3365     .setMIFlags(Flags);
3366 
3367   MI.eraseFromParent();
3368   return true;
3369 }
3370 
3371 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3372                                                  MachineRegisterInfo &MRI,
3373                                                  MachineIRBuilder &B) const {
3374   Register Res = MI.getOperand(0).getReg();
3375   Register LHS = MI.getOperand(2).getReg();
3376   Register RHS = MI.getOperand(3).getReg();
3377   uint16_t Flags = MI.getFlags();
3378 
3379   LLT S32 = LLT::scalar(32);
3380   LLT S1 = LLT::scalar(1);
3381 
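  // Scale the denominator to keep the reciprocal in range. Illustrative sketch
  // of the sequence below (0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as float
  // bit patterns):
  //   s   = |rhs| > 2^96 ? 2^-32 : 1.0
  //   res = s * (lhs * rcp(rhs * s))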
3382   auto Abs = B.buildFAbs(S32, RHS, Flags);
3383   const APFloat C0Val(1.0f);
3384 
3385   auto C0 = B.buildConstant(S32, 0x6f800000);
3386   auto C1 = B.buildConstant(S32, 0x2f800000);
3387   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3388 
3389   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3390   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3391 
3392   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3393 
3394   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3395     .addUse(Mul0.getReg(0))
3396     .setMIFlags(Flags);
3397 
3398   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3399 
3400   B.buildFMul(Res, Sel, Mul1, Flags);
3401 
3402   MI.eraseFromParent();
3403   return true;
3404 }
3405 
3406 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3407 // FIXME: Why do we handle this one but not other removed instructions?
3408 //
3409 // Reciprocal square root.  The clamp prevents infinite results, clamping
3410 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3411 // +-max_float.
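//
// Expanded here (illustrative sketch) as:
//   clamp(rsq(x)) = fmaxnum(fminnum(rsq(x), +max_float), -max_float)
// using the IEEE or non-IEEE min/max variants depending on the function's
// FP mode.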
3412 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3413                                                     MachineRegisterInfo &MRI,
3414                                                     MachineIRBuilder &B) const {
3415   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3416     return true;
3417 
3418   Register Dst = MI.getOperand(0).getReg();
3419   Register Src = MI.getOperand(2).getReg();
3420   auto Flags = MI.getFlags();
3421 
3422   LLT Ty = MRI.getType(Dst);
3423 
3424   const fltSemantics *FltSemantics;
3425   if (Ty == LLT::scalar(32))
3426     FltSemantics = &APFloat::IEEEsingle();
3427   else if (Ty == LLT::scalar(64))
3428     FltSemantics = &APFloat::IEEEdouble();
3429   else
3430     return false;
3431 
3432   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3433     .addUse(Src)
3434     .setMIFlags(Flags);
3435 
  // We don't need to concern ourselves with the sNaN handling difference, since
  // the rsq result is already quieted (or not), so just use the variant that
  // will directly select for the current mode.
3438   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3439   const bool UseIEEE = MFI->getMode().IEEE;
3440 
3441   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3442   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3443                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3444 
3445   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3446 
3447   if (UseIEEE)
3448     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3449   else
3450     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3451   MI.eraseFromParent();
3452   return true;
3453 }
3454 
3455 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3456   switch (IID) {
3457   case Intrinsic::amdgcn_ds_fadd:
3458     return AMDGPU::G_ATOMICRMW_FADD;
3459   case Intrinsic::amdgcn_ds_fmin:
3460     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3461   case Intrinsic::amdgcn_ds_fmax:
3462     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3463   default:
3464     llvm_unreachable("not a DS FP intrinsic");
3465   }
3466 }
3467 
3468 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3469                                                       MachineInstr &MI,
3470                                                       Intrinsic::ID IID) const {
3471   GISelChangeObserver &Observer = Helper.Observer;
3472   Observer.changingInstr(MI);
3473 
3474   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3475 
3476   // The remaining operands were used to set fields in the MemOperand on
3477   // construction.
3478   for (int I = 6; I > 3; --I)
3479     MI.RemoveOperand(I);
3480 
3481   MI.RemoveOperand(1); // Remove the intrinsic ID.
3482   Observer.changedInstr(MI);
3483   return true;
3484 }
3485 
3486 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3487                                             MachineRegisterInfo &MRI,
3488                                             MachineIRBuilder &B) const {
3489   uint64_t Offset =
3490     ST.getTargetLowering()->getImplicitParameterOffset(
3491       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3492   LLT DstTy = MRI.getType(DstReg);
3493   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3494 
3495   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3496   if (!loadInputValue(KernargPtrReg, B,
3497                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3498     return false;
3499 
3500   // FIXME: This should be nuw
3501   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3502   return true;
3503 }
3504 
3505 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3506                                                  MachineRegisterInfo &MRI,
3507                                                  MachineIRBuilder &B) const {
3508   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3509   if (!MFI->isEntryFunction()) {
3510     return legalizePreloadedArgIntrin(MI, MRI, B,
3511                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3512   }
3513 
3514   Register DstReg = MI.getOperand(0).getReg();
3515   if (!getImplicitArgPtr(DstReg, MRI, B))
3516     return false;
3517 
3518   MI.eraseFromParent();
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3523                                               MachineRegisterInfo &MRI,
3524                                               MachineIRBuilder &B,
3525                                               unsigned AddrSpace) const {
3526   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3527   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3528   Register Hi32 = Unmerge.getReg(1);
3529 
3530   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3531   MI.eraseFromParent();
3532   return true;
3533 }
3534 
3535 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3536 // offset (the offset that is included in bounds checking and swizzling, to be
3537 // split between the instruction's voffset and immoffset fields) and soffset
3538 // (the offset that is excluded from bounds checking and swizzling, to go in
3539 // the instruction's soffset field).  This function takes the first kind of
3540 // offset and figures out how to split it between voffset and immoffset.
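//
// For example (illustrative): a combined offset of 4100 is split into
// ImmOffset = 4 plus a voffset contribution of 4096, while an offset of 94
// stays entirely in ImmOffset.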
3541 std::tuple<Register, unsigned, unsigned>
3542 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3543                                         Register OrigOffset) const {
3544   const unsigned MaxImm = 4095;
3545   Register BaseReg;
3546   unsigned TotalConstOffset;
3547   const LLT S32 = LLT::scalar(32);
3548   MachineRegisterInfo &MRI = *B.getMRI();
3549 
3550   std::tie(BaseReg, TotalConstOffset) =
3551       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3552 
3553   unsigned ImmOffset = TotalConstOffset;
3554 
3555   // If BaseReg is a pointer, convert it to int.
3556   if (MRI.getType(BaseReg).isPointer())
3557     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3558 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in immoffset and fold the remaining multiple of
  // 4096 into the value that is copied/added for the voffset field; keeping
  // that value a multiple of 4096 gives it a better chance of being CSEd with
  // the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
3566   unsigned Overflow = ImmOffset & ~MaxImm;
3567   ImmOffset -= Overflow;
3568   if ((int32_t)Overflow < 0) {
3569     Overflow += ImmOffset;
3570     ImmOffset = 0;
3571   }
3572 
3573   if (Overflow != 0) {
3574     if (!BaseReg) {
3575       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3576     } else {
3577       auto OverflowVal = B.buildConstant(S32, Overflow);
3578       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3579     }
3580   }
3581 
3582   if (!BaseReg)
3583     BaseReg = B.buildConstant(S32, 0).getReg(0);
3584 
3585   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3586 }
3587 
3588 /// Handle register layout difference for f16 images for some subtargets.
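/// For example (illustrative): on subtargets with unpacked D16 VMEM, a
/// <4 x s16> store payload is rewritten to a <4 x s32> build_vector of
/// any-extended elements; packed subtargets keep the value as-is unless the
/// image-store D16 bug workaround below applies.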
3589 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3590                                              MachineRegisterInfo &MRI,
3591                                              Register Reg,
3592                                              bool ImageStore) const {
3593   const LLT S16 = LLT::scalar(16);
3594   const LLT S32 = LLT::scalar(32);
3595   LLT StoreVT = MRI.getType(Reg);
3596   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3597 
3598   if (ST.hasUnpackedD16VMem()) {
3599     auto Unmerge = B.buildUnmerge(S16, Reg);
3600 
3601     SmallVector<Register, 4> WideRegs;
3602     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3603       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3604 
3605     int NumElts = StoreVT.getNumElements();
3606 
3607     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3608   }
3609 
3610   if (ImageStore && ST.hasImageStoreD16Bug()) {
3611     if (StoreVT.getNumElements() == 2) {
3612       SmallVector<Register, 4> PackedRegs;
3613       Reg = B.buildBitcast(S32, Reg).getReg(0);
3614       PackedRegs.push_back(Reg);
3615       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3616       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3617     }
3618 
3619     if (StoreVT.getNumElements() == 3) {
3620       SmallVector<Register, 4> PackedRegs;
3621       auto Unmerge = B.buildUnmerge(S16, Reg);
3622       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3623         PackedRegs.push_back(Unmerge.getReg(I));
3624       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3625       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3626       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3627     }
3628 
3629     if (StoreVT.getNumElements() == 4) {
3630       SmallVector<Register, 4> PackedRegs;
3631       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3632       auto Unmerge = B.buildUnmerge(S32, Reg);
3633       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3634         PackedRegs.push_back(Unmerge.getReg(I));
3635       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3636       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3637     }
3638 
3639     llvm_unreachable("invalid data type");
3640   }
3641 
3642   return Reg;
3643 }
3644 
3645 Register AMDGPULegalizerInfo::fixStoreSourceType(
3646   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3647   MachineRegisterInfo *MRI = B.getMRI();
3648   LLT Ty = MRI->getType(VData);
3649 
3650   const LLT S16 = LLT::scalar(16);
3651 
  // Fix up illegal register types for i8 and i16 stores by extending to i32.
3653   if (Ty == LLT::scalar(8) || Ty == S16) {
3654     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3655     return AnyExt;
3656   }
3657 
3658   if (Ty.isVector()) {
3659     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3660       if (IsFormat)
3661         return handleD16VData(B, *MRI, VData);
3662     }
3663   }
3664 
3665   return VData;
3666 }
3667 
3668 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3669                                               MachineRegisterInfo &MRI,
3670                                               MachineIRBuilder &B,
3671                                               bool IsTyped,
3672                                               bool IsFormat) const {
3673   Register VData = MI.getOperand(1).getReg();
3674   LLT Ty = MRI.getType(VData);
3675   LLT EltTy = Ty.getScalarType();
3676   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3677   const LLT S32 = LLT::scalar(32);
3678 
3679   VData = fixStoreSourceType(B, VData, IsFormat);
3680   Register RSrc = MI.getOperand(2).getReg();
3681 
3682   MachineMemOperand *MMO = *MI.memoperands_begin();
3683   const int MemSize = MMO->getSize();
3684 
3685   unsigned ImmOffset;
3686   unsigned TotalOffset;
3687 
3688   // The typed intrinsics add an immediate after the registers.
3689   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3690 
3691   // The struct intrinsic variants add one additional operand over raw.
3692   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3693   Register VIndex;
3694   int OpOffset = 0;
3695   if (HasVIndex) {
3696     VIndex = MI.getOperand(3).getReg();
3697     OpOffset = 1;
3698   }
3699 
3700   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3701   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3702 
3703   unsigned Format = 0;
3704   if (IsTyped) {
3705     Format = MI.getOperand(5 + OpOffset).getImm();
3706     ++OpOffset;
3707   }
3708 
3709   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3710 
3711   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3712   if (TotalOffset != 0)
3713     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3714 
3715   unsigned Opc;
3716   if (IsTyped) {
3717     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3718                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3719   } else if (IsFormat) {
3720     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3721                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3722   } else {
3723     switch (MemSize) {
3724     case 1:
3725       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3726       break;
3727     case 2:
3728       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3729       break;
3730     default:
3731       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3732       break;
3733     }
3734   }
3735 
3736   if (!VIndex)
3737     VIndex = B.buildConstant(S32, 0).getReg(0);
3738 
3739   auto MIB = B.buildInstr(Opc)
3740     .addUse(VData)              // vdata
3741     .addUse(RSrc)               // rsrc
3742     .addUse(VIndex)             // vindex
3743     .addUse(VOffset)            // voffset
3744     .addUse(SOffset)            // soffset
3745     .addImm(ImmOffset);         // offset(imm)
3746 
3747   if (IsTyped)
3748     MIB.addImm(Format);
3749 
3750   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3751      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3752      .addMemOperand(MMO);
3753 
3754   MI.eraseFromParent();
3755   return true;
3756 }
3757 
3758 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3759                                              MachineRegisterInfo &MRI,
3760                                              MachineIRBuilder &B,
3761                                              bool IsFormat,
3762                                              bool IsTyped) const {
3763   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3764   MachineMemOperand *MMO = *MI.memoperands_begin();
3765   const int MemSize = MMO->getSize();
3766   const LLT S32 = LLT::scalar(32);
3767 
3768   Register Dst = MI.getOperand(0).getReg();
3769   Register RSrc = MI.getOperand(2).getReg();
3770 
3771   // The typed intrinsics add an immediate after the registers.
3772   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3773 
3774   // The struct intrinsic variants add one additional operand over raw.
3775   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3776   Register VIndex;
3777   int OpOffset = 0;
3778   if (HasVIndex) {
3779     VIndex = MI.getOperand(3).getReg();
3780     OpOffset = 1;
3781   }
3782 
3783   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3784   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3785 
3786   unsigned Format = 0;
3787   if (IsTyped) {
3788     Format = MI.getOperand(5 + OpOffset).getImm();
3789     ++OpOffset;
3790   }
3791 
3792   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3793   unsigned ImmOffset;
3794   unsigned TotalOffset;
3795 
3796   LLT Ty = MRI.getType(Dst);
3797   LLT EltTy = Ty.getScalarType();
3798   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3799   const bool Unpacked = ST.hasUnpackedD16VMem();
3800 
3801   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3802   if (TotalOffset != 0)
3803     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3804 
3805   unsigned Opc;
3806 
3807   if (IsTyped) {
3808     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3809                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3810   } else if (IsFormat) {
3811     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3812                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3813   } else {
3814     switch (MemSize) {
3815     case 1:
3816       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3817       break;
3818     case 2:
3819       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3820       break;
3821     default:
3822       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3823       break;
3824     }
3825   }
3826 
3827   Register LoadDstReg;
3828 
3829   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3830   LLT UnpackedTy = Ty.changeElementSize(32);
3831 
3832   if (IsExtLoad)
3833     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3834   else if (Unpacked && IsD16 && Ty.isVector())
3835     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3836   else
3837     LoadDstReg = Dst;
3838 
3839   if (!VIndex)
3840     VIndex = B.buildConstant(S32, 0).getReg(0);
3841 
3842   auto MIB = B.buildInstr(Opc)
3843     .addDef(LoadDstReg)         // vdata
3844     .addUse(RSrc)               // rsrc
3845     .addUse(VIndex)             // vindex
3846     .addUse(VOffset)            // voffset
3847     .addUse(SOffset)            // soffset
3848     .addImm(ImmOffset);         // offset(imm)
3849 
3850   if (IsTyped)
3851     MIB.addImm(Format);
3852 
3853   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3854      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3855      .addMemOperand(MMO);
3856 
3857   if (LoadDstReg != Dst) {
3858     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3859 
    // The result was widened for an extending load; truncate back to the
    // original type.
3861     if (IsExtLoad)
3862       B.buildTrunc(Dst, LoadDstReg);
3863     else {
3864       // Repack to original 16-bit vector result
3865       // FIXME: G_TRUNC should work, but legalization currently fails
3866       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3867       SmallVector<Register, 4> Repack;
3868       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3869         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3870       B.buildMerge(Dst, Repack);
3871     }
3872   }
3873 
3874   MI.eraseFromParent();
3875   return true;
3876 }
3877 
3878 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3879                                                MachineIRBuilder &B,
3880                                                bool IsInc) const {
3881   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3882                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3883   B.buildInstr(Opc)
3884     .addDef(MI.getOperand(0).getReg())
3885     .addUse(MI.getOperand(2).getReg())
3886     .addUse(MI.getOperand(3).getReg())
3887     .cloneMemRefs(MI);
3888   MI.eraseFromParent();
3889   return true;
3890 }
3891 
3892 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3893   switch (IntrID) {
3894   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3895   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3896     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3897   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3898   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3899     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3900   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3901   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3902     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3903   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3904   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3905     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3906   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3907   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3908     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3909   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3910   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3911     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3912   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3913   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3914     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3915   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3916   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3917     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3918   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3919   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3920     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3921   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3922   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3923     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3924   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3925   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3926     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3927   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3928   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3929     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3930   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3931   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3932     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3933   case Intrinsic::amdgcn_buffer_atomic_fadd:
3934   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3935   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3936     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3937   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3938   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3939     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3940   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3941   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3942     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3943   default:
3944     llvm_unreachable("unhandled atomic opcode");
3945   }
3946 }
3947 
3948 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3949                                                MachineIRBuilder &B,
3950                                                Intrinsic::ID IID) const {
3951   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3952                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3953   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3954 
3955   Register Dst;
3956 
3957   int OpOffset = 0;
3958   if (HasReturn) {
3959     // A few FP atomics do not support return values.
3960     Dst = MI.getOperand(0).getReg();
3961   } else {
3962     OpOffset = -1;
3963   }
3964 
3965   Register VData = MI.getOperand(2 + OpOffset).getReg();
3966   Register CmpVal;
3967 
3968   if (IsCmpSwap) {
3969     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3970     ++OpOffset;
3971   }
3972 
3973   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3974   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3975 
3976   // The struct intrinsic variants add one additional operand over raw.
3977   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3978   Register VIndex;
3979   if (HasVIndex) {
3980     VIndex = MI.getOperand(4 + OpOffset).getReg();
3981     ++OpOffset;
3982   }
3983 
3984   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3985   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3986   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3987 
3988   MachineMemOperand *MMO = *MI.memoperands_begin();
3989 
3990   unsigned ImmOffset;
3991   unsigned TotalOffset;
3992   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3993   if (TotalOffset != 0)
3994     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3995 
3996   if (!VIndex)
3997     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3998 
3999   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4000 
4001   if (HasReturn)
4002     MIB.addDef(Dst);
4003 
4004   MIB.addUse(VData); // vdata
4005 
4006   if (IsCmpSwap)
4007     MIB.addReg(CmpVal);
4008 
4009   MIB.addUse(RSrc)               // rsrc
4010      .addUse(VIndex)             // vindex
4011      .addUse(VOffset)            // voffset
4012      .addUse(SOffset)            // soffset
4013      .addImm(ImmOffset)          // offset(imm)
4014      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4015      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4016      .addMemOperand(MMO);
4017 
4018   MI.eraseFromParent();
4019   return true;
4020 }
4021 
/// Pack the s16-typed address operands of the image intrinsic \p MI into
/// dword-sized <2 x s16> vectors, appending the packed registers to
/// \p PackedAddrs.
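///
/// Illustrative example: the s16 coordinates (x, y) of a 2D a16 sample are
/// packed pairwise into a single <2 x s16> build_vector; a trailing odd
/// coordinate is padded with an undef s16 lane.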
4024 static void packImageA16AddressToDwords(
4025     MachineIRBuilder &B, MachineInstr &MI,
4026     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4027     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4028   const LLT S16 = LLT::scalar(16);
4029   const LLT V2S16 = LLT::vector(2, 16);
4030 
4031   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4032     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4033     if (!SrcOp.isReg())
4034       continue; // _L to _LZ may have eliminated this.
4035 
4036     Register AddrReg = SrcOp.getReg();
4037 
4038     if (I < Intr->GradientStart) {
4039       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4040       PackedAddrs.push_back(AddrReg);
4041     } else {
4042       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4043       // derivatives dx/dh and dx/dv are packed with undef.
4044       if (((I + 1) >= EndIdx) ||
4045           ((Intr->NumGradients / 2) % 2 == 1 &&
4046            (I == static_cast<unsigned>(Intr->GradientStart +
4047                                        (Intr->NumGradients / 2) - 1) ||
4048             I == static_cast<unsigned>(Intr->GradientStart +
4049                                        Intr->NumGradients - 1))) ||
4050           // Check for _L to _LZ optimization
4051           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4052         PackedAddrs.push_back(
4053             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4054                 .getReg(0));
4055       } else {
4056         PackedAddrs.push_back(
4057             B.buildBuildVector(
4058                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4059                 .getReg(0));
4060         ++I;
4061       }
4062     }
4063   }
4064 }
4065 
4066 /// Convert from separate vaddr components to a single vector address register,
4067 /// and replace the remaining operands with $noreg.
4068 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4069                                      int DimIdx, int NumVAddrs) {
4070   const LLT S32 = LLT::scalar(32);
4071 
4072   SmallVector<Register, 8> AddrRegs;
4073   for (int I = 0; I != NumVAddrs; ++I) {
4074     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4075     if (SrcOp.isReg()) {
4076       AddrRegs.push_back(SrcOp.getReg());
4077       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4078     }
4079   }
4080 
4081   int NumAddrRegs = AddrRegs.size();
4082   if (NumAddrRegs != 1) {
4083     // Round up to 8 elements for v5-v7
4084     // FIXME: Missing intermediate sized register classes and instructions.
4085     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4086       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4087       auto Undef = B.buildUndef(S32);
4088       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4089       NumAddrRegs = RoundedNumRegs;
4090     }
4091 
4092     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4093     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4094   }
4095 
4096   for (int I = 1; I != NumVAddrs; ++I) {
4097     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4098     if (SrcOp.isReg())
4099       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4100   }
4101 }
4102 
4103 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4104 ///
4105 /// Depending on the subtarget, load/store with 16-bit element data need to be
4106 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4107 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4108 /// registers.
4109 ///
4110 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4115 /// now unnecessary arguments with $noreg.
4116 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4117     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4118     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4119 
4120   const unsigned NumDefs = MI.getNumExplicitDefs();
4121   const unsigned ArgOffset = NumDefs + 1;
4122   bool IsTFE = NumDefs == 2;
4123   // We are only processing the operands of d16 image operations on subtargets
4124   // that use the unpacked register layout, or need to repack the TFE result.
4125 
4126   // TODO: Do we need to guard against already legalized intrinsics?
4127   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4128       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4129 
4130   MachineRegisterInfo *MRI = B.getMRI();
4131   const LLT S32 = LLT::scalar(32);
4132   const LLT S16 = LLT::scalar(16);
4133   const LLT V2S16 = LLT::vector(2, 16);
4134 
4135   unsigned DMask = 0;
4136 
4137   // Check for 16 bit addresses and pack if true.
4138   LLT GradTy =
4139       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4140   LLT AddrTy =
4141       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4142   const bool IsG16 = GradTy == S16;
4143   const bool IsA16 = AddrTy == S16;
4144 
4145   int DMaskLanes = 0;
4146   if (!BaseOpcode->Atomic) {
4147     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4148     if (BaseOpcode->Gather4) {
4149       DMaskLanes = 4;
4150     } else if (DMask != 0) {
4151       DMaskLanes = countPopulation(DMask);
4152     } else if (!IsTFE && !BaseOpcode->Store) {
4153       // If dmask is 0, this is a no-op load. This can be eliminated.
4154       B.buildUndef(MI.getOperand(0));
4155       MI.eraseFromParent();
4156       return true;
4157     }
4158   }
4159 
4160   Observer.changingInstr(MI);
4161   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4162 
4163   unsigned NewOpcode = NumDefs == 0 ?
4164     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4165 
4166   // Track that we legalized this
4167   MI.setDesc(B.getTII().get(NewOpcode));
4168 
  // We expect to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4171   if (IsTFE && DMask == 0) {
4172     DMask = 0x1;
4173     DMaskLanes = 1;
4174     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4175   }
4176 
4177   if (BaseOpcode->Atomic) {
4178     Register VData0 = MI.getOperand(2).getReg();
4179     LLT Ty = MRI->getType(VData0);
4180 
4181     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4182     if (Ty.isVector())
4183       return false;
4184 
4185     if (BaseOpcode->AtomicX2) {
4186       Register VData1 = MI.getOperand(3).getReg();
4187       // The two values are packed in one register.
4188       LLT PackedTy = LLT::vector(2, Ty);
4189       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4190       MI.getOperand(2).setReg(Concat.getReg(0));
4191       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4192     }
4193   }
4194 
4195   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4196 
  // Optimize _L to _LZ when the LOD is zero.
4198   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4199           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4200     const ConstantFP *ConstantLod;
4201 
4202     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4203                  m_GFCst(ConstantLod))) {
4204       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4205         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4206         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4207             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4208                                                       Intr->Dim);
4209 
4210         // The starting indexes should remain in the same place.
4211         --CorrectedNumVAddrs;
4212 
4213         MI.getOperand(MI.getNumExplicitDefs())
4214             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4215         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4216         Intr = NewImageDimIntr;
4217       }
4218     }
4219   }
4220 
  // Optimize _mip away when 'lod' is zero.
4222   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4223     int64_t ConstantLod;
4224     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4225                  m_ICst(ConstantLod))) {
4226       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
4229         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4230         --CorrectedNumVAddrs;
4231       }
4232     }
4233   }
4234 
4235   // Rewrite the addressing register layout before doing anything else.
4236   if (IsA16 || IsG16) {
4237     if (IsA16) {
4238       // Target must support the feature and gradients need to be 16 bit too
4239       if (!ST.hasA16() || !IsG16)
4240         return false;
4241     } else if (!ST.hasG16())
4242       return false;
4243 
4244     if (Intr->NumVAddrs > 1) {
4245       SmallVector<Register, 4> PackedRegs;
4246       // Don't compress addresses for G16
4247       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4248       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4249                                   PackEndIdx);
4250 
4251       if (!IsA16) {
4252         // Add uncompressed address
4253         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
          Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4255           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4256           PackedRegs.push_back(AddrReg);
4257         }
4258       }
4259 
4260       // See also below in the non-a16 branch
4261       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4262 
4263       if (!UseNSA && PackedRegs.size() > 1) {
4264         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4265         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4266         PackedRegs[0] = Concat.getReg(0);
4267         PackedRegs.resize(1);
4268       }
4269 
4270       const unsigned NumPacked = PackedRegs.size();
4271       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4272         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4273         if (!SrcOp.isReg()) {
4274           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4275           continue;
4276         }
4277 
4278         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4279 
4280         if (I - Intr->VAddrStart < NumPacked)
4281           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4282         else
4283           SrcOp.setReg(AMDGPU::NoRegister);
4284       }
4285     }
4286   } else {
4287     // If the register allocator cannot place the address registers contiguously
4288     // without introducing moves, then using the non-sequential address encoding
4289     // is always preferable, since it saves VALU instructions and is usually a
4290     // wash in terms of code size or even better.
4291     //
4292     // However, we currently have no way of hinting to the register allocator
4293     // that MIMG addresses should be placed contiguously when it is possible to
4294     // do so, so force non-NSA for the common 2-address case as a heuristic.
4295     //
4296     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4297     // allocation when possible.
4298     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4299 
4300     if (!UseNSA && Intr->NumVAddrs > 1)
4301       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4302                                Intr->NumVAddrs);
4303   }
4304 
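  // Append a flags operand recording whether the addresses (bit 0) and/or the
  // gradients (bit 1) are 16-bit.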
4305   int Flags = 0;
4306   if (IsA16)
4307     Flags |= 1;
4308   if (IsG16)
4309     Flags |= 2;
4310   MI.addOperand(MachineOperand::CreateImm(Flags));
4311 
4312   if (BaseOpcode->Store) { // No TFE for stores?
4313     // TODO: Handle dmask trim
4314     Register VData = MI.getOperand(1).getReg();
4315     LLT Ty = MRI->getType(VData);
4316     if (!Ty.isVector() || Ty.getElementType() != S16)
4317       return true;
4318 
4319     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4320     if (RepackedReg != VData) {
4321       MI.getOperand(1).setReg(RepackedReg);
4322     }
4323 
4324     return true;
4325   }
4326 
4327   Register DstReg = MI.getOperand(0).getReg();
4328   LLT Ty = MRI->getType(DstReg);
4329   const LLT EltTy = Ty.getScalarType();
4330   const bool IsD16 = Ty.getScalarType() == S16;
4331   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4332 
4333   // Confirm that the return type is large enough for the dmask specified
4334   if (NumElts < DMaskLanes)
4335     return false;
4336 
4337   if (NumElts > 4 || DMaskLanes > 4)
4338     return false;
4339 
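  // A dmask of 0 is still treated as loading a single element.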
4340   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4341   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4342 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4346   LLT RoundedTy;
4347 
  // S32 vector to cover all data, plus the TFE result element.
4349   LLT TFETy;
4350 
4351   // Register type to use for each loaded component. Will be S32 or V2S16.
4352   LLT RegTy;
4353 
4354   if (IsD16 && ST.hasUnpackedD16VMem()) {
4355     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4356     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4357     RegTy = S32;
4358   } else {
4359     unsigned EltSize = EltTy.getSizeInBits();
4360     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4361     unsigned RoundedSize = 32 * RoundedElts;
4362     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4363     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4364     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4365   }
4366 
4367   // The return type does not need adjustment.
4368   // TODO: Should we change s16 case to s32 or <2 x s16>?
4369   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4370     return true;
4371 
4372   Register Dst1Reg;
4373 
4374   // Insert after the instruction.
4375   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4376 
4377   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4378   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4379   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
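  // Number of 32-bit registers produced by the raw load.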
4380   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4381 
4382   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4383 
4384   MI.getOperand(0).setReg(NewResultReg);
4385 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4390 
4391   if (IsTFE) {
4392     Dst1Reg = MI.getOperand(1).getReg();
4393     if (MRI->getType(Dst1Reg) != S32)
4394       return false;
4395 
4396     // TODO: Make sure the TFE operand bit is set.
4397     MI.RemoveOperand(1);
4398 
4399     // Handle the easy case that requires no repack instructions.
4400     if (Ty == S32) {
4401       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4402       return true;
4403     }
4404   }
4405 
4406   // Now figure out how to copy the new result register back into the old
4407   // result.
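  //
  // Every element starts as Dst1Reg so that, for TFE, the trailing status
  // dword is unmerged directly into its final register.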
4408   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4409 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4411 
4412   if (ResultNumRegs == 1) {
4413     assert(!IsTFE);
4414     ResultRegs[0] = NewResultReg;
4415   } else {
4416     // We have to repack into a new vector of some kind.
4417     for (int I = 0; I != NumDataRegs; ++I)
4418       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4419     B.buildUnmerge(ResultRegs, NewResultReg);
4420 
4421     // Drop the final TFE element to get the data part. The TFE result is
4422     // directly written to the right place already.
4423     if (IsTFE)
4424       ResultRegs.resize(NumDataRegs);
4425   }
4426 
4427   // For an s16 scalar result, we form an s32 result with a truncate regardless
4428   // of packed vs. unpacked.
4429   if (IsD16 && !Ty.isVector()) {
4430     B.buildTrunc(DstReg, ResultRegs[0]);
4431     return true;
4432   }
4433 
4434   // Avoid a build/concat_vector of 1 entry.
4435   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4436     B.buildBitcast(DstReg, ResultRegs[0]);
4437     return true;
4438   }
4439 
4440   assert(Ty.isVector());
4441 
4442   if (IsD16) {
4443     // For packed D16 results with TFE enabled, all the data components are
4444     // S32. Cast back to the expected type.
4445     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 were used.
4448     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4449       for (Register &Reg : ResultRegs)
4450         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4451     } else if (ST.hasUnpackedD16VMem()) {
4452       for (Register &Reg : ResultRegs)
4453         Reg = B.buildTrunc(S16, Reg).getReg(0);
4454     }
4455   }
4456 
4457   auto padWithUndef = [&](LLT Ty, int NumElts) {
4458     if (NumElts == 0)
4459       return;
4460     Register Undef = B.buildUndef(Ty).getReg(0);
4461     for (int I = 0; I != NumElts; ++I)
4462       ResultRegs.push_back(Undef);
4463   };
4464 
4465   // Pad out any elements eliminated due to the dmask.
4466   LLT ResTy = MRI->getType(ResultRegs[0]);
4467   if (!ResTy.isVector()) {
4468     padWithUndef(ResTy, NumElts - ResultRegs.size());
4469     B.buildBuildVector(DstReg, ResultRegs);
4470     return true;
4471   }
4472 
4473   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
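  // Number of 32-bit registers needed to cover the full result type.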
4474   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4475 
4476   // Deal with the one annoying legal case.
4477   const LLT V3S16 = LLT::vector(3, 16);
4478   if (Ty == V3S16) {
4479     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4480     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4481     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4482     return true;
4483   }
4484 
4485   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4486   B.buildConcatVectors(DstReg, ResultRegs);
4487   return true;
4488 }
4489 
4490 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4491   LegalizerHelper &Helper, MachineInstr &MI) const {
4492   MachineIRBuilder &B = Helper.MIRBuilder;
4493   GISelChangeObserver &Observer = Helper.Observer;
4494 
4495   Register Dst = MI.getOperand(0).getReg();
4496   LLT Ty = B.getMRI()->getType(Dst);
4497   unsigned Size = Ty.getSizeInBits();
4498   MachineFunction &MF = B.getMF();
4499 
4500   Observer.changingInstr(MI);
4501 
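  // If needed, load as an equivalently sized legal type and bitcast back to
  // the original result type.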
4502   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4503     Ty = getBitcastRegisterType(Ty);
4504     Helper.bitcastDst(MI, Ty, 0);
4505     Dst = MI.getOperand(0).getReg();
4506     B.setInsertPt(B.getMBB(), MI);
4507   }
4508 
4509   // FIXME: We don't really need this intermediate instruction. The intrinsic
4510   // should be fixed to have a memory operand. Since it's readnone, we're not
4511   // allowed to add one.
4512   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4513   MI.RemoveOperand(1); // Remove intrinsic ID
4514 
4515   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4516   // TODO: Should this use datalayout alignment?
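  // Round the result size in bits up to whole bytes.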
4517   const unsigned MemSize = (Size + 7) / 8;
4518   const Align MemAlign(4);
4519   MachineMemOperand *MMO = MF.getMachineMemOperand(
4520       MachinePointerInfo(),
4521       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4522           MachineMemOperand::MOInvariant,
4523       MemSize, MemAlign);
4524   MI.addMemOperand(MF, MMO);
4525 
4526   // There are no 96-bit result scalar loads, but widening to 128-bit should
4527   // always be legal. We may need to restore this to a 96-bit result if it turns
4528   // out this needs to be converted to a vector load during RegBankSelect.
4529   if (!isPowerOf2_32(Size)) {
4530     if (Ty.isVector())
4531       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4532     else
4533       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4534   }
4535 
4536   Observer.changedInstr(MI);
4537   return true;
4538 }
4539 
4540 // TODO: Move to selection
4541 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4542                                                 MachineRegisterInfo &MRI,
4543                                                 MachineIRBuilder &B) const {
4544   if (!ST.isTrapHandlerEnabled() ||
4545       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4546     return legalizeTrapEndpgm(MI, MRI, B);
4547 
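  // ABI v2 and v3 pass the queue pointer to the trap handler; v4 only needs
  // it if the target cannot supply the doorbell ID.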
4548   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4549     switch (*HsaAbiVer) {
4550     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4551     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4552       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4553     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4554       return ST.supportsGetDoorbellID() ?
4555           legalizeTrapHsa(MI, MRI, B) :
4556           legalizeTrapHsaQueuePtr(MI, MRI, B);
4557     }
4558   }
4559 
4560   llvm_unreachable("Unknown trap handler");
4561 }
4562 
4563 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4564     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4565   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4566   MI.eraseFromParent();
4567   return true;
4568 }
4569 
4570 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4571     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input, and insert the
  // trap instruction.
4573   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4574   Register LiveIn =
4575     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4576   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4577     return false;
4578 
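  // The trap handler ABI expects the queue pointer in SGPR0_SGPR1.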
4579   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4580   B.buildCopy(SGPR01, LiveIn);
4581   B.buildInstr(AMDGPU::S_TRAP)
4582       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4583       .addReg(SGPR01, RegState::Implicit);
4584 
4585   MI.eraseFromParent();
4586   return true;
4587 }
4588 
4589 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4590     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4591   B.buildInstr(AMDGPU::S_TRAP)
4592       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4593   MI.eraseFromParent();
4594   return true;
4595 }
4596 
4597 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4598     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4601   if (!ST.isTrapHandlerEnabled() ||
4602       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4603     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4604                                      "debugtrap handler not supported",
4605                                      MI.getDebugLoc(), DS_Warning);
4606     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4607     Ctx.diagnose(NoTrap);
4608   } else {
4609     // Insert debug-trap instruction
4610     B.buildInstr(AMDGPU::S_TRAP)
4611         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4612   }
4613 
4614   MI.eraseFromParent();
4615   return true;
4616 }
4617 
4618 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4619                                                MachineIRBuilder &B) const {
4620   MachineRegisterInfo &MRI = *B.getMRI();
4621   const LLT S16 = LLT::scalar(16);
4622   const LLT S32 = LLT::scalar(32);
4623 
4624   Register DstReg = MI.getOperand(0).getReg();
4625   Register NodePtr = MI.getOperand(2).getReg();
4626   Register RayExtent = MI.getOperand(3).getReg();
4627   Register RayOrigin = MI.getOperand(4).getReg();
4628   Register RayDir = MI.getOperand(5).getReg();
4629   Register RayInvDir = MI.getOperand(6).getReg();
4630   Register TDescr = MI.getOperand(7).getReg();
4631 
4632   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
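  // Pick the NSA intersect ray opcode matching the node pointer width and
  // whether the ray direction components are 16-bit.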
4634   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4635                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4636                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4637                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4638 
4639   SmallVector<Register, 12> Ops;
4640   if (Is64) {
4641     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4642     Ops.push_back(Unmerge.getReg(0));
4643     Ops.push_back(Unmerge.getReg(1));
4644   } else {
4645     Ops.push_back(NodePtr);
4646   }
4647   Ops.push_back(RayExtent);
4648 
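  // Split a four-element vector operand and push its first three 32-bit lanes.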
4649   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4650     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4651     Ops.push_back(Unmerge.getReg(0));
4652     Ops.push_back(Unmerge.getReg(1));
4653     Ops.push_back(Unmerge.getReg(2));
4654   };
4655 
4656   packLanes(RayOrigin);
4657   if (IsA16) {
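    // Pack the six used 16-bit lanes (three from RayDir, three from RayInvDir)
    // into three dwords.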
4658     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4659     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4660     Register R1 = MRI.createGenericVirtualRegister(S32);
4661     Register R2 = MRI.createGenericVirtualRegister(S32);
4662     Register R3 = MRI.createGenericVirtualRegister(S32);
4663     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4664     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4665     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4666     Ops.push_back(R1);
4667     Ops.push_back(R2);
4668     Ops.push_back(R3);
4669   } else {
4670     packLanes(RayDir);
4671     packLanes(RayInvDir);
4672   }
4673 
4674   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4675     .addDef(DstReg)
4676     .addImm(Opcode);
4677 
4678   for (Register R : Ops) {
4679     MIB.addUse(R);
4680   }
4681 
4682   MIB.addUse(TDescr)
4683      .addImm(IsA16 ? 1 : 0)
4684      .cloneMemRefs(MI);
4685 
4686   MI.eraseFromParent();
4687   return true;
4688 }
4689 
4690 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4691                                             MachineInstr &MI) const {
4692   MachineIRBuilder &B = Helper.MIRBuilder;
4693   MachineRegisterInfo &MRI = *B.getMRI();
4694 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4696   auto IntrID = MI.getIntrinsicID();
4697   switch (IntrID) {
4698   case Intrinsic::amdgcn_if:
4699   case Intrinsic::amdgcn_else: {
4700     MachineInstr *Br = nullptr;
4701     MachineBasicBlock *UncondBrTarget = nullptr;
4702     bool Negated = false;
4703     if (MachineInstr *BrCond =
4704             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4705       const SIRegisterInfo *TRI
4706         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4707 
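      // Operand 1 is the def and operand 3 is the source; both are lane masks
      // and are constrained to the wave mask register class below.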
4708       Register Def = MI.getOperand(1).getReg();
4709       Register Use = MI.getOperand(3).getReg();
4710 
4711       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4712 
4713       if (Negated)
4714         std::swap(CondBrTarget, UncondBrTarget);
4715 
4716       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4717       if (IntrID == Intrinsic::amdgcn_if) {
4718         B.buildInstr(AMDGPU::SI_IF)
4719           .addDef(Def)
4720           .addUse(Use)
4721           .addMBB(UncondBrTarget);
4722       } else {
4723         B.buildInstr(AMDGPU::SI_ELSE)
4724             .addDef(Def)
4725             .addUse(Use)
4726             .addMBB(UncondBrTarget);
4727       }
4728 
4729       if (Br) {
4730         Br->getOperand(0).setMBB(CondBrTarget);
4731       } else {
4732         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4733         // since we're swapping branch targets it needs to be reinserted.
4734         // FIXME: IRTranslator should probably not do this
4735         B.buildBr(*CondBrTarget);
4736       }
4737 
4738       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4739       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4740       MI.eraseFromParent();
4741       BrCond->eraseFromParent();
4742       return true;
4743     }
4744 
4745     return false;
4746   }
4747   case Intrinsic::amdgcn_loop: {
4748     MachineInstr *Br = nullptr;
4749     MachineBasicBlock *UncondBrTarget = nullptr;
4750     bool Negated = false;
4751     if (MachineInstr *BrCond =
4752             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4753       const SIRegisterInfo *TRI
4754         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4755 
4756       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4757       Register Reg = MI.getOperand(2).getReg();
4758 
4759       if (Negated)
4760         std::swap(CondBrTarget, UncondBrTarget);
4761 
4762       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4763       B.buildInstr(AMDGPU::SI_LOOP)
4764         .addUse(Reg)
4765         .addMBB(UncondBrTarget);
4766 
4767       if (Br)
4768         Br->getOperand(0).setMBB(CondBrTarget);
4769       else
4770         B.buildBr(*CondBrTarget);
4771 
4772       MI.eraseFromParent();
4773       BrCond->eraseFromParent();
4774       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4775       return true;
4776     }
4777 
4778     return false;
4779   }
4780   case Intrinsic::amdgcn_kernarg_segment_ptr:
4781     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4782       // This only makes sense to call in a kernel, so just lower to null.
4783       B.buildConstant(MI.getOperand(0).getReg(), 0);
4784       MI.eraseFromParent();
4785       return true;
4786     }
4787 
4788     return legalizePreloadedArgIntrin(
4789       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4790   case Intrinsic::amdgcn_implicitarg_ptr:
4791     return legalizeImplicitArgPtr(MI, MRI, B);
4792   case Intrinsic::amdgcn_workitem_id_x:
4793     return legalizePreloadedArgIntrin(MI, MRI, B,
4794                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4795   case Intrinsic::amdgcn_workitem_id_y:
4796     return legalizePreloadedArgIntrin(MI, MRI, B,
4797                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4798   case Intrinsic::amdgcn_workitem_id_z:
4799     return legalizePreloadedArgIntrin(MI, MRI, B,
4800                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4801   case Intrinsic::amdgcn_workgroup_id_x:
4802     return legalizePreloadedArgIntrin(MI, MRI, B,
4803                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4804   case Intrinsic::amdgcn_workgroup_id_y:
4805     return legalizePreloadedArgIntrin(MI, MRI, B,
4806                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4807   case Intrinsic::amdgcn_workgroup_id_z:
4808     return legalizePreloadedArgIntrin(MI, MRI, B,
4809                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4810   case Intrinsic::amdgcn_dispatch_ptr:
4811     return legalizePreloadedArgIntrin(MI, MRI, B,
4812                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4813   case Intrinsic::amdgcn_queue_ptr:
4814     return legalizePreloadedArgIntrin(MI, MRI, B,
4815                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4816   case Intrinsic::amdgcn_implicit_buffer_ptr:
4817     return legalizePreloadedArgIntrin(
4818       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4819   case Intrinsic::amdgcn_dispatch_id:
4820     return legalizePreloadedArgIntrin(MI, MRI, B,
4821                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4822   case Intrinsic::amdgcn_fdiv_fast:
4823     return legalizeFDIVFastIntrin(MI, MRI, B);
4824   case Intrinsic::amdgcn_is_shared:
4825     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4826   case Intrinsic::amdgcn_is_private:
4827     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4828   case Intrinsic::amdgcn_wavefrontsize: {
4829     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4830     MI.eraseFromParent();
4831     return true;
4832   }
4833   case Intrinsic::amdgcn_s_buffer_load:
4834     return legalizeSBufferLoad(Helper, MI);
4835   case Intrinsic::amdgcn_raw_buffer_store:
4836   case Intrinsic::amdgcn_struct_buffer_store:
4837     return legalizeBufferStore(MI, MRI, B, false, false);
4838   case Intrinsic::amdgcn_raw_buffer_store_format:
4839   case Intrinsic::amdgcn_struct_buffer_store_format:
4840     return legalizeBufferStore(MI, MRI, B, false, true);
4841   case Intrinsic::amdgcn_raw_tbuffer_store:
4842   case Intrinsic::amdgcn_struct_tbuffer_store:
4843     return legalizeBufferStore(MI, MRI, B, true, true);
4844   case Intrinsic::amdgcn_raw_buffer_load:
4845   case Intrinsic::amdgcn_struct_buffer_load:
4846     return legalizeBufferLoad(MI, MRI, B, false, false);
4847   case Intrinsic::amdgcn_raw_buffer_load_format:
4848   case Intrinsic::amdgcn_struct_buffer_load_format:
4849     return legalizeBufferLoad(MI, MRI, B, true, false);
4850   case Intrinsic::amdgcn_raw_tbuffer_load:
4851   case Intrinsic::amdgcn_struct_tbuffer_load:
4852     return legalizeBufferLoad(MI, MRI, B, true, true);
4853   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4854   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4855   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4856   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4857   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4858   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4859   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4860   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4861   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4862   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4863   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4864   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4865   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4866   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4867   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4868   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4869   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4870   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4871   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4872   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4873   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4874   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4875   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4876   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4877   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4878   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4879   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4880   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4881   case Intrinsic::amdgcn_buffer_atomic_fadd:
4882   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4883   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4884   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4885   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4886     return legalizeBufferAtomic(MI, B, IntrID);
4887   case Intrinsic::amdgcn_atomic_inc:
4888     return legalizeAtomicIncDec(MI, B, true);
4889   case Intrinsic::amdgcn_atomic_dec:
4890     return legalizeAtomicIncDec(MI, B, false);
4891   case Intrinsic::trap:
4892     return legalizeTrapIntrinsic(MI, MRI, B);
4893   case Intrinsic::debugtrap:
4894     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4895   case Intrinsic::amdgcn_rsq_clamp:
4896     return legalizeRsqClampIntrinsic(MI, MRI, B);
4897   case Intrinsic::amdgcn_ds_fadd:
4898   case Intrinsic::amdgcn_ds_fmin:
4899   case Intrinsic::amdgcn_ds_fmax:
4900     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4901   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4902     return legalizeBVHIntrinsic(MI, B);
4903   default: {
4904     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4905             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4906       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4907     return true;
4908   }
4909   }
4910 
4911   return true;
4912 }
4913