1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
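
// A minimal sketch of what the two helpers above compute (values derived from
// the definitions, shown only for illustration):
//   getPow2VectorType(<3 x s16>) -> <4 x s16>   getPow2ScalarType(s48) -> s64
//   getPow2VectorType(<5 x s32>) -> <8 x s32>   getPow2ScalarType(s96) -> s128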
61 
62 /// \returns true if this is an odd-sized vector that should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> ->
64 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
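
// For illustration (derived from the predicate above, not an exhaustive list):
// <3 x s16> and <5 x s8> count as small odd vectors, while <3 x s32> (already
// a multiple of 32 bits), <2 x s16> (even element count) and <3 x s1>
// (excluded s1 elements) do not.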
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
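
// Sketch of the mutation above (illustrative): <4 x s32> (128 bits) splits
// into 64-bit pieces, i.e. Pieces = 2 and NewNumElts = (4 + 1) / 2 = 2, giving
// <2 x s32>; likewise <8 x s16> becomes <4 x s16>. As used below for
// G_AND/G_OR/G_XOR it is guarded by vectorWiderThan(0, 64), so types of 64
// bits or less never reach it.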
112 
113 // Increase the number of vector elements so the total size reaches the next
114 // multiple of 32 bits.
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
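
// For example (illustrative): <3 x s16> is 48 bits, so NextMul32 = 2 and
// NewNumElts = (2 * 32 + 15) / 16 = 4, producing <4 x s16>; <5 x s8> (40 bits)
// is padded out to <8 x s8>.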
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
134   LLT CoercedTy;
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
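
// For sizes above 32 bits this bitcasts to 32-bit element vectors, e.g.
// (illustrative): <6 x s16> (96 bits) -> <3 x s32> and <8 x s16> (128 bits)
// -> <4 x s32>.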
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size,
208 // and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
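
// Illustrative examples for isRegisterType (not exhaustive): s32, s96,
// <2 x s16>, <4 x s16> and <2 x s64> qualify; s48 (not a multiple of 32 bits),
// <3 x s16> (48 bits) and s2048 (larger than MaxRegisterSize) do not.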
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247     // Treat constant and global as identical. SMRD loads are sometimes usable
248     // for global loads (ideally constant address space should be eliminated)
249     // depending on the context. Legality cannot be context dependent, but
250     // RegBankSelect can split the load as necessary depending on the pointer
251     // register bank/uniformity and if the memory is invariant or not written
252     // in a kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
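
// A rough sketch of the limits above (illustrative): global/constant loads may
// be up to 512 bits wide while stores are capped at 128 bits, LDS accesses are
// capped at 64 bits unless 128-bit DS instructions are enabled, and private
// accesses are capped at 32 bits, or 128 with flat scratch enabled.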
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
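
// For example (sketch): an extending 8- or 16-bit load into s32 passes the
// size checks (subject to the alignment check), a 96-bit access additionally
// requires dwordx3 support, and an extload of a 16-bit memory value into s64
// is rejected since only extloads into 32-bit registers are allowed.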
323 
324 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
325 // work around this. Eventually it should ignore the type for loads and only
326 // care about the size. Return true in cases where we will work around this for
327 // now by bitcasting.
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
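
// e.g. (illustrative): <6 x s16> and s96 are worked around with a bitcast,
// while s64, <4 x s32> and <2 x s64> are left alone.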
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
357   const unsigned Size = Ty.getSizeInBits();
358   if (Size != MemSizeInBits)
359     return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
367 /// Return true if we should legalize a load by widening an odd-sized memory
368 /// access up to the alignment. Note that in this case the memory access
369 /// itself changes, not the size of the result register.
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
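
// For example (illustrative): a 96-bit load known to be 128-bit aligned is a
// candidate for widening to 128 bits on subtargets without dwordx3
// loads/stores, provided the target reports the resulting access as fast; the
// same load with only 32-bit alignment is never widened here.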
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
415   :  ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
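
  // As an illustration of the G_PHI rules above (not exhaustive): an s8 phi is
  // clamped up to s16, s48 is widened to s64, <3 x s32> is directly legal, and
  // <3 x s16> gains one element to become <4 x s16>.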
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541     // Technically the saturating operations require clamp bit support, but this
542     // was introduced at the same time as 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585     .customFor({S32, S64})
586     .clampScalar(0, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .scalarize(0);
589 
590   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591                    .legalFor({S32})
592                    .maxScalarOrElt(0, S32);
593 
594   if (ST.hasVOP3PInsts()) {
595     Mulh
596       .clampMaxNumElements(0, S8, 2)
597       .lowerFor({V2S8});
598   }
599 
600   Mulh
601     .scalarize(0)
602     .lower();
603 
604   // Report legal for any types we can handle anywhere. For the cases only legal
605   // on the SALU, RegBankSelect will be able to re-legalize.
606   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608     .clampScalar(0, S32, S64)
609     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611     .widenScalarToNextPow2(0)
612     .scalarize(0);
613 
614   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616     .legalFor({{S32, S1}, {S32, S32}})
617     .minScalar(0, S32)
618     // TODO: .scalarize(0)
619     .lower();
620 
621   getActionDefinitionsBuilder(G_BITCAST)
622     // Don't worry about the size constraint.
623     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624     .lower();
625 
626 
627   getActionDefinitionsBuilder(G_CONSTANT)
628     .legalFor({S1, S32, S64, S16, GlobalPtr,
629                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630     .legalIf(isPointer(0))
631     .clampScalar(0, S32, S64)
632     .widenScalarToNextPow2(0);
633 
634   getActionDefinitionsBuilder(G_FCONSTANT)
635     .legalFor({S32, S64, S16})
636     .clampScalar(0, S16, S64);
637 
638   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639       .legalIf(isRegisterType(0))
640       // s1 and s16 are special cases because they have legal operations on
641       // them, but don't really occupy registers in the normal way.
642       .legalFor({S1, S16})
643       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644       .clampScalarOrElt(0, S32, MaxScalar)
645       .widenScalarToNextPow2(0, 32)
646       .clampMaxNumElements(0, S32, 16);
647 
648   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650   // If the amount is divergent, we have to do a wave reduction to get the
651   // maximum value, so this is expanded during RegBankSelect.
652   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653     .legalFor({{PrivatePtr, S32}});
654 
655   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656     .customIf(typeIsNot(0, PrivatePtr));
657 
658   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660   auto &FPOpActions = getActionDefinitionsBuilder(
661     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662     .legalFor({S32, S64});
663   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664     .customFor({S32, S64});
665   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666     .customFor({S32, S64});
667 
668   if (ST.has16BitInsts()) {
669     if (ST.hasVOP3PInsts())
670       FPOpActions.legalFor({S16, V2S16});
671     else
672       FPOpActions.legalFor({S16});
673 
674     TrigActions.customFor({S16});
675     FDIVActions.customFor({S16});
676   }
677 
678   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681   if (ST.hasVOP3PInsts()) {
682     MinNumMaxNum.customFor(FPTypesPK16)
683       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684       .clampMaxNumElements(0, S16, 2)
685       .clampScalar(0, S16, S64)
686       .scalarize(0);
687   } else if (ST.has16BitInsts()) {
688     MinNumMaxNum.customFor(FPTypes16)
689       .clampScalar(0, S16, S64)
690       .scalarize(0);
691   } else {
692     MinNumMaxNum.customFor(FPTypesBase)
693       .clampScalar(0, S32, S64)
694       .scalarize(0);
695   }
696 
697   if (ST.hasVOP3PInsts())
698     FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700   FPOpActions
701     .scalarize(0)
702     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704   TrigActions
705     .scalarize(0)
706     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708   FDIVActions
709     .scalarize(0)
710     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713     .legalFor(FPTypesPK16)
714     .clampMaxNumElements(0, S16, 2)
715     .scalarize(0)
716     .clampScalar(0, S16, S64);
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720       .legalFor({S32, S64, S16})
721       .scalarize(0)
722       .clampScalar(0, S16, S64);
723   } else {
724     getActionDefinitionsBuilder(G_FSQRT)
725       .legalFor({S32, S64})
726       .scalarize(0)
727       .clampScalar(0, S32, S64);
728 
729     if (ST.hasFractBug()) {
730       getActionDefinitionsBuilder(G_FFLOOR)
731         .customFor({S64})
732         .legalFor({S32, S64})
733         .scalarize(0)
734         .clampScalar(0, S32, S64);
735     } else {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     }
741   }
742 
743   getActionDefinitionsBuilder(G_FPTRUNC)
744     .legalFor({{S32, S64}, {S16, S32}})
745     .scalarize(0)
746     .lower();
747 
748   getActionDefinitionsBuilder(G_FPEXT)
749     .legalFor({{S64, S32}, {S32, S16}})
750     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751     .scalarize(0);
752 
753   getActionDefinitionsBuilder(G_FSUB)
754       // Use actual fsub instruction
755       .legalFor({S32})
756       // Must use fadd + fneg
757       .lowerFor({S64, S16, V2S16})
758       .scalarize(0)
759       .clampScalar(0, S32, S64);
760 
761   // Whether this is legal depends on the floating point mode for the function.
762   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764     FMad.customFor({S32, S16});
765   else if (ST.hasMadMacF32Insts())
766     FMad.customFor({S32});
767   else if (ST.hasMadF16())
768     FMad.customFor({S16});
769   FMad.scalarize(0)
770       .lower();
771 
772   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773   if (ST.has16BitInsts()) {
774     FRem.customFor({S16, S32, S64});
775   } else {
776     FRem.minScalar(0, S32)
777         .customFor({S32, S64});
778   }
779   FRem.scalarize(0);
780 
781   // TODO: Do we need to clamp maximum bitwidth?
782   getActionDefinitionsBuilder(G_TRUNC)
783     .legalIf(isScalar(0))
784     .legalFor({{V2S16, V2S32}})
785     .clampMaxNumElements(0, S16, 2)
786     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787     // situations (like an invalid implicit use), we don't want to infinite loop
788     // in the legalizer.
789     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790     .alwaysLegal();
791 
792   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794                {S32, S1}, {S64, S1}, {S16, S1}})
795     .scalarize(0)
796     .clampScalar(0, S32, S64)
797     .widenScalarToNextPow2(1, 32);
798 
799   // TODO: Split s1->s64 during regbankselect for VALU.
800   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802     .lowerFor({{S32, S64}})
803     .lowerIf(typeIs(1, S1))
804     .customFor({{S64, S64}});
805   if (ST.has16BitInsts())
806     IToFP.legalFor({{S16, S16}});
807   IToFP.clampScalar(1, S32, S64)
808        .minScalar(0, S32)
809        .scalarize(0)
810        .widenScalarToNextPow2(1);
811 
812   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814     .customFor({{S64, S64}})
815     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816   if (ST.has16BitInsts())
817     FPToI.legalFor({{S16, S16}});
818   else
819     FPToI.minScalar(1, S32);
820 
821   FPToI.minScalar(0, S32)
822        .scalarize(0)
823        .lower();
824 
825   // Lower roundeven into G_FRINT
826   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
827     .scalarize(0)
828     .lower();
829 
830   if (ST.has16BitInsts()) {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S16, S32, S64})
833       .clampScalar(0, S16, S64)
834       .scalarize(0);
835   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
836     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
837       .legalFor({S32, S64})
838       .clampScalar(0, S32, S64)
839       .scalarize(0);
840   } else {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S32})
843       .customFor({S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   }
847 
848   getActionDefinitionsBuilder(G_PTR_ADD)
849     .legalIf(all(isPointer(0), sameSize(0, 1)))
850     .scalarize(0)
851     .scalarSameSizeAs(1, 0);
852 
853   getActionDefinitionsBuilder(G_PTRMASK)
854     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
855     .scalarSameSizeAs(1, 0)
856     .scalarize(0);
857 
858   auto &CmpBuilder =
859     getActionDefinitionsBuilder(G_ICMP)
860     // The compare output type differs based on the register bank of the output,
861     // so make both s1 and s32 legal.
862     //
863     // Scalar compares producing output in scc will be promoted to s32, as that
864     // is the allocatable register type that will be needed for the copy from
865     // scc. This will be promoted during RegBankSelect, and we assume something
866     // before that won't try to use s32 result types.
867     //
868     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
869     // bank.
870     .legalForCartesianProduct(
871       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
872     .legalForCartesianProduct(
873       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
874   if (ST.has16BitInsts()) {
875     CmpBuilder.legalFor({{S1, S16}});
876   }
877 
878   CmpBuilder
879     .widenScalarToNextPow2(1)
880     .clampScalar(1, S32, S64)
881     .scalarize(0)
882     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
883 
884   getActionDefinitionsBuilder(G_FCMP)
885     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
886     .widenScalarToNextPow2(1)
887     .clampScalar(1, S32, S64)
888     .scalarize(0);
889 
890   // FIXME: fpow has a selection pattern that should move to custom lowering.
891   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
892   if (ST.has16BitInsts())
893     Exp2Ops.legalFor({S32, S16});
894   else
895     Exp2Ops.legalFor({S32});
896   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
897   Exp2Ops.scalarize(0);
898 
899   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
900   if (ST.has16BitInsts())
901     ExpOps.customFor({{S32}, {S16}});
902   else
903     ExpOps.customFor({S32});
904   ExpOps.clampScalar(0, MinScalarFPTy, S32)
905         .scalarize(0);
906 
907   getActionDefinitionsBuilder(G_FPOWI)
908     .clampScalar(0, MinScalarFPTy, S32)
909     .lower();
910 
911   // The 64-bit versions produce 32-bit results, but only on the SALU.
912   getActionDefinitionsBuilder(G_CTPOP)
913     .legalFor({{S32, S32}, {S32, S64}})
914     .clampScalar(0, S32, S32)
915     .clampScalar(1, S32, S64)
916     .scalarize(0)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32);
919 
920   // The hardware instructions return a different result on 0 than the generic
921   // instructions expect. The hardware produces -1, but these produce the
922   // bitwidth.
923   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
924     .scalarize(0)
925     .clampScalar(0, S32, S32)
926     .clampScalar(1, S32, S64)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32)
929     .lower();
930 
931   // The 64-bit versions produce 32-bit results, but only on the SALU.
932   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
933     .legalFor({{S32, S32}, {S32, S64}})
934     .clampScalar(0, S32, S32)
935     .clampScalar(1, S32, S64)
936     .scalarize(0)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32);
939 
940   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
941   // RegBankSelect.
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32, S64})
944     .clampScalar(0, S32, S64)
945     .scalarize(0)
946     .widenScalarToNextPow2(0);
947 
948   if (ST.has16BitInsts()) {
949     getActionDefinitionsBuilder(G_BSWAP)
950       .legalFor({S16, S32, V2S16})
951       .clampMaxNumElements(0, S16, 2)
952       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
953       // narrowScalar limitation.
954       .widenScalarToNextPow2(0)
955       .clampScalar(0, S16, S32)
956       .scalarize(0);
957 
958     if (ST.hasVOP3PInsts()) {
959       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
960         .legalFor({S32, S16, V2S16})
961         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
962         .clampMaxNumElements(0, S16, 2)
963         .minScalar(0, S16)
964         .widenScalarToNextPow2(0)
965         .scalarize(0)
966         .lower();
967     } else {
968       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969         .legalFor({S32, S16})
970         .widenScalarToNextPow2(0)
971         .minScalar(0, S16)
972         .scalarize(0)
973         .lower();
974     }
975   } else {
976     // TODO: Should have same legality without v_perm_b32
977     getActionDefinitionsBuilder(G_BSWAP)
978       .legalFor({S32})
979       .lowerIf(scalarNarrowerThan(0, 32))
980       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
981       // narrowScalar limitation.
982       .widenScalarToNextPow2(0)
983       .maxScalar(0, S32)
984       .scalarize(0)
985       .lower();
986 
987     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
988       .legalFor({S32})
989       .minScalar(0, S32)
990       .widenScalarToNextPow2(0)
991       .scalarize(0)
992       .lower();
993   }
994 
995   getActionDefinitionsBuilder(G_INTTOPTR)
996     // List the common cases
997     .legalForCartesianProduct(AddrSpaces64, {S64})
998     .legalForCartesianProduct(AddrSpaces32, {S32})
999     .scalarize(0)
1000     // Accept any address space as long as the size matches
1001     .legalIf(sameSize(0, 1))
1002     .widenScalarIf(smallerThan(1, 0),
1003       [](const LegalityQuery &Query) {
1004         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1005       })
1006     .narrowScalarIf(largerThan(1, 0),
1007       [](const LegalityQuery &Query) {
1008         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1009       });
1010 
1011   getActionDefinitionsBuilder(G_PTRTOINT)
1012     // List the common cases
1013     .legalForCartesianProduct(AddrSpaces64, {S64})
1014     .legalForCartesianProduct(AddrSpaces32, {S32})
1015     .scalarize(0)
1016     // Accept any address space as long as the size matches
1017     .legalIf(sameSize(0, 1))
1018     .widenScalarIf(smallerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       })
1022     .narrowScalarIf(
1023       largerThan(0, 1),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1029     .scalarize(0)
1030     .custom();
1031 
1032   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1033                                     bool IsLoad) -> bool {
1034     const LLT DstTy = Query.Types[0];
1035 
1036     // Split vector extloads.
1037     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1038     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1039 
1040     if (MemSize < DstTy.getSizeInBits())
1041       MemSize = std::max(MemSize, AlignBits);
1042 
1043     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1044       return true;
1045 
1046     const LLT PtrTy = Query.Types[1];
1047     unsigned AS = PtrTy.getAddressSpace();
1048     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1049       return true;
1050 
1051     // Catch weird sized loads that don't evenly divide into the access sizes
1052     // TODO: May be able to widen depending on alignment etc.
1053     unsigned NumRegs = (MemSize + 31) / 32;
1054     if (NumRegs == 3) {
1055       if (!ST.hasDwordx3LoadStores())
1056         return true;
1057     } else {
1058       // If the alignment allows, these should have been widened.
1059       if (!isPowerOf2_32(NumRegs))
1060         return true;
1061     }
1062 
1063     if (AlignBits < MemSize) {
1064       const SITargetLowering *TLI = ST.getTargetLowering();
1065       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1066                                                       Align(AlignBits / 8));
1067     }
1068 
1069     return false;
1070   };
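
  // A rough illustration of needToSplitMemOp (not exhaustive): a 256-bit
  // private access is split because it exceeds maxSizeForAddrSpace, a 96-bit
  // access is split on subtargets without dwordx3 loads/stores, and an
  // underaligned access is split unless the target reports the misaligned
  // access as allowed.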
1071 
1072   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1073   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1074   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1075 
1076   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1077   // LDS
1078   // TODO: Unsupported flat for SI.
1079 
1080   for (unsigned Op : {G_LOAD, G_STORE}) {
1081     const bool IsStore = Op == G_STORE;
1082 
1083     auto &Actions = getActionDefinitionsBuilder(Op);
1084     // Explicitly list some common cases.
1085     // TODO: Does this help compile time at all?
1086     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1087                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1088                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1089                                       {S64, GlobalPtr, 64, GlobalAlign32},
1090                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1091                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1092                                       {S32, GlobalPtr, 8, GlobalAlign8},
1093                                       {S32, GlobalPtr, 16, GlobalAlign16},
1094 
1095                                       {S32, LocalPtr, 32, 32},
1096                                       {S64, LocalPtr, 64, 32},
1097                                       {V2S32, LocalPtr, 64, 32},
1098                                       {S32, LocalPtr, 8, 8},
1099                                       {S32, LocalPtr, 16, 16},
1100                                       {V2S16, LocalPtr, 32, 32},
1101 
1102                                       {S32, PrivatePtr, 32, 32},
1103                                       {S32, PrivatePtr, 8, 8},
1104                                       {S32, PrivatePtr, 16, 16},
1105                                       {V2S16, PrivatePtr, 32, 32},
1106 
1107                                       {S32, ConstantPtr, 32, GlobalAlign32},
1108                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1109                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1110                                       {S64, ConstantPtr, 64, GlobalAlign32},
1111                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1112     Actions.legalIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return isLoadStoreLegal(ST, Query, Op);
1115       });
1116 
1117     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1118     // 64-bits.
1119     //
1120     // TODO: Should generalize bitcast action into coerce, which will also cover
1121     // inserting addrspacecasts.
1122     Actions.customIf(typeIs(1, Constant32Ptr));
1123 
1124     // Turn any illegal element vectors into something easier to deal
1125     // with. These will ultimately produce 32-bit scalar shifts to extract the
1126     // parts anyway.
1127     //
1128     // For odd 16-bit element vectors, prefer to split those into pieces with
1129     // 16-bit vector parts.
1130     Actions.bitcastIf(
1131       [=](const LegalityQuery &Query) -> bool {
1132         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1133                                           Query.MMODescrs[0].SizeInBits);
1134       }, bitcastToRegisterType(0));
1135 
1136     if (!IsStore) {
1137       // Widen suitably aligned loads by loading extra bytes. The standard
1138       // legalization actions can't properly express widening memory operands.
1139       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1140         return shouldWidenLoad(ST, Query, G_LOAD);
1141       });
1142     }
1143 
1144     // FIXME: load/store narrowing should be moved to lower action
1145     Actions
1146         .narrowScalarIf(
1147             [=](const LegalityQuery &Query) -> bool {
1148               return !Query.Types[0].isVector() &&
1149                      needToSplitMemOp(Query, Op == G_LOAD);
1150             },
1151             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1152               const LLT DstTy = Query.Types[0];
1153               const LLT PtrTy = Query.Types[1];
1154 
1155               const unsigned DstSize = DstTy.getSizeInBits();
1156               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1157 
1158               // Split extloads.
1159               if (DstSize > MemSize)
1160                 return std::make_pair(0, LLT::scalar(MemSize));
1161 
1162               if (!isPowerOf2_32(DstSize)) {
1163                 // We're probably decomposing an odd sized store. Try to split
1164                 // to the widest type. TODO: Account for alignment. As-is it
1165                 // should be OK, since the new parts will be further legalized.
1166                 unsigned FloorSize = PowerOf2Floor(DstSize);
1167                 return std::make_pair(0, LLT::scalar(FloorSize));
1168               }
1169 
1170               if (DstSize > 32 && (DstSize % 32 != 0)) {
1171                 // FIXME: Need a way to specify non-extload of larger size if
1172                 // suitably aligned.
1173                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174               }
1175 
1176               unsigned MaxSize = maxSizeForAddrSpace(ST,
1177                                                      PtrTy.getAddressSpace(),
1178                                                      Op == G_LOAD);
1179               if (MemSize > MaxSize)
1180                 return std::make_pair(0, LLT::scalar(MaxSize));
1181 
1182               unsigned Align = Query.MMODescrs[0].AlignInBits;
1183               return std::make_pair(0, LLT::scalar(Align));
1184             })
1185         .fewerElementsIf(
1186             [=](const LegalityQuery &Query) -> bool {
1187               return Query.Types[0].isVector() &&
1188                      needToSplitMemOp(Query, Op == G_LOAD);
1189             },
1190             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1191               const LLT DstTy = Query.Types[0];
1192               const LLT PtrTy = Query.Types[1];
1193 
1194               LLT EltTy = DstTy.getElementType();
1195               unsigned MaxSize = maxSizeForAddrSpace(ST,
1196                                                      PtrTy.getAddressSpace(),
1197                                                      Op == G_LOAD);
1198 
1199               // FIXME: Handle widened to power of 2 results better. This ends
1200               // up scalarizing.
1201               // FIXME: 3 element stores scalarized on SI
1202 
1203               // Split if it's too large for the address space.
1204               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1211                 }
1212 
1213                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1214 
1215                 // FIXME: Refine when odd breakdowns handled
1216                 // The scalars will need to be re-legalized.
1217                 if (NumPieces == 1 || NumPieces >= NumElts ||
1218                     NumElts % NumPieces != 0)
1219                   return std::make_pair(0, EltTy);
1220 
1221                 return std::make_pair(0,
1222                                       LLT::vector(NumElts / NumPieces, EltTy));
1223               }
1224 
1225               // FIXME: We could probably handle weird extending loads better.
1226               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1239               }
1240 
1241               // Need to split because of alignment.
1242               unsigned Align = Query.MMODescrs[0].AlignInBits;
1243               if (EltSize > Align &&
1244                   (EltSize / Align < DstTy.getNumElements())) {
1245                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1246               }
1247 
1248               // May need relegalization for the scalars.
1249               return std::make_pair(0, EltTy);
1250             })
1251     .lowerIfMemSizeNotPow2()
1252     .minScalar(0, S32);
1253 
1254     if (IsStore)
1255       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1256 
1257     Actions
1258         .widenScalarToNextPow2(0)
1259         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1260         .lower();
1261   }
1262 
1263   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1264                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1265                                                   {S32, GlobalPtr, 16, 2 * 8},
1266                                                   {S32, LocalPtr, 8, 8},
1267                                                   {S32, LocalPtr, 16, 16},
1268                                                   {S32, PrivatePtr, 8, 8},
1269                                                   {S32, PrivatePtr, 16, 16},
1270                                                   {S32, ConstantPtr, 8, 8},
1271                                                   {S32, ConstantPtr, 16, 2 * 8}});
1272   if (ST.hasFlatAddressSpace()) {
1273     ExtLoads.legalForTypesWithMemDesc(
1274         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1275   }
1276 
1277   ExtLoads.clampScalar(0, S32, S32)
1278           .widenScalarToNextPow2(0)
1279           .unsupportedIfMemSizeNotPow2()
1280           .lower();
1281 
1282   auto &Atomics = getActionDefinitionsBuilder(
1283     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1284      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1285      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1286      G_ATOMICRMW_UMIN})
1287     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1288                {S64, GlobalPtr}, {S64, LocalPtr},
1289                {S32, RegionPtr}, {S64, RegionPtr}});
1290   if (ST.hasFlatAddressSpace()) {
1291     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1292   }
1293 
1294   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1295   if (ST.hasLDSFPAtomics()) {
1296     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1297     if (ST.hasGFX90AInsts())
1298       Atomic.legalFor({{S64, LocalPtr}});
1299   }
1300   if (ST.hasAtomicFaddInsts())
1301     Atomic.legalFor({{S32, GlobalPtr}});
1302 
1303   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1304   // demarshalling
1305   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1306     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1307                 {S32, FlatPtr}, {S64, FlatPtr}})
1308     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1309                {S32, RegionPtr}, {S64, RegionPtr}});
1310   // TODO: Pointer types, any 32-bit or 64-bit vector
1311 
1312   // Condition should be s32 for scalar, s1 for vector.
1313   getActionDefinitionsBuilder(G_SELECT)
1314     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1315           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1316           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1317     .clampScalar(0, S16, S64)
1318     .scalarize(1)
1319     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1320     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1321     .clampMaxNumElements(0, S32, 2)
1322     .clampMaxNumElements(0, LocalPtr, 2)
1323     .clampMaxNumElements(0, PrivatePtr, 2)
1324     .scalarize(0)
1325     .widenScalarToNextPow2(0)
1326     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
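
  // For example (sketch): a select of <2 x s32> with an s32 condition is
  // directly legal, a select of s8 is clamped up to s16, and a vector
  // condition causes the select to be scalarized first.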
1327 
1328   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1329   // be more flexible with the shift amount type.
1330   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1331     .legalFor({{S32, S32}, {S64, S32}});
1332   if (ST.has16BitInsts()) {
1333     if (ST.hasVOP3PInsts()) {
1334       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1335             .clampMaxNumElements(0, S16, 2);
1336     } else
1337       Shifts.legalFor({{S16, S16}});
1338 
1339     // TODO: Support 16-bit shift amounts for all types
1340     Shifts.widenScalarIf(
1341       [=](const LegalityQuery &Query) {
1342         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1343         // 32-bit amount.
1344         const LLT ValTy = Query.Types[0];
1345         const LLT AmountTy = Query.Types[1];
1346         return ValTy.getSizeInBits() <= 16 &&
1347                AmountTy.getSizeInBits() < 16;
1348       }, changeTo(1, S16));
1349     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1350     Shifts.clampScalar(1, S32, S32);
1351     Shifts.clampScalar(0, S16, S64);
1352     Shifts.widenScalarToNextPow2(0, 16);
1353 
1354     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1355       .minScalar(0, S16)
1356       .scalarize(0)
1357       .lower();
1358   } else {
1359     // Make sure we legalize the shift amount type first, as the general
1360     // expansion for the shifted type will produce much worse code if it hasn't
1361     // been truncated already.
1362     Shifts.clampScalar(1, S32, S32);
1363     Shifts.clampScalar(0, S32, S64);
1364     Shifts.widenScalarToNextPow2(0, 32);
1365 
1366     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1367       .minScalar(0, S32)
1368       .scalarize(0)
1369       .lower();
1370   }
1371   Shifts.scalarize(0);
1372 
1373   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1374     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1375     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1376     unsigned IdxTypeIdx = 2;
1377 
1378     getActionDefinitionsBuilder(Op)
1379       .customIf([=](const LegalityQuery &Query) {
1380           const LLT EltTy = Query.Types[EltTypeIdx];
1381           const LLT VecTy = Query.Types[VecTypeIdx];
1382           const LLT IdxTy = Query.Types[IdxTypeIdx];
1383           const unsigned EltSize = EltTy.getSizeInBits();
1384           return (EltSize == 32 || EltSize == 64) &&
1385                   VecTy.getSizeInBits() % 32 == 0 &&
1386                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1387                   IdxTy.getSizeInBits() == 32;
1388         })
1389       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1390                  bitcastToVectorElement32(VecTypeIdx))
1391       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1392       .bitcastIf(
1393         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1394         [=](const LegalityQuery &Query) {
1395           // For > 64-bit element types, try to turn this into a 64-bit
1396           // element vector since we may be able to do better indexing
1397           // if this is scalar. If not, fall back to 32.
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const unsigned DstEltSize = EltTy.getSizeInBits();
1401           const unsigned VecSize = VecTy.getSizeInBits();
1402 
1403           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1404           return std::make_pair(
1405             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1406         })
1407       .clampScalar(EltTypeIdx, S32, S64)
1408       .clampScalar(VecTypeIdx, S32, S64)
1409       .clampScalar(IdxTypeIdx, S32, S32)
1410       .clampMaxNumElements(VecTypeIdx, S32, 32)
1411       // TODO: Clamp elements for 64-bit vectors?
1412       // It should only be necessary with variable indexes.
1413       // As a last resort, lower to the stack
1414       .lower();
1415   }
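
  // e.g. (illustrative): extracting an s32 element from <8 x s32> with an s32
  // index hits the custom lowering above, while a <4 x s16> extract is first
  // bitcast to operate on <2 x s32>.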
1416 
1417   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1418     .unsupportedIf([=](const LegalityQuery &Query) {
1419         const LLT &EltTy = Query.Types[1].getElementType();
1420         return Query.Types[0] != EltTy;
1421       });
1422 
1423   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1424     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1425     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1426 
1427     // FIXME: Doesn't handle extract of illegal sizes.
1428     getActionDefinitionsBuilder(Op)
1429       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1430       // FIXME: Multiples of 16 should not be legal.
1431       .legalIf([=](const LegalityQuery &Query) {
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           const LLT LitTy = Query.Types[LitTyIdx];
1434           return (BigTy.getSizeInBits() % 32 == 0) &&
1435                  (LitTy.getSizeInBits() % 16 == 0);
1436         })
1437       .widenScalarIf(
1438         [=](const LegalityQuery &Query) {
1439           const LLT BigTy = Query.Types[BigTyIdx];
1440           return (BigTy.getScalarSizeInBits() < 16);
1441         },
1442         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1443       .widenScalarIf(
1444         [=](const LegalityQuery &Query) {
1445           const LLT LitTy = Query.Types[LitTyIdx];
1446           return (LitTy.getScalarSizeInBits() < 16);
1447         },
1448         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1449       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1450       .widenScalarToNextPow2(BigTyIdx, 32);
1451 
1452   }
1453 
1454   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1455     .legalForCartesianProduct(AllS32Vectors, {S32})
1456     .legalForCartesianProduct(AllS64Vectors, {S64})
1457     .clampNumElements(0, V16S32, V32S32)
1458     .clampNumElements(0, V2S64, V16S64)
1459     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1460 
1461   if (ST.hasScalarPackInsts()) {
1462     BuildVector
1463       // FIXME: Should probably widen s1 vectors straight to s32
1464       .minScalarOrElt(0, S16)
1465       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1466       .minScalar(1, S32);
1467 
1468     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1469       .legalFor({V2S16, S32})
1470       .lower();
1471     BuildVector.minScalarOrElt(0, S32);
1472   } else {
1473     BuildVector.customFor({V2S16, S16});
1474     BuildVector.minScalarOrElt(0, S32);
1475 
1476     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1477       .customFor({V2S16, S32})
1478       .lower();
1479   }
1480 
1481   BuildVector.legalIf(isRegisterType(0));
1482 
1483   // FIXME: Clamp maximum size
1484   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1485     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1486     .clampMaxNumElements(0, S32, 32)
1487     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1488     .clampMaxNumElements(0, S16, 64);
1489 
1490   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1491   // pre-legalize.
1492   if (ST.hasVOP3PInsts()) {
1493     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1494       .customFor({V2S16, V2S16})
1495       .lower();
1496   } else
1497     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1498 
1499   // Merge/Unmerge
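       // e.g. %z:_(s64) = G_MERGE_VALUES %x:_(s32), %y:_(s32)
       //      %x:_(s32), %y:_(s32) = G_UNMERGE_VALUES %z:_(s64)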
1500   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1501     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1502     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1503 
1504     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1505       const LLT Ty = Query.Types[TypeIdx];
1506       if (Ty.isVector()) {
1507         const LLT &EltTy = Ty.getElementType();
1508         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1509           return true;
1510         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1511           return true;
1512       }
1513       return false;
1514     };
1515 
1516     auto &Builder = getActionDefinitionsBuilder(Op)
1517       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1518       .lowerFor({{S16, V2S16}})
1519       .lowerIf([=](const LegalityQuery &Query) {
1520           const LLT BigTy = Query.Types[BigTyIdx];
1521           return BigTy.getSizeInBits() == 32;
1522         })
1523       // Try to widen to s16 first for small types.
1524       // TODO: Only do this on targets with legal s16 shifts
1525       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1526       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1527       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1528       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1529                            elementTypeIs(1, S16)),
1530                        changeTo(1, V2S16))
1531       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1532       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1533       // valid.
1534       .clampScalar(LitTyIdx, S32, S512)
1535       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1536       // Break up vectors with weird elements into scalars
1537       .fewerElementsIf(
1538         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1539         scalarize(0))
1540       .fewerElementsIf(
1541         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1542         scalarize(1))
1543       .clampScalar(BigTyIdx, S32, MaxScalar);
1544 
1545     if (Op == G_MERGE_VALUES) {
1546       Builder.widenScalarIf(
1547         // TODO: Use 16-bit shifts if legal for 8-bit values?
1548         [=](const LegalityQuery &Query) {
1549           const LLT Ty = Query.Types[LitTyIdx];
1550           return Ty.getSizeInBits() < 32;
1551         },
1552         changeTo(LitTyIdx, S32));
1553     }
1554 
1555     Builder.widenScalarIf(
1556       [=](const LegalityQuery &Query) {
1557         const LLT Ty = Query.Types[BigTyIdx];
1558         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1559           Ty.getSizeInBits() % 16 != 0;
1560       },
1561       [=](const LegalityQuery &Query) {
1562         // Pick the next power of 2, or a multiple of 64 for sizes over
1563         // 128, whichever is smaller.
1564         const LLT &Ty = Query.Types[BigTyIdx];
1565         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1566         if (NewSizeInBits >= 256) {
1567           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1568           if (RoundedTo < NewSizeInBits)
1569             NewSizeInBits = RoundedTo;
1570         }
1571         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1572       })
1573       // Any vectors left are the wrong size. Scalarize them.
1574       .scalarize(0)
1575       .scalarize(1);
1576   }
1577 
1578   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1579   // RegBankSelect.
1580   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1581     .legalFor({{S32}, {S64}});
1582 
1583   if (ST.hasVOP3PInsts()) {
1584     SextInReg.lowerFor({{V2S16}})
1585       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1586       // get more vector shift opportunities, since we'll get those when
1587       // expanded.
1588       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1589   } else if (ST.has16BitInsts()) {
1590     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1591   } else {
1592     // Prefer to promote to s32 before lowering if we don't have 16-bit
1593     // shifts. This avoids a lot of intermediate truncate and extend operations.
1594     SextInReg.lowerFor({{S32}, {S64}});
1595   }
1596 
1597   SextInReg
1598     .scalarize(0)
1599     .clampScalar(0, S32, S64)
1600     .lower();
1601 
1602   // TODO: Only try to form v2s16 with legal packed instructions.
1603   getActionDefinitionsBuilder(G_FSHR)
1604     .legalFor({{S32, S32}})
1605     .lowerFor({{V2S16, V2S16}})
1606     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1607     .scalarize(0)
1608     .lower();
1609 
1610   if (ST.hasVOP3PInsts()) {
1611     getActionDefinitionsBuilder(G_FSHL)
1612       .lowerFor({{V2S16, V2S16}})
1613       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1614       .scalarize(0)
1615       .lower();
1616   } else {
1617     getActionDefinitionsBuilder(G_FSHL)
1618       .scalarize(0)
1619       .lower();
1620   }
1621 
1622   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1623     .legalFor({S64});
1624 
1625   getActionDefinitionsBuilder(G_FENCE)
1626     .alwaysLegal();
1627 
1628   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1629       .scalarize(0)
1630       .minScalar(0, S32)
1631       .lower();
1632 
1633   getActionDefinitionsBuilder({
1634       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1635       G_FCOPYSIGN,
1636 
1637       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1638       G_ATOMICRMW_NAND,
1639       G_ATOMICRMW_FSUB,
1640       G_READ_REGISTER,
1641       G_WRITE_REGISTER,
1642 
1643       G_SADDO, G_SSUBO,
1644 
1645        // TODO: Implement
1646       G_FMINIMUM, G_FMAXIMUM}).lower();
1647 
1648   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1649         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1650         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1651     .unsupported();
1652 
1653   computeTables();
1654   verify(*ST.getInstrInfo());
1655 }
1656 
1657 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1658                                          MachineInstr &MI) const {
1659   MachineIRBuilder &B = Helper.MIRBuilder;
1660   MachineRegisterInfo &MRI = *B.getMRI();
1661 
1662   switch (MI.getOpcode()) {
1663   case TargetOpcode::G_ADDRSPACE_CAST:
1664     return legalizeAddrSpaceCast(MI, MRI, B);
1665   case TargetOpcode::G_FRINT:
1666     return legalizeFrint(MI, MRI, B);
1667   case TargetOpcode::G_FCEIL:
1668     return legalizeFceil(MI, MRI, B);
1669   case TargetOpcode::G_FREM:
1670     return legalizeFrem(MI, MRI, B);
1671   case TargetOpcode::G_INTRINSIC_TRUNC:
1672     return legalizeIntrinsicTrunc(MI, MRI, B);
1673   case TargetOpcode::G_SITOFP:
1674     return legalizeITOFP(MI, MRI, B, true);
1675   case TargetOpcode::G_UITOFP:
1676     return legalizeITOFP(MI, MRI, B, false);
1677   case TargetOpcode::G_FPTOSI:
1678     return legalizeFPTOI(MI, MRI, B, true);
1679   case TargetOpcode::G_FPTOUI:
1680     return legalizeFPTOI(MI, MRI, B, false);
1681   case TargetOpcode::G_FMINNUM:
1682   case TargetOpcode::G_FMAXNUM:
1683   case TargetOpcode::G_FMINNUM_IEEE:
1684   case TargetOpcode::G_FMAXNUM_IEEE:
1685     return legalizeMinNumMaxNum(Helper, MI);
1686   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1687     return legalizeExtractVectorElt(MI, MRI, B);
1688   case TargetOpcode::G_INSERT_VECTOR_ELT:
1689     return legalizeInsertVectorElt(MI, MRI, B);
1690   case TargetOpcode::G_SHUFFLE_VECTOR:
1691     return legalizeShuffleVector(MI, MRI, B);
1692   case TargetOpcode::G_FSIN:
1693   case TargetOpcode::G_FCOS:
1694     return legalizeSinCos(MI, MRI, B);
1695   case TargetOpcode::G_GLOBAL_VALUE:
1696     return legalizeGlobalValue(MI, MRI, B);
1697   case TargetOpcode::G_LOAD:
1698     return legalizeLoad(Helper, MI);
1699   case TargetOpcode::G_FMAD:
1700     return legalizeFMad(MI, MRI, B);
1701   case TargetOpcode::G_FDIV:
1702     return legalizeFDIV(MI, MRI, B);
1703   case TargetOpcode::G_UDIV:
1704   case TargetOpcode::G_UREM:
1705     return legalizeUDIV_UREM(MI, MRI, B);
1706   case TargetOpcode::G_SDIV:
1707   case TargetOpcode::G_SREM:
1708     return legalizeSDIV_SREM(MI, MRI, B);
1709   case TargetOpcode::G_ATOMIC_CMPXCHG:
1710     return legalizeAtomicCmpXChg(MI, MRI, B);
1711   case TargetOpcode::G_FLOG:
1712     return legalizeFlog(MI, B, numbers::ln2f);
1713   case TargetOpcode::G_FLOG10:
1714     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1715   case TargetOpcode::G_FEXP:
1716     return legalizeFExp(MI, B);
1717   case TargetOpcode::G_FPOW:
1718     return legalizeFPow(MI, B);
1719   case TargetOpcode::G_FFLOOR:
1720     return legalizeFFloor(MI, MRI, B);
1721   case TargetOpcode::G_BUILD_VECTOR:
1722     return legalizeBuildVector(MI, MRI, B);
1723   default:
1724     return false;
1725   }
1726 
1727   llvm_unreachable("expected switch to return");
1728 }
1729 
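     // Return the high 32 bits of the aperture base for the given LDS/private
     // address space, either read from the aperture registers via s_getreg or
     // loaded from the amd_queue_t addressed by the queue pointer.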
1730 Register AMDGPULegalizerInfo::getSegmentAperture(
1731   unsigned AS,
1732   MachineRegisterInfo &MRI,
1733   MachineIRBuilder &B) const {
1734   MachineFunction &MF = B.getMF();
1735   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1736   const LLT S32 = LLT::scalar(32);
1737 
1738   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1739 
1740   if (ST.hasApertureRegs()) {
1741     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1742     // getreg.
1743     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1744         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1745         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1746     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1747         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1748         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1749     unsigned Encoding =
1750         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1751         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1752         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1753 
1754     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1755 
1756     B.buildInstr(AMDGPU::S_GETREG_B32)
1757       .addDef(GetReg)
1758       .addImm(Encoding);
1759     MRI.setType(GetReg, S32);
1760 
1761     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1762     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1763   }
1764 
1765   Register QueuePtr = MRI.createGenericVirtualRegister(
1766     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1767 
1768   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1769     return Register();
1770 
1771   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1772   // private_segment_aperture_base_hi.
1773   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1774 
1775   // TODO: can we be smarter about machine pointer info?
1776   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1777   MachineMemOperand *MMO = MF.getMachineMemOperand(
1778       PtrInfo,
1779       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1780           MachineMemOperand::MOInvariant,
1781       4, commonAlignment(Align(64), StructOffset));
1782 
1783   Register LoadAddr;
1784 
1785   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1786   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1787 }
1788 
1789 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1790   MachineInstr &MI, MachineRegisterInfo &MRI,
1791   MachineIRBuilder &B) const {
1792   MachineFunction &MF = B.getMF();
1793 
1794   const LLT S32 = LLT::scalar(32);
1795   Register Dst = MI.getOperand(0).getReg();
1796   Register Src = MI.getOperand(1).getReg();
1797 
1798   LLT DstTy = MRI.getType(Dst);
1799   LLT SrcTy = MRI.getType(Src);
1800   unsigned DestAS = DstTy.getAddressSpace();
1801   unsigned SrcAS = SrcTy.getAddressSpace();
1802 
1803   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1804   // vector element.
1805   assert(!DstTy.isVector());
1806 
1807   const AMDGPUTargetMachine &TM
1808     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1809 
1810   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1811     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1812     return true;
1813   }
1814 
1815   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1816     // Truncate.
1817     B.buildExtract(Dst, Src, 0);
1818     MI.eraseFromParent();
1819     return true;
1820   }
1821 
1822   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1823     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1824     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1825 
1826     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1827     // another. Merge operands are required to be the same type, but creating an
1828     // extra ptrtoint would be kind of pointless.
1829     auto HighAddr = B.buildConstant(
1830       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1831     B.buildMerge(Dst, {Src, HighAddr});
1832     MI.eraseFromParent();
1833     return true;
1834   }
1835 
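       // Flat -> local/private: keep the low 32 bits of the pointer, but map
       // the flat null value to the segment null value, i.e. roughly
       //   dst = (src == null) ? segment_null : trunc(src)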
1836   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1837     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1838            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1839     unsigned NullVal = TM.getNullPointerValue(DestAS);
1840 
1841     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1842     auto FlatNull = B.buildConstant(SrcTy, 0);
1843 
1844     // Extract low 32-bits of the pointer.
1845     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1846 
1847     auto CmpRes =
1848         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1849     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1850 
1851     MI.eraseFromParent();
1852     return true;
1853   }
1854 
1855   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1856     return false;
1857 
1858   if (!ST.hasFlatAddressSpace())
1859     return false;
1860 
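       // Local/private -> flat: build the 64-bit flat address from the 32-bit
       // segment offset and the aperture base, mapping the segment null value
       // to the flat null value, i.e. roughly
       //   dst = (src == segment_null) ? null : (aperture_hi << 32) | src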
1861   auto SegmentNull =
1862       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1863   auto FlatNull =
1864       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1865 
1866   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1867   if (!ApertureReg.isValid())
1868     return false;
1869 
1870   auto CmpRes =
1871       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1872 
1873   // Coerce the type of the low half of the result so we can use merge_values.
1874   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1875 
1876   // TODO: Should we allow mismatched types but matching sizes in merges to
1877   // avoid the ptrtoint?
1878   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1879   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1880 
1881   MI.eraseFromParent();
1882   return true;
1883 }
1884 
1885 bool AMDGPULegalizerInfo::legalizeFrint(
1886   MachineInstr &MI, MachineRegisterInfo &MRI,
1887   MachineIRBuilder &B) const {
1888   Register Src = MI.getOperand(1).getReg();
1889   LLT Ty = MRI.getType(Src);
1890   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1891 
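       // Round to nearest by adding and then subtracting 2^52 with the sign of
       // the input; at that magnitude the fadd discards the fraction bits.
       // Inputs whose magnitude is already >= 2^52 are integral and are
       // returned unchanged.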
1892   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1893   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1894 
1895   auto C1 = B.buildFConstant(Ty, C1Val);
1896   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1897 
1898   // TODO: Should this propagate fast-math-flags?
1899   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1900   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1901 
1902   auto C2 = B.buildFConstant(Ty, C2Val);
1903   auto Fabs = B.buildFAbs(Ty, Src);
1904 
1905   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1906   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFceil(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914 
1915   const LLT S1 = LLT::scalar(1);
1916   const LLT S64 = LLT::scalar(64);
1917 
1918   Register Src = MI.getOperand(1).getReg();
1919   assert(MRI.getType(Src) == S64);
1920 
1921   // result = trunc(src)
1922   // if (src > 0.0 && src != result)
1923   //   result += 1.0
1924 
1925   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1926 
1927   const auto Zero = B.buildFConstant(S64, 0.0);
1928   const auto One = B.buildFConstant(S64, 1.0);
1929   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1930   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1931   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1932   auto Add = B.buildSelect(S64, And, One, Zero);
1933 
1934   // TODO: Should this propagate fast-math-flags?
1935   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1936   return true;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeFrem(
1940   MachineInstr &MI, MachineRegisterInfo &MRI,
1941   MachineIRBuilder &B) const {
1942     Register DstReg = MI.getOperand(0).getReg();
1943     Register Src0Reg = MI.getOperand(1).getReg();
1944     Register Src1Reg = MI.getOperand(2).getReg();
1945     auto Flags = MI.getFlags();
1946     LLT Ty = MRI.getType(DstReg);
1947 
1948     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1949     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1950     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1951     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1952     MI.eraseFromParent();
1953     return true;
1954 }
1955 
1956 static MachineInstrBuilder extractF64Exponent(Register Hi,
1957                                               MachineIRBuilder &B) {
1958   const unsigned FractBits = 52;
1959   const unsigned ExpBits = 11;
1960   LLT S32 = LLT::scalar(32);
1961 
1962   auto Const0 = B.buildConstant(S32, FractBits - 32);
1963   auto Const1 = B.buildConstant(S32, ExpBits);
1964 
1965   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1966     .addUse(Hi)
1967     .addUse(Const0.getReg(0))
1968     .addUse(Const1.getReg(0));
1969 
1970   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1971 }
1972 
1973 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1974   MachineInstr &MI, MachineRegisterInfo &MRI,
1975   MachineIRBuilder &B) const {
1976   const LLT S1 = LLT::scalar(1);
1977   const LLT S32 = LLT::scalar(32);
1978   const LLT S64 = LLT::scalar(64);
1979 
1980   Register Src = MI.getOperand(1).getReg();
1981   assert(MRI.getType(Src) == S64);
1982 
1983   // TODO: Should this use extract since the low half is unused?
1984   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1985   Register Hi = Unmerge.getReg(1);
1986 
1987   // Extract the upper half, since this is where we will find the sign and
1988   // exponent.
1989   auto Exp = extractF64Exponent(Hi, B);
1990 
1991   const unsigned FractBits = 52;
1992 
1993   // Extract the sign bit.
1994   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1995   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1996 
1997   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1998 
1999   const auto Zero32 = B.buildConstant(S32, 0);
2000 
2001   // Extend back to 64-bits.
2002   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2003 
2004   auto Shr = B.buildAShr(S64, FractMask, Exp);
2005   auto Not = B.buildNot(S64, Shr);
2006   auto Tmp0 = B.buildAnd(S64, Src, Not);
2007   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2008 
2009   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2010   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2011 
2012   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2013   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2014   MI.eraseFromParent();
2015   return true;
2016 }
2017 
2018 bool AMDGPULegalizerInfo::legalizeITOFP(
2019   MachineInstr &MI, MachineRegisterInfo &MRI,
2020   MachineIRBuilder &B, bool Signed) const {
2021 
2022   Register Dst = MI.getOperand(0).getReg();
2023   Register Src = MI.getOperand(1).getReg();
2024 
2025   const LLT S64 = LLT::scalar(64);
2026   const LLT S32 = LLT::scalar(32);
2027 
2028   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2029 
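       // Convert by halves, roughly:
       //   result = ldexp((fp)hi, 32) + (fp)(uint)lo
       // where only the high half uses a signed conversion for G_SITOFP.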
2030   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2031 
2032   auto CvtHi = Signed ?
2033     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2034     B.buildUITOFP(S64, Unmerge.getReg(1));
2035 
2036   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2037 
2038   auto ThirtyTwo = B.buildConstant(S32, 32);
2039   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2040     .addUse(CvtHi.getReg(0))
2041     .addUse(ThirtyTwo.getReg(0));
2042 
2043   // TODO: Should this propagate fast-math-flags?
2044   B.buildFAdd(Dst, LdExp, CvtLo);
2045   MI.eraseFromParent();
2046   return true;
2047 }
2048 
2049 // TODO: Copied from DAG implementation. Verify logic and document how this
2050 // actually works.
2051 bool AMDGPULegalizerInfo::legalizeFPTOI(
2052   MachineInstr &MI, MachineRegisterInfo &MRI,
2053   MachineIRBuilder &B, bool Signed) const {
2054 
2055   Register Dst = MI.getOperand(0).getReg();
2056   Register Src = MI.getOperand(1).getReg();
2057 
2058   const LLT S64 = LLT::scalar(64);
2059   const LLT S32 = LLT::scalar(32);
2060 
2061   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2062 
2063   unsigned Flags = MI.getFlags();
2064 
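       // Roughly: split trunc(x) as hi = floor(trunc(x) * 2^-32) and
       // lo = trunc(x) - hi * 2^32 (computed with an fma against K1 = -2^32),
       // convert each half to 32 bits, and merge them. K0 below is 2^-32.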
2065   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2066   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2067   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2068 
2069   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2070   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2071   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2072 
2073   auto Hi = Signed ?
2074     B.buildFPTOSI(S32, FloorMul) :
2075     B.buildFPTOUI(S32, FloorMul);
2076   auto Lo = B.buildFPTOUI(S32, Fma);
2077 
2078   B.buildMerge(Dst, { Lo, Hi });
2079   MI.eraseFromParent();
2080 
2081   return true;
2082 }
2083 
2084 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2085                                                MachineInstr &MI) const {
2086   MachineFunction &MF = Helper.MIRBuilder.getMF();
2087   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2088 
2089   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2090                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2091 
2092   // With ieee_mode disabled, the instructions have the correct behavior
2093   // already for G_FMINNUM/G_FMAXNUM
2094   if (!MFI->getMode().IEEE)
2095     return !IsIEEEOp;
2096 
2097   if (IsIEEEOp)
2098     return true;
2099 
2100   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2104   MachineInstr &MI, MachineRegisterInfo &MRI,
2105   MachineIRBuilder &B) const {
2106   // TODO: Should move some of this into LegalizerHelper.
2107 
2108   // TODO: Promote dynamic indexing of s16 to s32
2109 
2110   // FIXME: Artifact combiner probably should have replaced the truncated
2111   // constant before this, so we shouldn't need
2112   // getConstantVRegValWithLookThrough.
2113   Optional<ValueAndVReg> MaybeIdxVal =
2114       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2115   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2116     return true;
2117   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2118 
2119   Register Dst = MI.getOperand(0).getReg();
2120   Register Vec = MI.getOperand(1).getReg();
2121 
2122   LLT VecTy = MRI.getType(Vec);
2123   LLT EltTy = VecTy.getElementType();
2124   assert(EltTy == MRI.getType(Dst));
2125 
2126   if (IdxVal < VecTy.getNumElements())
2127     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2128   else
2129     B.buildUndef(Dst);
2130 
2131   MI.eraseFromParent();
2132   return true;
2133 }
2134 
2135 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2136   MachineInstr &MI, MachineRegisterInfo &MRI,
2137   MachineIRBuilder &B) const {
2138   // TODO: Should move some of this into LegalizerHelper.
2139 
2140   // TODO: Promote dynamic indexing of s16 to s32
2141 
2142   // FIXME: Artifact combiner probably should have replaced the truncated
2143   // constant before this, so we shouldn't need
2144   // getConstantVRegValWithLookThrough.
2145   Optional<ValueAndVReg> MaybeIdxVal =
2146       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2147   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2148     return true;
2149 
2150   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2151   Register Dst = MI.getOperand(0).getReg();
2152   Register Vec = MI.getOperand(1).getReg();
2153   Register Ins = MI.getOperand(2).getReg();
2154 
2155   LLT VecTy = MRI.getType(Vec);
2156   LLT EltTy = VecTy.getElementType();
2157   assert(EltTy == MRI.getType(Ins));
2158 
2159   if (IdxVal < VecTy.getNumElements())
2160     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2161   else
2162     B.buildUndef(Dst);
2163 
2164   MI.eraseFromParent();
2165   return true;
2166 }
2167 
2168 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   const LLT V2S16 = LLT::vector(2, 16);
2172 
2173   Register Dst = MI.getOperand(0).getReg();
2174   Register Src0 = MI.getOperand(1).getReg();
2175   LLT DstTy = MRI.getType(Dst);
2176   LLT SrcTy = MRI.getType(Src0);
2177 
2178   if (SrcTy == V2S16 && DstTy == V2S16 &&
2179       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2180     return true;
2181 
2182   MachineIRBuilder HelperBuilder(MI);
2183   GISelObserverWrapper DummyObserver;
2184   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2185   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2186 }
2187 
2188 bool AMDGPULegalizerInfo::legalizeSinCos(
2189   MachineInstr &MI, MachineRegisterInfo &MRI,
2190   MachineIRBuilder &B) const {
2191 
2192   Register DstReg = MI.getOperand(0).getReg();
2193   Register SrcReg = MI.getOperand(1).getReg();
2194   LLT Ty = MRI.getType(DstReg);
2195   unsigned Flags = MI.getFlags();
2196 
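       // The hardware sin/cos take an input in units of full cycles, so scale
       // the argument by 1/(2*pi) first; subtargets with a reduced valid input
       // range additionally wrap the scaled value with amdgcn.fract.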
2197   Register TrigVal;
2198   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2199   if (ST.hasTrigReducedRange()) {
2200     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2201     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2202       .addUse(MulVal.getReg(0))
2203       .setMIFlags(Flags).getReg(0);
2204   } else
2205     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2206 
2207   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2208     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2209   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2210     .addUse(TrigVal)
2211     .setMIFlags(Flags);
2212   MI.eraseFromParent();
2213   return true;
2214 }
2215 
2216 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2217                                                   MachineIRBuilder &B,
2218                                                   const GlobalValue *GV,
2219                                                   int64_t Offset,
2220                                                   unsigned GAFlags) const {
2221   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2222   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2223   // to the following code sequence:
2224   //
2225   // For constant address space:
2226   //   s_getpc_b64 s[0:1]
2227   //   s_add_u32 s0, s0, $symbol
2228   //   s_addc_u32 s1, s1, 0
2229   //
2230   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2231   //   a fixup or relocation is emitted to replace $symbol with a literal
2232   //   constant, which is a pc-relative offset from the encoding of the $symbol
2233   //   operand to the global variable.
2234   //
2235   // For global address space:
2236   //   s_getpc_b64 s[0:1]
2237   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2238   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2239   //
2240   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2241   //   fixups or relocations are emitted to replace $symbol@*@lo and
2242   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2243   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2244   //   operand to the global variable.
2245   //
2246   // What we want here is an offset from the value returned by s_getpc
2247   // (which is the address of the s_add_u32 instruction) to the global
2248   // variable, but since the encoding of $symbol starts 4 bytes after the start
2249   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2250   // small. This requires us to add 4 to the global variable offset in order to
2251   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2252   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2253   // instruction.
2254 
2255   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2256 
2257   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2258     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2259 
2260   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2261     .addDef(PCReg);
2262 
2263   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2264   if (GAFlags == SIInstrInfo::MO_NONE)
2265     MIB.addImm(0);
2266   else
2267     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2268 
2269   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2270 
2271   if (PtrTy.getSizeInBits() == 32)
2272     B.buildExtract(DstReg, PCReg, 0);
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2277   MachineInstr &MI, MachineRegisterInfo &MRI,
2278   MachineIRBuilder &B) const {
2279   Register DstReg = MI.getOperand(0).getReg();
2280   LLT Ty = MRI.getType(DstReg);
2281   unsigned AS = Ty.getAddressSpace();
2282 
2283   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2284   MachineFunction &MF = B.getMF();
2285   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2286 
2287   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2288     if (!MFI->isModuleEntryFunction()) {
2289       const Function &Fn = MF.getFunction();
2290       DiagnosticInfoUnsupported BadLDSDecl(
2291         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2292         DS_Warning);
2293       Fn.getContext().diagnose(BadLDSDecl);
2294 
2295       // We currently don't have a way to correctly allocate LDS objects that
2296       // aren't directly associated with a kernel. We do force inlining of
2297       // functions that use local objects. However, if these dead functions are
2298       // not eliminated, we don't want a compile time error. Just emit a warning
2299       // and a trap, since there should be no callable path here.
2300       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2301       B.buildUndef(DstReg);
2302       MI.eraseFromParent();
2303       return true;
2304     }
2305 
2306     // TODO: We could emit code to handle the initialization somewhere.
2307     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2308       const SITargetLowering *TLI = ST.getTargetLowering();
2309       if (!TLI->shouldUseLDSConstAddress(GV)) {
2310         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2311         return true; // Leave in place.
2312       }
2313 
2314       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2315         Type *Ty = GV->getValueType();
2316         // HIP uses an unsized array `extern __shared__ T s[]` (or a
2317         // similar zero-sized type in other languages) to declare dynamic
2318         // shared memory whose size is not known at compile time. Such
2319         // variables are allocated by the runtime and placed directly after
2320         // the statically allocated ones, so they all share the same offset.
2321         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2322           // Adjust alignment for that dynamic shared memory array.
2323           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2324           LLT S32 = LLT::scalar(32);
2325           auto Sz =
2326               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2327           B.buildIntToPtr(DstReg, Sz);
2328           MI.eraseFromParent();
2329           return true;
2330         }
2331       }
2332 
2333       B.buildConstant(
2334           DstReg,
2335           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2336       MI.eraseFromParent();
2337       return true;
2338     }
2339 
2340     const Function &Fn = MF.getFunction();
2341     DiagnosticInfoUnsupported BadInit(
2342       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2343     Fn.getContext().diagnose(BadInit);
2344     return true;
2345   }
2346 
2347   const SITargetLowering *TLI = ST.getTargetLowering();
2348 
2349   if (TLI->shouldEmitFixup(GV)) {
2350     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2351     MI.eraseFromParent();
2352     return true;
2353   }
2354 
2355   if (TLI->shouldEmitPCReloc(GV)) {
2356     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2357     MI.eraseFromParent();
2358     return true;
2359   }
2360 
2361   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2363 
2364   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2365       MachinePointerInfo::getGOT(MF),
2366       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2367           MachineMemOperand::MOInvariant,
2368       8 /*Size*/, Align(8));
2369 
2370   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2371 
2372   if (Ty.getSizeInBits() == 32) {
2373     // Truncate if this is a 32-bit constant address.
2374     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2375     B.buildExtract(DstReg, Load, 0);
2376   } else
2377     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2378 
2379   MI.eraseFromParent();
2380   return true;
2381 }
2382 
2383 static LLT widenToNextPowerOf2(LLT Ty) {
2384   if (Ty.isVector())
2385     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2386   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2387 }
2388 
2389 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2390                                        MachineInstr &MI) const {
2391   MachineIRBuilder &B = Helper.MIRBuilder;
2392   MachineRegisterInfo &MRI = *B.getMRI();
2393   GISelChangeObserver &Observer = Helper.Observer;
2394 
2395   Register PtrReg = MI.getOperand(1).getReg();
2396   LLT PtrTy = MRI.getType(PtrReg);
2397   unsigned AddrSpace = PtrTy.getAddressSpace();
2398 
2399   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2400     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2401     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2402     Observer.changingInstr(MI);
2403     MI.getOperand(1).setReg(Cast.getReg(0));
2404     Observer.changedInstr(MI);
2405     return true;
2406   }
2407 
2408   Register ValReg = MI.getOperand(0).getReg();
2409   LLT ValTy = MRI.getType(ValReg);
2410 
2411   MachineMemOperand *MMO = *MI.memoperands_begin();
2412   const unsigned ValSize = ValTy.getSizeInBits();
2413   const unsigned MemSize = 8 * MMO->getSize();
2414   const Align MemAlign = MMO->getAlign();
2415   const unsigned AlignInBits = 8 * MemAlign.value();
2416 
2417   // Widen non-power-of-2 loads to the alignment if needed
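       // e.g. a sufficiently aligned <3 x s32> load may be widened to a
       // <4 x s32> load, with the extra element dropped via G_EXTRACT (or via
       // an unmerge for types like <3 x s16> that aren't nice register types).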
2418   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2419     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2420 
2421     // This was already the correct extending load result type, so just adjust
2422     // the memory type.
2423     if (WideMemSize == ValSize) {
2424       MachineFunction &MF = B.getMF();
2425 
2426       MachineMemOperand *WideMMO =
2427           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2428       Observer.changingInstr(MI);
2429       MI.setMemRefs(MF, {WideMMO});
2430       Observer.changedInstr(MI);
2431       return true;
2432     }
2433 
2434     // Don't bother handling an edge case that should probably never be produced.
2435     if (ValSize > WideMemSize)
2436       return false;
2437 
2438     LLT WideTy = widenToNextPowerOf2(ValTy);
2439 
2440     Register WideLoad;
2441     if (!WideTy.isVector()) {
2442       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2443       B.buildTrunc(ValReg, WideLoad).getReg(0);
2444     } else {
2445       // Extract the subvector.
2446 
2447       if (isRegisterType(ValTy)) {
2448         // If this a case where G_EXTRACT is legal, use it.
2449         // (e.g. <3 x s32> -> <4 x s32>)
2450         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2451         B.buildExtract(ValReg, WideLoad, 0);
2452       } else {
2453         // For cases where the widened type isn't a nice register value, unmerge
2454         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2455         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2456         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2457         B.setInsertPt(B.getMBB(), MI.getIterator());
2458         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2459       }
2460     }
2461 
2462     MI.eraseFromParent();
2463     return true;
2464   }
2465 
2466   return false;
2467 }
2468 
2469 bool AMDGPULegalizerInfo::legalizeFMad(
2470   MachineInstr &MI, MachineRegisterInfo &MRI,
2471   MachineIRBuilder &B) const {
2472   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2473   assert(Ty.isScalar());
2474 
2475   MachineFunction &MF = B.getMF();
2476   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2477 
2478   // TODO: Always legal with future ftz flag.
2479   // FIXME: Do we need just output?
2480   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2481     return true;
2482   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2483     return true;
2484 
2485   MachineIRBuilder HelperBuilder(MI);
2486   GISelObserverWrapper DummyObserver;
2487   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2488   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2489 }
2490 
2491 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2492   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2493   Register DstReg = MI.getOperand(0).getReg();
2494   Register PtrReg = MI.getOperand(1).getReg();
2495   Register CmpVal = MI.getOperand(2).getReg();
2496   Register NewVal = MI.getOperand(3).getReg();
2497 
2498   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2499          "this should not have been custom lowered");
2500 
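       // The target cmpxchg pseudo takes the new value and the compare value
       // packed together as a two-element vector operand.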
2501   LLT ValTy = MRI.getType(CmpVal);
2502   LLT VecTy = LLT::vector(2, ValTy);
2503 
2504   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2505 
2506   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2507     .addDef(DstReg)
2508     .addUse(PtrReg)
2509     .addUse(PackedVal)
2510     .setMemRefs(MI.memoperands());
2511 
2512   MI.eraseFromParent();
2513   return true;
2514 }
2515 
2516 bool AMDGPULegalizerInfo::legalizeFlog(
2517   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2518   Register Dst = MI.getOperand(0).getReg();
2519   Register Src = MI.getOperand(1).getReg();
2520   LLT Ty = B.getMRI()->getType(Dst);
2521   unsigned Flags = MI.getFlags();
2522 
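       // log_b(x) is expanded as log2(x) * Log2BaseInverted; callers pass
       // ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.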
2523   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2524   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2525 
2526   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2527   MI.eraseFromParent();
2528   return true;
2529 }
2530 
2531 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2532                                        MachineIRBuilder &B) const {
2533   Register Dst = MI.getOperand(0).getReg();
2534   Register Src = MI.getOperand(1).getReg();
2535   unsigned Flags = MI.getFlags();
2536   LLT Ty = B.getMRI()->getType(Dst);
2537 
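       // exp(x) is expanded as exp2(x * log2(e)).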
2538   auto K = B.buildFConstant(Ty, numbers::log2e);
2539   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2540   B.buildFExp2(Dst, Mul, Flags);
2541   MI.eraseFromParent();
2542   return true;
2543 }
2544 
2545 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2546                                        MachineIRBuilder &B) const {
2547   Register Dst = MI.getOperand(0).getReg();
2548   Register Src0 = MI.getOperand(1).getReg();
2549   Register Src1 = MI.getOperand(2).getReg();
2550   unsigned Flags = MI.getFlags();
2551   LLT Ty = B.getMRI()->getType(Dst);
2552   const LLT S16 = LLT::scalar(16);
2553   const LLT S32 = LLT::scalar(32);
2554 
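       // pow(x, y) is expanded as exp2(log2(x) * y). The multiply uses the
       // legacy multiply intrinsic (0 * anything == 0), which presumably keeps
       // edge cases such as pow(0, 0) == 1 working through the expansion.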
2555   if (Ty == S32) {
2556     auto Log = B.buildFLog2(S32, Src0, Flags);
2557     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2558       .addUse(Log.getReg(0))
2559       .addUse(Src1)
2560       .setMIFlags(Flags);
2561     B.buildFExp2(Dst, Mul, Flags);
2562   } else if (Ty == S16) {
2563     // There's no f16 fmul_legacy, so we need to convert for it.
2564     auto Log = B.buildFLog2(S16, Src0, Flags);
2565     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2566     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2567     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2568       .addUse(Ext0.getReg(0))
2569       .addUse(Ext1.getReg(0))
2570       .setMIFlags(Flags);
2571 
2572     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2573   } else
2574     return false;
2575 
2576   MI.eraseFromParent();
2577   return true;
2578 }
2579 
2580 // Find a source register, ignoring any possible source modifiers.
2581 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2582   Register ModSrc = OrigSrc;
2583   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2584     ModSrc = SrcFNeg->getOperand(1).getReg();
2585     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2586       ModSrc = SrcFAbs->getOperand(1).getReg();
2587   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2588     ModSrc = SrcFAbs->getOperand(1).getReg();
2589   return ModSrc;
2590 }
2591 
2592 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2593                                          MachineRegisterInfo &MRI,
2594                                          MachineIRBuilder &B) const {
2595 
2596   const LLT S1 = LLT::scalar(1);
2597   const LLT S64 = LLT::scalar(64);
2598   Register Dst = MI.getOperand(0).getReg();
2599   Register OrigSrc = MI.getOperand(1).getReg();
2600   unsigned Flags = MI.getFlags();
2601   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2602          "this should not have been custom lowered");
2603 
2604   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2605   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2606   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2607   // V_FRACT bug is:
2608   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2609   //
2610   // Convert floor(x) to (x - fract(x))
2611 
2612   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2613     .addUse(OrigSrc)
2614     .setMIFlags(Flags);
2615 
2616   // Give source modifier matching some assistance before obscuring a foldable
2617   // pattern.
2618 
2619   // TODO: We can avoid the neg on the fract? The input sign to fract
2620   // shouldn't matter?
2621   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2622 
2623   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2624 
2625   Register Min = MRI.createGenericVirtualRegister(S64);
2626 
2627   // We don't need to concern ourselves with the snan handling difference, so
2628   // use the one which will directly select.
2629   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2630   if (MFI->getMode().IEEE)
2631     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2632   else
2633     B.buildFMinNum(Min, Fract, Const, Flags);
2634 
2635   Register CorrectedFract = Min;
2636   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2637     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2638     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2639   }
2640 
2641   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2642   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2643 
2644   MI.eraseFromParent();
2645   return true;
2646 }
2647 
2648 // Turn an illegal packed v2s16 build vector into bit operations.
2649 // TODO: This should probably be a bitcast action in LegalizerHelper.
2650 bool AMDGPULegalizerInfo::legalizeBuildVector(
2651   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2652   Register Dst = MI.getOperand(0).getReg();
2653   const LLT S32 = LLT::scalar(32);
2654   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2655 
2656   Register Src0 = MI.getOperand(1).getReg();
2657   Register Src1 = MI.getOperand(2).getReg();
2658   assert(MRI.getType(Src0) == LLT::scalar(16));
2659 
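       // e.g. %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16) becomes
       //   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
       //   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)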
2660   auto Merge = B.buildMerge(S32, {Src0, Src1});
2661   B.buildBitcast(Dst, Merge);
2662 
2663   MI.eraseFromParent();
2664   return true;
2665 }
2666 
2667 // Check that this is a G_XOR x, -1
2668 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2669   if (MI.getOpcode() != TargetOpcode::G_XOR)
2670     return false;
2671   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2672   return ConstVal && *ConstVal == -1;
2673 }
2674 
2675 // Return the use branch instruction, otherwise null if the usage is invalid.
2676 static MachineInstr *
2677 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2678                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2679   Register CondDef = MI.getOperand(0).getReg();
2680   if (!MRI.hasOneNonDBGUse(CondDef))
2681     return nullptr;
2682 
2683   MachineBasicBlock *Parent = MI.getParent();
2684   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2685 
2686   if (isNot(MRI, *UseMI)) {
2687     Register NegatedCond = UseMI->getOperand(0).getReg();
2688     if (!MRI.hasOneNonDBGUse(NegatedCond))
2689       return nullptr;
2690 
2691     // We're deleting the def of this value, so we need to remove it.
2692     UseMI->eraseFromParent();
2693 
2694     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2695     Negated = true;
2696   }
2697 
2698   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2699     return nullptr;
2700 
2701   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2702   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2703   if (Next == Parent->end()) {
2704     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2705     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2706       return nullptr;
2707     UncondBrTarget = &*NextMBB;
2708   } else {
2709     if (Next->getOpcode() != AMDGPU::G_BR)
2710       return nullptr;
2711     Br = &*Next;
2712     UncondBrTarget = Br->getOperand(0).getMBB();
2713   }
2714 
2715   return UseMI;
2716 }
2717 
2718 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2719                                          const ArgDescriptor *Arg,
2720                                          const TargetRegisterClass *ArgRC,
2721                                          LLT ArgTy) const {
2722   MCRegister SrcReg = Arg->getRegister();
2723   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2724   assert(DstReg.isVirtual() && "Virtual register expected");
2725 
2726   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2727                                              ArgTy);
2728   if (Arg->isMasked()) {
2729     // TODO: Should we try to emit this once in the entry block?
2730     const LLT S32 = LLT::scalar(32);
2731     const unsigned Mask = Arg->getMask();
2732     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
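         // e.g. a value packed into bits [19:10] has Mask == 0xffc00 and
         // Shift == 10, and is extracted below as (reg >> 10) & 0x3ff.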
2733 
2734     Register AndMaskSrc = LiveIn;
2735 
2736     if (Shift != 0) {
2737       auto ShiftAmt = B.buildConstant(S32, Shift);
2738       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2739     }
2740 
2741     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2742   } else {
2743     B.buildCopy(DstReg, LiveIn);
2744   }
2745 
2746   return true;
2747 }
2748 
2749 bool AMDGPULegalizerInfo::loadInputValue(
2750     Register DstReg, MachineIRBuilder &B,
2751     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2752   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2753   const ArgDescriptor *Arg;
2754   const TargetRegisterClass *ArgRC;
2755   LLT ArgTy;
2756   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2757 
2758   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2759     return false; // TODO: Handle these
2760   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2761 }
2762 
2763 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2764     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2765     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2766   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2767     return false;
2768 
2769   MI.eraseFromParent();
2770   return true;
2771 }
2772 
2773 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2774                                        MachineRegisterInfo &MRI,
2775                                        MachineIRBuilder &B) const {
2776   Register Dst = MI.getOperand(0).getReg();
2777   LLT DstTy = MRI.getType(Dst);
2778   LLT S16 = LLT::scalar(16);
2779   LLT S32 = LLT::scalar(32);
2780   LLT S64 = LLT::scalar(64);
2781 
2782   if (DstTy == S16)
2783     return legalizeFDIV16(MI, MRI, B);
2784   if (DstTy == S32)
2785     return legalizeFDIV32(MI, MRI, B);
2786   if (DstTy == S64)
2787     return legalizeFDIV64(MI, MRI, B);
2788 
2789   return false;
2790 }
2791 
2792 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2793                                                   Register DstReg,
2794                                                   Register X,
2795                                                   Register Y,
2796                                                   bool IsDiv) const {
2797   const LLT S1 = LLT::scalar(1);
2798   const LLT S32 = LLT::scalar(32);
2799 
2800   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2801   // algorithm used here.
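       // Roughly: estimate z ~= 2^32 / y with v_rcp_iflag_f32, refine it with
       // one Newton-Raphson step, compute q = umulh(x, z) and r = x - q * y,
       // then correct q and r at most twice.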
2802 
2803   // Initial estimate of inv(y).
2804   auto FloatY = B.buildUITOFP(S32, Y);
2805   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2806   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2807   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2808   auto Z = B.buildFPTOUI(S32, ScaledY);
2809 
2810   // One round of UNR.
2811   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2812   auto NegYZ = B.buildMul(S32, NegY, Z);
2813   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2814 
2815   // Quotient/remainder estimate.
2816   auto Q = B.buildUMulH(S32, X, Z);
2817   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2818 
2819   // First quotient/remainder refinement.
2820   auto One = B.buildConstant(S32, 1);
2821   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2822   if (IsDiv)
2823     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2824   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2825 
2826   // Second quotient/remainder refinement.
2827   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2828   if (IsDiv)
2829     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2830   else
2831     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2832 }
2833 
2834 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2835 //
2836 // Return lo, hi of result
2837 //
2838 // %cvt.lo = G_UITOFP Val.lo
2839 // %cvt.hi = G_UITOFP Val.hi
2840 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2841 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2842 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2843 // %mul2 = G_FMUL %mul1, 2**(-32)
2844 // %trunc = G_INTRINSIC_TRUNC %mul2
2845 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2846 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2847 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2848                                                        Register Val) {
2849   const LLT S32 = LLT::scalar(32);
2850   auto Unmerge = B.buildUnmerge(S32, Val);
2851 
2852   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2853   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2854 
2855   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2856                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2857 
2858   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2859   auto Mul1 =
2860       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2861 
2862   // 2**(-32)
2863   auto Mul2 =
2864       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2865   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2866 
2867   // -(2**32)
2868   auto Mad2 = B.buildFMAD(S32, Trunc,
2869                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2870 
2871   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2872   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2873 
2874   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2875 }
2876 
2877 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2878                                                   Register DstReg,
2879                                                   Register Numer,
2880                                                   Register Denom,
2881                                                   bool IsDiv) const {
2882   const LLT S32 = LLT::scalar(32);
2883   const LLT S64 = LLT::scalar(64);
2884   const LLT S1 = LLT::scalar(1);
2885   Register RcpLo, RcpHi;
2886 
2887   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2888 
2889   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2890 
2891   auto Zero64 = B.buildConstant(S64, 0);
2892   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2893 
2894   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2895   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2896 
2897   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2898   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2899   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2900 
2901   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2902   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2903   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2904   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2905 
2906   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2907   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2908   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2909   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2910   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2911 
2912   auto Zero32 = B.buildConstant(S32, 0);
2913   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2914   auto Add2_HiC =
2915       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2916   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2917   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2918 
2919   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2920   Register NumerLo = UnmergeNumer.getReg(0);
2921   Register NumerHi = UnmergeNumer.getReg(1);
2922 
2923   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2924   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2925   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2926   Register Mul3_Lo = UnmergeMul3.getReg(0);
2927   Register Mul3_Hi = UnmergeMul3.getReg(1);
2928   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2929   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2930   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2931   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2932 
2933   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2934   Register DenomLo = UnmergeDenom.getReg(0);
2935   Register DenomHi = UnmergeDenom.getReg(1);
2936 
2937   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2938   auto C1 = B.buildSExt(S32, CmpHi);
2939 
2940   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2941   auto C2 = B.buildSExt(S32, CmpLo);
2942 
2943   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2944   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2945 
2946   // TODO: Here and below portions of the code can be enclosed into if/endif.
2947   // Currently control flow is unconditional and we have 4 selects after
2948   // potential endif to substitute PHIs.
2949 
2950   // if C3 != 0 ...
2951   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2952   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2953   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2954   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2955 
2956   auto One64 = B.buildConstant(S64, 1);
2957   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2958 
2959   auto C4 =
2960       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2961   auto C5 =
2962       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2963   auto C6 = B.buildSelect(
2964       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2965 
2966   // if (C6 != 0)
2967   auto Add4 = B.buildAdd(S64, Add3, One64);
2968   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2969 
2970   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2971   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2972   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2973 
2974   // endif C6
2975   // endif C3
2976 
2977   if (IsDiv) {
2978     auto Sel1 = B.buildSelect(
2979         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2980     B.buildSelect(DstReg,
2981                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2982   } else {
2983     auto Sel2 = B.buildSelect(
2984         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2985     B.buildSelect(DstReg,
2986                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2987   }
2988 }
2989 
2990 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2991                                             MachineRegisterInfo &MRI,
2992                                             MachineIRBuilder &B) const {
2993   const LLT S64 = LLT::scalar(64);
2994   const LLT S32 = LLT::scalar(32);
2995   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2996   Register DstReg = MI.getOperand(0).getReg();
2997   Register Num = MI.getOperand(1).getReg();
2998   Register Den = MI.getOperand(2).getReg();
2999   LLT Ty = MRI.getType(DstReg);
3000 
3001   if (Ty == S32)
3002     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3003   else if (Ty == S64)
3004     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3005   else
3006     return false;
3007 
3008   MI.eraseFromParent();
3009   return true;
3011 }
3012 
3013 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3014                                             MachineRegisterInfo &MRI,
3015                                             MachineIRBuilder &B) const {
3016   const LLT S64 = LLT::scalar(64);
3017   const LLT S32 = LLT::scalar(32);
3018 
3019   Register DstReg = MI.getOperand(0).getReg();
3020   const LLT Ty = MRI.getType(DstReg);
3021   if (Ty != S32 && Ty != S64)
3022     return false;
3023 
3024   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3025 
3026   Register LHS = MI.getOperand(1).getReg();
3027   Register RHS = MI.getOperand(2).getReg();
3028 
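  // Lower the signed operation via the unsigned helper: s = x >> (width - 1)
  // is 0 or -1, and (x + s) ^ s == |x|. The result sign (quotient:
  // sign(LHS) ^ sign(RHS), remainder: sign(LHS)) is restored below with the
  // inverse xor/sub.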
3029   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3030   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3031   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3032 
3033   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3034   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3035 
3036   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3037   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3038 
3039   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3040   if (Ty == S32)
3041     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3042   else
3043     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3044 
3045   Register Sign;
3046   if (IsDiv)
3047     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3048   else
3049     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3050 
3051   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3052   B.buildSub(DstReg, UDivRem, Sign);
3053 
3054   MI.eraseFromParent();
3055   return true;
3056 }
3057 
3058 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3059                                                  MachineRegisterInfo &MRI,
3060                                                  MachineIRBuilder &B) const {
3061   Register Res = MI.getOperand(0).getReg();
3062   Register LHS = MI.getOperand(1).getReg();
3063   Register RHS = MI.getOperand(2).getReg();
3064   uint16_t Flags = MI.getFlags();
3065   LLT ResTy = MRI.getType(Res);
3066 
3067   const MachineFunction &MF = B.getMF();
3068   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3069                             MI.getFlag(MachineInstr::FmAfn);
3070 
3071   if (!AllowInaccurateRcp)
3072     return false;
3073 
3074   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3075     // 1 / x -> RCP(x)
3076     if (CLHS->isExactlyValue(1.0)) {
3077       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3078         .addUse(RHS)
3079         .setMIFlags(Flags);
3080 
3081       MI.eraseFromParent();
3082       return true;
3083     }
3084 
3085     // -1 / x -> RCP( FNEG(x) )
3086     if (CLHS->isExactlyValue(-1.0)) {
3087       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3088       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3089         .addUse(FNeg.getReg(0))
3090         .setMIFlags(Flags);
3091 
3092       MI.eraseFromParent();
3093       return true;
3094     }
3095   }
3096 
3097   // x / y -> x * (1.0 / y)
3098   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3099     .addUse(RHS)
3100     .setMIFlags(Flags);
3101   B.buildFMul(Res, LHS, RCP, Flags);
3102 
3103   MI.eraseFromParent();
3104   return true;
3105 }
3106 
3107 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3108                                                    MachineRegisterInfo &MRI,
3109                                                    MachineIRBuilder &B) const {
3110   Register Res = MI.getOperand(0).getReg();
3111   Register X = MI.getOperand(1).getReg();
3112   Register Y = MI.getOperand(2).getReg();
3113   uint16_t Flags = MI.getFlags();
3114   LLT ResTy = MRI.getType(Res);
3115 
3116   const MachineFunction &MF = B.getMF();
3117   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3118                             MI.getFlag(MachineInstr::FmAfn);
3119 
3120   if (!AllowInaccurateRcp)
3121     return false;
3122 
3123   auto NegY = B.buildFNeg(ResTy, Y);
3124   auto One = B.buildFConstant(ResTy, 1.0);
3125 
3126   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3127     .addUse(Y)
3128     .setMIFlags(Flags);
3129 
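  // Two Newton-Raphson steps refine r towards 1/y: each computes the residual
  // 1 - y*r with an FMA and folds it back into r. The final FMA pair forms
  // x*r and corrects it by (x - y*(x*r)) * r.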
3130   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3131   R = B.buildFMA(ResTy, Tmp0, R, R);
3132 
3133   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3134   R = B.buildFMA(ResTy, Tmp1, R, R);
3135 
3136   auto Ret = B.buildFMul(ResTy, X, R);
3137   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3138 
3139   B.buildFMA(Res, Tmp2, R, Ret);
3140   MI.eraseFromParent();
3141   return true;
3142 }
3143 
3144 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3145                                          MachineRegisterInfo &MRI,
3146                                          MachineIRBuilder &B) const {
3147   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3148     return true;
3149 
3150   Register Res = MI.getOperand(0).getReg();
3151   Register LHS = MI.getOperand(1).getReg();
3152   Register RHS = MI.getOperand(2).getReg();
3153 
3154   uint16_t Flags = MI.getFlags();
3155 
3156   LLT S16 = LLT::scalar(16);
3157   LLT S32 = LLT::scalar(32);
3158 
3159   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3160   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3161 
3162   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3163     .addUse(RHSExt.getReg(0))
3164     .setMIFlags(Flags);
3165 
3166   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3167   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3168 
3169   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3170     .addUse(RDst.getReg(0))
3171     .addUse(RHS)
3172     .addUse(LHS)
3173     .setMIFlags(Flags);
3174 
3175   MI.eraseFromParent();
3176   return true;
3177 }
3178 
3179 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3180 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3181 static void toggleSPDenormMode(bool Enable,
3182                                MachineIRBuilder &B,
3183                                const GCNSubtarget &ST,
3184                                AMDGPU::SIModeRegisterDefaults Mode) {
3185   // Set SP denorm mode to this value.
3186   unsigned SPDenormMode =
3187     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3188 
3189   if (ST.hasDenormModeInst()) {
3190     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3191     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3192 
3193     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3194     B.buildInstr(AMDGPU::S_DENORM_MODE)
3195       .addImm(NewDenormModeValue);
3196 
3197   } else {
3198     // Select FP32 bit field in mode register.
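    // The operand encodes hwreg(HW_REG_MODE, offset 4, width 2), i.e. the two
    // FP32 denorm control bits (the low half of the 4-bit denorm field that
    // the S_DENORM_MODE path above writes in one go).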
3199     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3200                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3201                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3202 
3203     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3204       .addImm(SPDenormMode)
3205       .addImm(SPDenormModeBitField);
3206   }
3207 }
3208 
3209 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3210                                          MachineRegisterInfo &MRI,
3211                                          MachineIRBuilder &B) const {
3212   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3213     return true;
3214 
3215   Register Res = MI.getOperand(0).getReg();
3216   Register LHS = MI.getOperand(1).getReg();
3217   Register RHS = MI.getOperand(2).getReg();
3218   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3219   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3220 
3221   uint16_t Flags = MI.getFlags();
3222 
3223   LLT S32 = LLT::scalar(32);
3224   LLT S1 = LLT::scalar(1);
3225 
3226   auto One = B.buildFConstant(S32, 1.0f);
3227 
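  // amdgcn.div.scale returns the operand selected by the trailing immediate,
  // scaled if needed to keep the reciprocal iteration below in range, plus a
  // flag that div_fmas later uses to undo the scaling on the final result.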
3228   auto DenominatorScaled =
3229     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3230       .addUse(LHS)
3231       .addUse(RHS)
3232       .addImm(0)
3233       .setMIFlags(Flags);
3234   auto NumeratorScaled =
3235     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3236       .addUse(LHS)
3237       .addUse(RHS)
3238       .addImm(1)
3239       .setMIFlags(Flags);
3240 
3241   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3242     .addUse(DenominatorScaled.getReg(0))
3243     .setMIFlags(Flags);
3244   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3245 
3246   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3247   // aren't modeled as reading it.
3248   if (!Mode.allFP32Denormals())
3249     toggleSPDenormMode(true, B, ST, Mode);
3250 
3251   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3252   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3253   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3254   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3255   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3256   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3257 
3258   if (!Mode.allFP32Denormals())
3259     toggleSPDenormMode(false, B, ST, Mode);
3260 
3261   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3262     .addUse(Fma4.getReg(0))
3263     .addUse(Fma1.getReg(0))
3264     .addUse(Fma3.getReg(0))
3265     .addUse(NumeratorScaled.getReg(1))
3266     .setMIFlags(Flags);
3267 
3268   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3269     .addUse(Fmas.getReg(0))
3270     .addUse(RHS)
3271     .addUse(LHS)
3272     .setMIFlags(Flags);
3273 
3274   MI.eraseFromParent();
3275   return true;
3276 }
3277 
3278 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3279                                          MachineRegisterInfo &MRI,
3280                                          MachineIRBuilder &B) const {
3281   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3282     return true;
3283 
3284   Register Res = MI.getOperand(0).getReg();
3285   Register LHS = MI.getOperand(1).getReg();
3286   Register RHS = MI.getOperand(2).getReg();
3287 
3288   uint16_t Flags = MI.getFlags();
3289 
3290   LLT S64 = LLT::scalar(64);
3291   LLT S1 = LLT::scalar(1);
3292 
3293   auto One = B.buildFConstant(S64, 1.0);
3294 
3295   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3296     .addUse(LHS)
3297     .addUse(RHS)
3298     .addImm(0)
3299     .setMIFlags(Flags);
3300 
3301   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3302 
3303   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3304     .addUse(DivScale0.getReg(0))
3305     .setMIFlags(Flags);
3306 
3307   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3308   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3309   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3310 
3311   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3312     .addUse(LHS)
3313     .addUse(RHS)
3314     .addImm(1)
3315     .setMIFlags(Flags);
3316 
3317   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3318   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3319   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3320 
3321   Register Scale;
3322   if (!ST.hasUsableDivScaleConditionOutput()) {
3323     // Workaround a hardware bug on SI where the condition output from div_scale
3324     // is not usable.
3325 
3326     LLT S32 = LLT::scalar(32);
3327 
3328     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3329     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3330     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3331     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3332 
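    // Reconstruct the missing condition bit: compare the high halves of the
    // numerator and denominator against the high halves of the corresponding
    // div_scale results to see whether each operand was scaled, and xor the
    // two tests.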
3333     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3334                               Scale1Unmerge.getReg(1));
3335     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3336                               Scale0Unmerge.getReg(1));
3337     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3338   } else {
3339     Scale = DivScale1.getReg(1);
3340   }
3341 
3342   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3343     .addUse(Fma4.getReg(0))
3344     .addUse(Fma3.getReg(0))
3345     .addUse(Mul.getReg(0))
3346     .addUse(Scale)
3347     .setMIFlags(Flags);
3348 
3349   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3350     .addUse(Fmas.getReg(0))
3351     .addUse(RHS)
3352     .addUse(LHS)
3353     .setMIFlags(Flags);
3354 
3355   MI.eraseFromParent();
3356   return true;
3357 }
3358 
3359 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3360                                                  MachineRegisterInfo &MRI,
3361                                                  MachineIRBuilder &B) const {
3362   Register Res = MI.getOperand(0).getReg();
3363   Register LHS = MI.getOperand(2).getReg();
3364   Register RHS = MI.getOperand(3).getReg();
3365   uint16_t Flags = MI.getFlags();
3366 
3367   LLT S32 = LLT::scalar(32);
3368   LLT S1 = LLT::scalar(1);
3369 
3370   auto Abs = B.buildFAbs(S32, RHS, Flags);
3371   const APFloat C0Val(1.0f);
3372 
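  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: when |rhs| exceeds 2^96, the
  // denominator is pre-scaled by 2^-32 so its reciprocal stays in range, and
  // the final multiply by the same scale factor compensates; otherwise the
  // scale is 1.0.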
3373   auto C0 = B.buildConstant(S32, 0x6f800000);
3374   auto C1 = B.buildConstant(S32, 0x2f800000);
3375   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3376 
3377   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3378   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3379 
3380   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3381 
3382   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3383     .addUse(Mul0.getReg(0))
3384     .setMIFlags(Flags);
3385 
3386   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3387 
3388   B.buildFMul(Res, Sel, Mul1, Flags);
3389 
3390   MI.eraseFromParent();
3391   return true;
3392 }
3393 
3394 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3395 // FIXME: Why do we handle this one but not other removed instructions?
3396 //
3397 // Reciprocal square root.  The clamp prevents infinite results, clamping
3398 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3399 // +-max_float.
3400 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3401                                                     MachineRegisterInfo &MRI,
3402                                                     MachineIRBuilder &B) const {
3403   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3404     return true;
3405 
3406   Register Dst = MI.getOperand(0).getReg();
3407   Register Src = MI.getOperand(2).getReg();
3408   auto Flags = MI.getFlags();
3409 
3410   LLT Ty = MRI.getType(Dst);
3411 
3412   const fltSemantics *FltSemantics;
3413   if (Ty == LLT::scalar(32))
3414     FltSemantics = &APFloat::IEEEsingle();
3415   else if (Ty == LLT::scalar(64))
3416     FltSemantics = &APFloat::IEEEdouble();
3417   else
3418     return false;
3419 
3420   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3421     .addUse(Src)
3422     .setMIFlags(Flags);
3423 
3424   // We don't need to concern ourselves with the snan handling difference here;
3425   // the rsq has already quieted (or not), so use the variant that will select directly.
3426   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3427   const bool UseIEEE = MFI->getMode().IEEE;
3428 
3429   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3430   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3431                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3432 
3433   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3434 
3435   if (UseIEEE)
3436     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3437   else
3438     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3439   MI.eraseFromParent();
3440   return true;
3441 }
3442 
3443 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3444   switch (IID) {
3445   case Intrinsic::amdgcn_ds_fadd:
3446     return AMDGPU::G_ATOMICRMW_FADD;
3447   case Intrinsic::amdgcn_ds_fmin:
3448     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3449   case Intrinsic::amdgcn_ds_fmax:
3450     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3451   default:
3452     llvm_unreachable("not a DS FP intrinsic");
3453   }
3454 }
3455 
3456 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3457                                                       MachineInstr &MI,
3458                                                       Intrinsic::ID IID) const {
3459   GISelChangeObserver &Observer = Helper.Observer;
3460   Observer.changingInstr(MI);
3461 
3462   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3463 
3464   // The remaining operands were used to set fields in the MemOperand on
3465   // construction.
3466   for (int I = 6; I > 3; --I)
3467     MI.RemoveOperand(I);
3468 
3469   MI.RemoveOperand(1); // Remove the intrinsic ID.
3470   Observer.changedInstr(MI);
3471   return true;
3472 }
3473 
3474 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3475                                             MachineRegisterInfo &MRI,
3476                                             MachineIRBuilder &B) const {
3477   uint64_t Offset =
3478     ST.getTargetLowering()->getImplicitParameterOffset(
3479       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3480   LLT DstTy = MRI.getType(DstReg);
3481   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3482 
3483   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3484   if (!loadInputValue(KernargPtrReg, B,
3485                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3486     return false;
3487 
3488   // FIXME: This should be nuw
3489   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3490   return true;
3491 }
3492 
3493 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3494                                                  MachineRegisterInfo &MRI,
3495                                                  MachineIRBuilder &B) const {
3496   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3497   if (!MFI->isEntryFunction()) {
3498     return legalizePreloadedArgIntrin(MI, MRI, B,
3499                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3500   }
3501 
3502   Register DstReg = MI.getOperand(0).getReg();
3503   if (!getImplicitArgPtr(DstReg, MRI, B))
3504     return false;
3505 
3506   MI.eraseFromParent();
3507   return true;
3508 }
3509 
3510 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3511                                               MachineRegisterInfo &MRI,
3512                                               MachineIRBuilder &B,
3513                                               unsigned AddrSpace) const {
3514   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3515   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3516   Register Hi32 = Unmerge.getReg(1);
3517 
3518   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3519   MI.eraseFromParent();
3520   return true;
3521 }
3522 
3523 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3524 // offset (the offset that is included in bounds checking and swizzling, to be
3525 // split between the instruction's voffset and immoffset fields) and soffset
3526 // (the offset that is excluded from bounds checking and swizzling, to go in
3527 // the instruction's soffset field).  This function takes the first kind of
3528 // offset and figures out how to split it between voffset and immoffset.
3529 std::tuple<Register, unsigned, unsigned>
3530 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3531                                         Register OrigOffset) const {
3532   const unsigned MaxImm = 4095;
3533   Register BaseReg;
3534   unsigned TotalConstOffset;
3535   const LLT S32 = LLT::scalar(32);
3536   MachineRegisterInfo &MRI = *B.getMRI();
3537 
3538   std::tie(BaseReg, TotalConstOffset) =
3539       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3540 
3541   unsigned ImmOffset = TotalConstOffset;
3542 
3543   // If BaseReg is a pointer, convert it to int.
3544   if (MRI.getType(BaseReg).isPointer())
3545     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3546 
3547   // If the immediate value is too big for the immoffset field, keep only the
3548   // low 12 bits (the value modulo 4096) in the immoffset field, so that the
3549   // part copied/added into the voffset field is a multiple of 4096 and stands
3550   // a better chance of being CSEd with the copy/add for another similar
3551   // load/store. However, do not round down to a multiple of 4096 if that is
3552   // negative, as it appears to be illegal to have a negative offset in the
3553   // vgpr, even if adding the immediate offset makes it positive.
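  // For example, a total constant offset of 8200 becomes ImmOffset = 8 with
  // Overflow = 8192 folded into the voffset register below.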
3554   unsigned Overflow = ImmOffset & ~MaxImm;
3555   ImmOffset -= Overflow;
3556   if ((int32_t)Overflow < 0) {
3557     Overflow += ImmOffset;
3558     ImmOffset = 0;
3559   }
3560 
3561   if (Overflow != 0) {
3562     if (!BaseReg) {
3563       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3564     } else {
3565       auto OverflowVal = B.buildConstant(S32, Overflow);
3566       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3567     }
3568   }
3569 
3570   if (!BaseReg)
3571     BaseReg = B.buildConstant(S32, 0).getReg(0);
3572 
3573   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3574 }
3575 
3576 /// Handle register layout difference for f16 images for some subtargets.
3577 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3578                                              MachineRegisterInfo &MRI,
3579                                              Register Reg,
3580                                              bool ImageStore) const {
3581   const LLT S16 = LLT::scalar(16);
3582   const LLT S32 = LLT::scalar(32);
3583   LLT StoreVT = MRI.getType(Reg);
3584   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3585 
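  // On subtargets with unpacked d16 memory instructions each 16-bit element
  // occupies the low half of its own 32-bit register, so e.g. <4 x s16> data
  // becomes <4 x s32> with every element any-extended.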
3586   if (ST.hasUnpackedD16VMem()) {
3587     auto Unmerge = B.buildUnmerge(S16, Reg);
3588 
3589     SmallVector<Register, 4> WideRegs;
3590     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3591       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3592 
3593     int NumElts = StoreVT.getNumElements();
3594 
3595     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3596   }
3597 
3598   if (ImageStore && ST.hasImageStoreD16Bug()) {
3599     if (StoreVT.getNumElements() == 2) {
3600       SmallVector<Register, 4> PackedRegs;
3601       Reg = B.buildBitcast(S32, Reg).getReg(0);
3602       PackedRegs.push_back(Reg);
3603       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3604       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3605     }
3606 
3607     if (StoreVT.getNumElements() == 3) {
3608       SmallVector<Register, 4> PackedRegs;
3609       auto Unmerge = B.buildUnmerge(S16, Reg);
3610       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3611         PackedRegs.push_back(Unmerge.getReg(I));
3612       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3613       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3614       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3615     }
3616 
3617     if (StoreVT.getNumElements() == 4) {
3618       SmallVector<Register, 4> PackedRegs;
3619       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3620       auto Unmerge = B.buildUnmerge(S32, Reg);
3621       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3622         PackedRegs.push_back(Unmerge.getReg(I));
3623       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3624       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3625     }
3626 
3627     llvm_unreachable("invalid data type");
3628   }
3629 
3630   return Reg;
3631 }
3632 
3633 Register AMDGPULegalizerInfo::fixStoreSourceType(
3634   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3635   MachineRegisterInfo *MRI = B.getMRI();
3636   LLT Ty = MRI->getType(VData);
3637 
3638   const LLT S16 = LLT::scalar(16);
3639 
3640   // Fixup illegal register types for i8 and i16 stores.
3641   if (Ty == LLT::scalar(8) || Ty == S16) {
3642     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3643     return AnyExt;
3644   }
3645 
3646   if (Ty.isVector()) {
3647     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3648       if (IsFormat)
3649         return handleD16VData(B, *MRI, VData);
3650     }
3651   }
3652 
3653   return VData;
3654 }
3655 
3656 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3657                                               MachineRegisterInfo &MRI,
3658                                               MachineIRBuilder &B,
3659                                               bool IsTyped,
3660                                               bool IsFormat) const {
3661   Register VData = MI.getOperand(1).getReg();
3662   LLT Ty = MRI.getType(VData);
3663   LLT EltTy = Ty.getScalarType();
3664   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3665   const LLT S32 = LLT::scalar(32);
3666 
3667   VData = fixStoreSourceType(B, VData, IsFormat);
3668   Register RSrc = MI.getOperand(2).getReg();
3669 
3670   MachineMemOperand *MMO = *MI.memoperands_begin();
3671   const int MemSize = MMO->getSize();
3672 
3673   unsigned ImmOffset;
3674   unsigned TotalOffset;
3675 
3676   // The typed intrinsics add an immediate after the registers.
3677   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3678 
3679   // The struct intrinsic variants add one additional operand over raw.
3680   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3681   Register VIndex;
3682   int OpOffset = 0;
3683   if (HasVIndex) {
3684     VIndex = MI.getOperand(3).getReg();
3685     OpOffset = 1;
3686   }
3687 
3688   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3689   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3690 
3691   unsigned Format = 0;
3692   if (IsTyped) {
3693     Format = MI.getOperand(5 + OpOffset).getImm();
3694     ++OpOffset;
3695   }
3696 
3697   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3698 
3699   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3700   if (TotalOffset != 0)
3701     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3702 
3703   unsigned Opc;
3704   if (IsTyped) {
3705     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3706                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3707   } else if (IsFormat) {
3708     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3709                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3710   } else {
3711     switch (MemSize) {
3712     case 1:
3713       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3714       break;
3715     case 2:
3716       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3717       break;
3718     default:
3719       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3720       break;
3721     }
3722   }
3723 
3724   if (!VIndex)
3725     VIndex = B.buildConstant(S32, 0).getReg(0);
3726 
3727   auto MIB = B.buildInstr(Opc)
3728     .addUse(VData)              // vdata
3729     .addUse(RSrc)               // rsrc
3730     .addUse(VIndex)             // vindex
3731     .addUse(VOffset)            // voffset
3732     .addUse(SOffset)            // soffset
3733     .addImm(ImmOffset);         // offset(imm)
3734 
3735   if (IsTyped)
3736     MIB.addImm(Format);
3737 
3738   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3739      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3740      .addMemOperand(MMO);
3741 
3742   MI.eraseFromParent();
3743   return true;
3744 }
3745 
3746 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3747                                              MachineRegisterInfo &MRI,
3748                                              MachineIRBuilder &B,
3749                                              bool IsFormat,
3750                                              bool IsTyped) const {
3751   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3752   MachineMemOperand *MMO = *MI.memoperands_begin();
3753   const int MemSize = MMO->getSize();
3754   const LLT S32 = LLT::scalar(32);
3755 
3756   Register Dst = MI.getOperand(0).getReg();
3757   Register RSrc = MI.getOperand(2).getReg();
3758 
3759   // The typed intrinsics add an immediate after the registers.
3760   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3761 
3762   // The struct intrinsic variants add one additional operand over raw.
3763   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3764   Register VIndex;
3765   int OpOffset = 0;
3766   if (HasVIndex) {
3767     VIndex = MI.getOperand(3).getReg();
3768     OpOffset = 1;
3769   }
3770 
3771   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3772   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3773 
3774   unsigned Format = 0;
3775   if (IsTyped) {
3776     Format = MI.getOperand(5 + OpOffset).getImm();
3777     ++OpOffset;
3778   }
3779 
3780   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3781   unsigned ImmOffset;
3782   unsigned TotalOffset;
3783 
3784   LLT Ty = MRI.getType(Dst);
3785   LLT EltTy = Ty.getScalarType();
3786   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3787   const bool Unpacked = ST.hasUnpackedD16VMem();
3788 
3789   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3790   if (TotalOffset != 0)
3791     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3792 
3793   unsigned Opc;
3794 
3795   if (IsTyped) {
3796     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3797                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3798   } else if (IsFormat) {
3799     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3800                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3801   } else {
3802     switch (MemSize) {
3803     case 1:
3804       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3805       break;
3806     case 2:
3807       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3808       break;
3809     default:
3810       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3811       break;
3812     }
3813   }
3814 
3815   Register LoadDstReg;
3816 
3817   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3818   LLT UnpackedTy = Ty.changeElementSize(32);
3819 
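  // For example, a raw byte load widens to an s32 result and truncates back
  // afterwards, while a d16 <4 x s16> format load on an unpacked subtarget
  // loads <4 x s32> and repacks the elements.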
3820   if (IsExtLoad)
3821     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3822   else if (Unpacked && IsD16 && Ty.isVector())
3823     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3824   else
3825     LoadDstReg = Dst;
3826 
3827   if (!VIndex)
3828     VIndex = B.buildConstant(S32, 0).getReg(0);
3829 
3830   auto MIB = B.buildInstr(Opc)
3831     .addDef(LoadDstReg)         // vdata
3832     .addUse(RSrc)               // rsrc
3833     .addUse(VIndex)             // vindex
3834     .addUse(VOffset)            // voffset
3835     .addUse(SOffset)            // soffset
3836     .addImm(ImmOffset);         // offset(imm)
3837 
3838   if (IsTyped)
3839     MIB.addImm(Format);
3840 
3841   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3842      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3843      .addMemOperand(MMO);
3844 
3845   if (LoadDstReg != Dst) {
3846     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3847 
3848     // The result of an extending load was widened to 32 bits; truncate it back.
3849     if (IsExtLoad)
3850       B.buildTrunc(Dst, LoadDstReg);
3851     else {
3852       // Repack to original 16-bit vector result
3853       // FIXME: G_TRUNC should work, but legalization currently fails
3854       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3855       SmallVector<Register, 4> Repack;
3856       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3857         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3858       B.buildMerge(Dst, Repack);
3859     }
3860   }
3861 
3862   MI.eraseFromParent();
3863   return true;
3864 }
3865 
3866 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3867                                                MachineIRBuilder &B,
3868                                                bool IsInc) const {
3869   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3870                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3871   B.buildInstr(Opc)
3872     .addDef(MI.getOperand(0).getReg())
3873     .addUse(MI.getOperand(2).getReg())
3874     .addUse(MI.getOperand(3).getReg())
3875     .cloneMemRefs(MI);
3876   MI.eraseFromParent();
3877   return true;
3878 }
3879 
3880 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3881   switch (IntrID) {
3882   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3883   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3884     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3885   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3886   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3887     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3888   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3889   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3890     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3891   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3892   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3893     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3894   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3895   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3896     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3897   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3898   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3899     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3900   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3901   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3902     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3903   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3904   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3905     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3906   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3907   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3908     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3909   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3910   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3911     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3912   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3913   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3914     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3915   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3916   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3917     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3918   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3919   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3920     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3921   case Intrinsic::amdgcn_buffer_atomic_fadd:
3922   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3923   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3924     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3925   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3926   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3927     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3928   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3929   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3930     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3931   default:
3932     llvm_unreachable("unhandled atomic opcode");
3933   }
3934 }
3935 
3936 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3937                                                MachineIRBuilder &B,
3938                                                Intrinsic::ID IID) const {
3939   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3940                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3941   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3942 
3943   Register Dst;
3944 
3945   int OpOffset = 0;
3946   if (HasReturn) {
3947     // A few FP atomics do not support return values.
3948     Dst = MI.getOperand(0).getReg();
3949   } else {
3950     OpOffset = -1;
3951   }
3952 
3953   Register VData = MI.getOperand(2 + OpOffset).getReg();
3954   Register CmpVal;
3955 
3956   if (IsCmpSwap) {
3957     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3958     ++OpOffset;
3959   }
3960 
3961   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3962   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3963 
3964   // The struct intrinsic variants add one additional operand over raw.
3965   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3966   Register VIndex;
3967   if (HasVIndex) {
3968     VIndex = MI.getOperand(4 + OpOffset).getReg();
3969     ++OpOffset;
3970   }
3971 
3972   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3973   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3974   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3975 
3976   MachineMemOperand *MMO = *MI.memoperands_begin();
3977 
3978   unsigned ImmOffset;
3979   unsigned TotalOffset;
3980   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3981   if (TotalOffset != 0)
3982     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3983 
3984   if (!VIndex)
3985     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3986 
3987   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3988 
3989   if (HasReturn)
3990     MIB.addDef(Dst);
3991 
3992   MIB.addUse(VData); // vdata
3993 
3994   if (IsCmpSwap)
3995     MIB.addReg(CmpVal);
3996 
3997   MIB.addUse(RSrc)               // rsrc
3998      .addUse(VIndex)             // vindex
3999      .addUse(VOffset)            // voffset
4000      .addUse(SOffset)            // soffset
4001      .addImm(ImmOffset)          // offset(imm)
4002      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4003      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4004      .addMemOperand(MMO);
4005 
4006   MI.eraseFromParent();
4007   return true;
4008 }
4009 
4010 /// Pack the s16 typed addressing operands of \p MI into dword sized <2 x s16>
4011 /// registers, appending the packed values to \p PackedAddrs.
4012 static void packImageA16AddressToDwords(
4013     MachineIRBuilder &B, MachineInstr &MI,
4014     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4015     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4016   const LLT S16 = LLT::scalar(16);
4017   const LLT V2S16 = LLT::vector(2, 16);
4018 
4019   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4020     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4021     if (!SrcOp.isReg())
4022       continue; // _L to _LZ may have eliminated this.
4023 
4024     Register AddrReg = SrcOp.getReg();
4025 
4026     if (I < Intr->GradientStart) {
4027       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4028       PackedAddrs.push_back(AddrReg);
4029     } else {
4030       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4031       // derivatives dx/dh and dx/dv are packed with undef.
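      // For example, a pair of 16-bit gradient or coordinate values packs into
      // one <2 x s16>, while a lone trailing value is paired with an undef s16.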
4032       if (((I + 1) >= EndIdx) ||
4033           ((Intr->NumGradients / 2) % 2 == 1 &&
4034            (I == static_cast<unsigned>(Intr->GradientStart +
4035                                        (Intr->NumGradients / 2) - 1) ||
4036             I == static_cast<unsigned>(Intr->GradientStart +
4037                                        Intr->NumGradients - 1))) ||
4038           // Check for _L to _LZ optimization
4039           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4040         PackedAddrs.push_back(
4041             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4042                 .getReg(0));
4043       } else {
4044         PackedAddrs.push_back(
4045             B.buildBuildVector(
4046                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4047                 .getReg(0));
4048         ++I;
4049       }
4050     }
4051   }
4052 }
4053 
4054 /// Convert from separate vaddr components to a single vector address register,
4055 /// and replace the remaining operands with $noreg.
4056 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4057                                      int DimIdx, int NumVAddrs) {
4058   const LLT S32 = LLT::scalar(32);
4059 
4060   SmallVector<Register, 8> AddrRegs;
4061   for (int I = 0; I != NumVAddrs; ++I) {
4062     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4063     if (SrcOp.isReg()) {
4064       AddrRegs.push_back(SrcOp.getReg());
4065       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4066     }
4067   }
4068 
4069   int NumAddrRegs = AddrRegs.size();
4070   if (NumAddrRegs != 1) {
4071     // Round up to 8 elements for v5-v7
4072     // FIXME: Missing intermediate sized register classes and instructions.
4073     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4074       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4075       auto Undef = B.buildUndef(S32);
4076       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4077       NumAddrRegs = RoundedNumRegs;
4078     }
4079 
4080     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4081     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4082   }
4083 
4084   for (int I = 1; I != NumVAddrs; ++I) {
4085     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4086     if (SrcOp.isReg())
4087       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4088   }
4089 }
4090 
4091 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4092 ///
4093 /// Depending on the subtarget, load/store with 16-bit element data need to be
4094 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4095 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4096 /// registers.
4097 ///
4098 /// We don't want to directly select image instructions just yet, but we also
4099 /// want to expose all register repacking to the legalizer/combiners. We also
4100 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4101 /// defining a multitude of intermediate image instructions, directly hack on
4102 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4103 /// the now-unnecessary arguments with $noreg.
4104 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4105     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4106     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4107 
4108   const unsigned NumDefs = MI.getNumExplicitDefs();
4109   const unsigned ArgOffset = NumDefs + 1;
4110   bool IsTFE = NumDefs == 2;
4111   // We are only processing the operands of d16 image operations on subtargets
4112   // that use the unpacked register layout, or need to repack the TFE result.
4113 
4114   // TODO: Do we need to guard against already legalized intrinsics?
4115   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4116       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4117 
4118   MachineRegisterInfo *MRI = B.getMRI();
4119   const LLT S32 = LLT::scalar(32);
4120   const LLT S16 = LLT::scalar(16);
4121   const LLT V2S16 = LLT::vector(2, 16);
4122 
4123   unsigned DMask = 0;
4124 
4125   // Check for 16 bit addresses and pack if true.
4126   LLT GradTy =
4127       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4128   LLT AddrTy =
4129       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4130   const bool IsG16 = GradTy == S16;
4131   const bool IsA16 = AddrTy == S16;
4132 
4133   int DMaskLanes = 0;
4134   if (!BaseOpcode->Atomic) {
4135     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4136     if (BaseOpcode->Gather4) {
4137       DMaskLanes = 4;
4138     } else if (DMask != 0) {
4139       DMaskLanes = countPopulation(DMask);
4140     } else if (!IsTFE && !BaseOpcode->Store) {
4141       // If dmask is 0, this is a no-op load. This can be eliminated.
4142       B.buildUndef(MI.getOperand(0));
4143       MI.eraseFromParent();
4144       return true;
4145     }
4146   }
4147 
4148   Observer.changingInstr(MI);
4149   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4150 
4151   unsigned NewOpcode = NumDefs == 0 ?
4152     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4153 
4154   // Track that we legalized this
4155   MI.setDesc(B.getTII().get(NewOpcode));
4156 
4157   // Expecting to get an error flag since TFE is on and dmask is 0. Force dmask
4158   // to be at least 1, otherwise the instruction will fail.
4159   if (IsTFE && DMask == 0) {
4160     DMask = 0x1;
4161     DMaskLanes = 1;
4162     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4163   }
4164 
4165   if (BaseOpcode->Atomic) {
4166     Register VData0 = MI.getOperand(2).getReg();
4167     LLT Ty = MRI->getType(VData0);
4168 
4169     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4170     if (Ty.isVector())
4171       return false;
4172 
4173     if (BaseOpcode->AtomicX2) {
4174       Register VData1 = MI.getOperand(3).getReg();
4175       // The two values are packed in one register.
4176       LLT PackedTy = LLT::vector(2, Ty);
4177       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4178       MI.getOperand(2).setReg(Concat.getReg(0));
4179       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4180     }
4181   }
4182 
4183   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4184 
4185   // Optimize _L to _LZ when the LOD argument is zero.
4186   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4187           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4188     const ConstantFP *ConstantLod;
4189 
4190     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4191                  m_GFCst(ConstantLod))) {
4192       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4193         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4194         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4195             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4196                                                       Intr->Dim);
4197 
4198         // The starting indexes should remain in the same place.
4199         --CorrectedNumVAddrs;
4200 
4201         MI.getOperand(MI.getNumExplicitDefs())
4202             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4203         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4204         Intr = NewImageDimIntr;
4205       }
4206     }
4207   }
4208 
4209   // Optimize _mip away, when 'lod' is zero
4210   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4211     int64_t ConstantLod;
4212     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4213                  m_ICst(ConstantLod))) {
4214       if (ConstantLod == 0) {
4215         // TODO: Change the intrinsic opcode and remove the operand instead of
4216         // replacing it with 0, as is done for the _L to _LZ handling above.
4217         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4218         --CorrectedNumVAddrs;
4219       }
4220     }
4221   }
4222 
4223   // Rewrite the addressing register layout before doing anything else.
4224   if (IsA16 || IsG16) {
4225     if (IsA16) {
4226       // Target must support the feature and gradients need to be 16 bit too
4227       if (!ST.hasA16() || !IsG16)
4228         return false;
4229     } else if (!ST.hasG16())
4230       return false;
4231 
4232     if (Intr->NumVAddrs > 1) {
4233       SmallVector<Register, 4> PackedRegs;
4234       // Don't compress addresses for G16
4235       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4236       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4237                                   PackEndIdx);
4238 
4239       if (!IsA16) {
4240         // Add uncompressed address
4241         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4242           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4243           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4244           PackedRegs.push_back(AddrReg);
4245         }
4246       }
4247 
4248       // See also below in the non-a16 branch
4249       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4250 
4251       if (!UseNSA && PackedRegs.size() > 1) {
4252         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4253         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4254         PackedRegs[0] = Concat.getReg(0);
4255         PackedRegs.resize(1);
4256       }
4257 
4258       const unsigned NumPacked = PackedRegs.size();
4259       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4260         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4261         if (!SrcOp.isReg()) {
4262           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4263           continue;
4264         }
4265 
4266         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4267 
4268         if (I - Intr->VAddrStart < NumPacked)
4269           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4270         else
4271           SrcOp.setReg(AMDGPU::NoRegister);
4272       }
4273     }
4274   } else {
4275     // If the register allocator cannot place the address registers contiguously
4276     // without introducing moves, then using the non-sequential address encoding
4277     // is always preferable, since it saves VALU instructions and is usually a
4278     // wash in terms of code size or even better.
4279     //
4280     // However, we currently have no way of hinting to the register allocator
4281     // that MIMG addresses should be placed contiguously when it is possible to
4282     // do so, so force non-NSA for the common 2-address case as a heuristic.
4283     //
4284     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4285     // allocation when possible.
4286     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4287 
4288     if (!UseNSA && Intr->NumVAddrs > 1)
4289       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4290                                Intr->NumVAddrs);
4291   }
4292 
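  // Append a trailing immediate encoding the 16-bit address modes:
  // bit 0 = A16, bit 1 = G16.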
4293   int Flags = 0;
4294   if (IsA16)
4295     Flags |= 1;
4296   if (IsG16)
4297     Flags |= 2;
4298   MI.addOperand(MachineOperand::CreateImm(Flags));
4299 
4300   if (BaseOpcode->Store) { // No TFE for stores?
4301     // TODO: Handle dmask trim
4302     Register VData = MI.getOperand(1).getReg();
4303     LLT Ty = MRI->getType(VData);
4304     if (!Ty.isVector() || Ty.getElementType() != S16)
4305       return true;
4306 
4307     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4308     if (RepackedReg != VData) {
4309       MI.getOperand(1).setReg(RepackedReg);
4310     }
4311 
4312     return true;
4313   }
4314 
4315   Register DstReg = MI.getOperand(0).getReg();
4316   LLT Ty = MRI->getType(DstReg);
4317   const LLT EltTy = Ty.getScalarType();
4318   const bool IsD16 = Ty.getScalarType() == S16;
4319   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4320 
4321   // Confirm that the return type is large enough for the dmask specified
4322   if (NumElts < DMaskLanes)
4323     return false;
4324 
4325   if (NumElts > 4 || DMaskLanes > 4)
4326     return false;
4327 
4328   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4329   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4330 
  // The raw, dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4334   LLT RoundedTy;
4335 
  // S32 vector to cover all data, plus the TFE result element.
4337   LLT TFETy;
4338 
4339   // Register type to use for each loaded component. Will be S32 or V2S16.
4340   LLT RegTy;
4341 
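  // For example, a packed D16 <3 x s16> result rounds up to <4 x s16>; with
  // TFE the load instead produces <3 x s32>, i.e. two data dwords plus the
  // TFE status dword.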
4342   if (IsD16 && ST.hasUnpackedD16VMem()) {
4343     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4344     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4345     RegTy = S32;
4346   } else {
4347     unsigned EltSize = EltTy.getSizeInBits();
4348     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4349     unsigned RoundedSize = 32 * RoundedElts;
4350     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4351     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4352     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4353   }
4354 
4355   // The return type does not need adjustment.
4356   // TODO: Should we change s16 case to s32 or <2 x s16>?
4357   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4358     return true;
4359 
4360   Register Dst1Reg;
4361 
4362   // Insert after the instruction.
4363   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4364 
4365   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4366   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4367   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4368   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4369 
4370   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4371 
4372   MI.getOperand(0).setReg(NewResultReg);
4373 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
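  // For example, an s32 TFE load has two s32 results in the IR; it is
  // rewritten here to define a single <2 x s32> register, which is unmerged
  // into the data and status values below.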
4378 
4379   if (IsTFE) {
4380     Dst1Reg = MI.getOperand(1).getReg();
4381     if (MRI->getType(Dst1Reg) != S32)
4382       return false;
4383 
4384     // TODO: Make sure the TFE operand bit is set.
4385     MI.RemoveOperand(1);
4386 
4387     // Handle the easy case that requires no repack instructions.
4388     if (Ty == S32) {
4389       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4390       return true;
4391     }
4392   }
4393 
4394   // Now figure out how to copy the new result register back into the old
4395   // result.
4396   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4397 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4399 
4400   if (ResultNumRegs == 1) {
4401     assert(!IsTFE);
4402     ResultRegs[0] = NewResultReg;
4403   } else {
4404     // We have to repack into a new vector of some kind.
4405     for (int I = 0; I != NumDataRegs; ++I)
4406       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4407     B.buildUnmerge(ResultRegs, NewResultReg);
4408 
4409     // Drop the final TFE element to get the data part. The TFE result is
4410     // directly written to the right place already.
4411     if (IsTFE)
4412       ResultRegs.resize(NumDataRegs);
4413   }
4414 
4415   // For an s16 scalar result, we form an s32 result with a truncate regardless
4416   // of packed vs. unpacked.
4417   if (IsD16 && !Ty.isVector()) {
4418     B.buildTrunc(DstReg, ResultRegs[0]);
4419     return true;
4420   }
4421 
4422   // Avoid a build/concat_vector of 1 entry.
4423   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4424     B.buildBitcast(DstReg, ResultRegs[0]);
4425     return true;
4426   }
4427 
4428   assert(Ty.isVector());
4429 
4430   if (IsD16) {
4431     // For packed D16 results with TFE enabled, all the data components are
4432     // S32. Cast back to the expected type.
4433     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4436     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4437       for (Register &Reg : ResultRegs)
4438         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4439     } else if (ST.hasUnpackedD16VMem()) {
4440       for (Register &Reg : ResultRegs)
4441         Reg = B.buildTrunc(S16, Reg).getReg(0);
4442     }
4443   }
4444 
4445   auto padWithUndef = [&](LLT Ty, int NumElts) {
4446     if (NumElts == 0)
4447       return;
4448     Register Undef = B.buildUndef(Ty).getReg(0);
4449     for (int I = 0; I != NumElts; ++I)
4450       ResultRegs.push_back(Undef);
4451   };
4452 
4453   // Pad out any elements eliminated due to the dmask.
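  // e.g. if the dmask requests a single component but the result type is
  // <4 x s32>, the remaining three lanes are filled with G_IMPLICIT_DEF.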
4454   LLT ResTy = MRI->getType(ResultRegs[0]);
4455   if (!ResTy.isVector()) {
4456     padWithUndef(ResTy, NumElts - ResultRegs.size());
4457     B.buildBuildVector(DstReg, ResultRegs);
4458     return true;
4459   }
4460 
4461   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4462   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4463 
4464   // Deal with the one annoying legal case.
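  // A <3 x s16> result covers two dwords of data; pad the v2s16 pieces with
  // undef up to <6 x s16>, then unmerge that into the real destination plus a
  // dead <3 x s16> half.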
4465   const LLT V3S16 = LLT::vector(3, 16);
4466   if (Ty == V3S16) {
4467     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4468     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4469     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4470     return true;
4471   }
4472 
4473   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4474   B.buildConcatVectors(DstReg, ResultRegs);
4475   return true;
4476 }
4477 
4478 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4479   LegalizerHelper &Helper, MachineInstr &MI) const {
4480   MachineIRBuilder &B = Helper.MIRBuilder;
4481   GISelChangeObserver &Observer = Helper.Observer;
4482 
4483   Register Dst = MI.getOperand(0).getReg();
4484   LLT Ty = B.getMRI()->getType(Dst);
4485   unsigned Size = Ty.getSizeInBits();
4486   MachineFunction &MF = B.getMF();
4487 
4488   Observer.changingInstr(MI);
4489 
4490   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4491     Ty = getBitcastRegisterType(Ty);
4492     Helper.bitcastDst(MI, Ty, 0);
4493     Dst = MI.getOperand(0).getReg();
4494     B.setInsertPt(B.getMBB(), MI);
4495   }
4496 
4497   // FIXME: We don't really need this intermediate instruction. The intrinsic
4498   // should be fixed to have a memory operand. Since it's readnone, we're not
4499   // allowed to add one.
4500   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4501   MI.RemoveOperand(1); // Remove intrinsic ID
4502 
4503   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4504   // TODO: Should this use datalayout alignment?
4505   const unsigned MemSize = (Size + 7) / 8;
4506   const Align MemAlign(4);
4507   MachineMemOperand *MMO = MF.getMachineMemOperand(
4508       MachinePointerInfo(),
4509       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4510           MachineMemOperand::MOInvariant,
4511       MemSize, MemAlign);
4512   MI.addMemOperand(MF, MMO);
4513 
4514   // There are no 96-bit result scalar loads, but widening to 128-bit should
4515   // always be legal. We may need to restore this to a 96-bit result if it turns
4516   // out this needs to be converted to a vector load during RegBankSelect.
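  // e.g. an s96 result is widened to s128, and a <3 x s32> result to
  // <4 x s32>.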
4517   if (!isPowerOf2_32(Size)) {
4518     if (Ty.isVector())
4519       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4520     else
4521       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4522   }
4523 
4524   Observer.changedInstr(MI);
4525   return true;
4526 }
4527 
4528 // TODO: Move to selection
4529 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4530                                                 MachineRegisterInfo &MRI,
4531                                                 MachineIRBuilder &B) const {
4532   if (!ST.isTrapHandlerEnabled() ||
4533       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4534     return legalizeTrapEndpgm(MI, MRI, B);
4535 
4536   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4537     switch (*HsaAbiVer) {
4538     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4539     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4540       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4541     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4542       return ST.supportsGetDoorbellID() ?
4543           legalizeTrapHsa(MI, MRI, B) :
4544           legalizeTrapHsaQueuePtr(MI, MRI, B);
4545     }
4546   }
4547 
4548   llvm_unreachable("Unknown trap handler");
4549 }
4550 
4551 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4552     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4553   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4554   MI.eraseFromParent();
4555   return true;
4556 }
4557 
4558 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4559     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input and insert the
  // trap instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
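  // The ABI expects the queue pointer in SGPR0/SGPR1, so copy the loaded
  // value into that physical register pair and pass it as an implicit use of
  // S_TRAP.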
4562   Register LiveIn =
4563     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4564   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4565     return false;
4566 
4567   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4568   B.buildCopy(SGPR01, LiveIn);
4569   B.buildInstr(AMDGPU::S_TRAP)
4570       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4571       .addReg(SGPR01, RegState::Implicit);
4572 
4573   MI.eraseFromParent();
4574   return true;
4575 }
4576 
4577 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4578     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4579   B.buildInstr(AMDGPU::S_TRAP)
4580       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4581   MI.eraseFromParent();
4582   return true;
4583 }
4584 
4585 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4586     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning instead of emitting a trap.
4589   if (!ST.isTrapHandlerEnabled() ||
4590       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4591     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4592                                      "debugtrap handler not supported",
4593                                      MI.getDebugLoc(), DS_Warning);
4594     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4595     Ctx.diagnose(NoTrap);
4596   } else {
4597     // Insert debug-trap instruction
4598     B.buildInstr(AMDGPU::S_TRAP)
4599         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4600   }
4601 
4602   MI.eraseFromParent();
4603   return true;
4604 }
4605 
4606 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4607                                                MachineIRBuilder &B) const {
4608   MachineRegisterInfo &MRI = *B.getMRI();
4609   const LLT S16 = LLT::scalar(16);
4610   const LLT S32 = LLT::scalar(32);
4611 
4612   Register DstReg = MI.getOperand(0).getReg();
4613   Register NodePtr = MI.getOperand(2).getReg();
4614   Register RayExtent = MI.getOperand(3).getReg();
4615   Register RayOrigin = MI.getOperand(4).getReg();
4616   Register RayDir = MI.getOperand(5).getReg();
4617   Register RayInvDir = MI.getOperand(6).getReg();
4618   Register TDescr = MI.getOperand(7).getReg();
4619 
4620   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4622   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4623                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4624                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4625                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4626 
4627   SmallVector<Register, 12> Ops;
4628   if (Is64) {
4629     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4630     Ops.push_back(Unmerge.getReg(0));
4631     Ops.push_back(Unmerge.getReg(1));
4632   } else {
4633     Ops.push_back(NodePtr);
4634   }
4635   Ops.push_back(RayExtent);
4636 
4637   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4638     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4639     Ops.push_back(Unmerge.getReg(0));
4640     Ops.push_back(Unmerge.getReg(1));
4641     Ops.push_back(Unmerge.getReg(2));
4642   };
4643 
4644   packLanes(RayOrigin);
4645   if (IsA16) {
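    // With 16-bit ray components, the direction and inverse direction share
    // three dwords: (dir.x, dir.y), (dir.z, inv.x) and (inv.y, inv.z).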
4646     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4647     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4648     Register R1 = MRI.createGenericVirtualRegister(S32);
4649     Register R2 = MRI.createGenericVirtualRegister(S32);
4650     Register R3 = MRI.createGenericVirtualRegister(S32);
4651     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4652     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4653     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4654     Ops.push_back(R1);
4655     Ops.push_back(R2);
4656     Ops.push_back(R3);
4657   } else {
4658     packLanes(RayDir);
4659     packLanes(RayInvDir);
4660   }
4661 
4662   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4663     .addDef(DstReg)
4664     .addImm(Opcode);
4665 
4666   for (Register R : Ops) {
4667     MIB.addUse(R);
4668   }
4669 
4670   MIB.addUse(TDescr)
4671      .addImm(IsA16 ? 1 : 0)
4672      .cloneMemRefs(MI);
4673 
4674   MI.eraseFromParent();
4675   return true;
4676 }
4677 
4678 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4679                                             MachineInstr &MI) const {
4680   MachineIRBuilder &B = Helper.MIRBuilder;
4681   MachineRegisterInfo &MRI = *B.getMRI();
4682 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
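  // For amdgcn.if/else the conditional branch becomes a single SI_IF/SI_ELSE
  // pseudo that defines the wave mask, updates exec and branches; amdgcn.loop
  // becomes SI_LOOP.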
4684   auto IntrID = MI.getIntrinsicID();
4685   switch (IntrID) {
4686   case Intrinsic::amdgcn_if:
4687   case Intrinsic::amdgcn_else: {
4688     MachineInstr *Br = nullptr;
4689     MachineBasicBlock *UncondBrTarget = nullptr;
4690     bool Negated = false;
4691     if (MachineInstr *BrCond =
4692             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4693       const SIRegisterInfo *TRI
4694         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4695 
4696       Register Def = MI.getOperand(1).getReg();
4697       Register Use = MI.getOperand(3).getReg();
4698 
4699       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4700 
4701       if (Negated)
4702         std::swap(CondBrTarget, UncondBrTarget);
4703 
4704       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4705       if (IntrID == Intrinsic::amdgcn_if) {
4706         B.buildInstr(AMDGPU::SI_IF)
4707           .addDef(Def)
4708           .addUse(Use)
4709           .addMBB(UncondBrTarget);
4710       } else {
4711         B.buildInstr(AMDGPU::SI_ELSE)
4712             .addDef(Def)
4713             .addUse(Use)
4714             .addMBB(UncondBrTarget);
4715       }
4716 
4717       if (Br) {
4718         Br->getOperand(0).setMBB(CondBrTarget);
4719       } else {
4720         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4721         // since we're swapping branch targets it needs to be reinserted.
4722         // FIXME: IRTranslator should probably not do this
4723         B.buildBr(*CondBrTarget);
4724       }
4725 
4726       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4727       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4728       MI.eraseFromParent();
4729       BrCond->eraseFromParent();
4730       return true;
4731     }
4732 
4733     return false;
4734   }
4735   case Intrinsic::amdgcn_loop: {
4736     MachineInstr *Br = nullptr;
4737     MachineBasicBlock *UncondBrTarget = nullptr;
4738     bool Negated = false;
4739     if (MachineInstr *BrCond =
4740             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4741       const SIRegisterInfo *TRI
4742         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4743 
4744       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4745       Register Reg = MI.getOperand(2).getReg();
4746 
4747       if (Negated)
4748         std::swap(CondBrTarget, UncondBrTarget);
4749 
4750       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4751       B.buildInstr(AMDGPU::SI_LOOP)
4752         .addUse(Reg)
4753         .addMBB(UncondBrTarget);
4754 
4755       if (Br)
4756         Br->getOperand(0).setMBB(CondBrTarget);
4757       else
4758         B.buildBr(*CondBrTarget);
4759 
4760       MI.eraseFromParent();
4761       BrCond->eraseFromParent();
4762       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4763       return true;
4764     }
4765 
4766     return false;
4767   }
4768   case Intrinsic::amdgcn_kernarg_segment_ptr:
4769     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4770       // This only makes sense to call in a kernel, so just lower to null.
4771       B.buildConstant(MI.getOperand(0).getReg(), 0);
4772       MI.eraseFromParent();
4773       return true;
4774     }
4775 
4776     return legalizePreloadedArgIntrin(
4777       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4778   case Intrinsic::amdgcn_implicitarg_ptr:
4779     return legalizeImplicitArgPtr(MI, MRI, B);
4780   case Intrinsic::amdgcn_workitem_id_x:
4781     return legalizePreloadedArgIntrin(MI, MRI, B,
4782                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4783   case Intrinsic::amdgcn_workitem_id_y:
4784     return legalizePreloadedArgIntrin(MI, MRI, B,
4785                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4786   case Intrinsic::amdgcn_workitem_id_z:
4787     return legalizePreloadedArgIntrin(MI, MRI, B,
4788                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4789   case Intrinsic::amdgcn_workgroup_id_x:
4790     return legalizePreloadedArgIntrin(MI, MRI, B,
4791                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4792   case Intrinsic::amdgcn_workgroup_id_y:
4793     return legalizePreloadedArgIntrin(MI, MRI, B,
4794                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4795   case Intrinsic::amdgcn_workgroup_id_z:
4796     return legalizePreloadedArgIntrin(MI, MRI, B,
4797                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4798   case Intrinsic::amdgcn_dispatch_ptr:
4799     return legalizePreloadedArgIntrin(MI, MRI, B,
4800                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4801   case Intrinsic::amdgcn_queue_ptr:
4802     return legalizePreloadedArgIntrin(MI, MRI, B,
4803                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4804   case Intrinsic::amdgcn_implicit_buffer_ptr:
4805     return legalizePreloadedArgIntrin(
4806       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4807   case Intrinsic::amdgcn_dispatch_id:
4808     return legalizePreloadedArgIntrin(MI, MRI, B,
4809                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4810   case Intrinsic::amdgcn_fdiv_fast:
4811     return legalizeFDIVFastIntrin(MI, MRI, B);
4812   case Intrinsic::amdgcn_is_shared:
4813     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4814   case Intrinsic::amdgcn_is_private:
4815     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4816   case Intrinsic::amdgcn_wavefrontsize: {
4817     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4818     MI.eraseFromParent();
4819     return true;
4820   }
4821   case Intrinsic::amdgcn_s_buffer_load:
4822     return legalizeSBufferLoad(Helper, MI);
4823   case Intrinsic::amdgcn_raw_buffer_store:
4824   case Intrinsic::amdgcn_struct_buffer_store:
4825     return legalizeBufferStore(MI, MRI, B, false, false);
4826   case Intrinsic::amdgcn_raw_buffer_store_format:
4827   case Intrinsic::amdgcn_struct_buffer_store_format:
4828     return legalizeBufferStore(MI, MRI, B, false, true);
4829   case Intrinsic::amdgcn_raw_tbuffer_store:
4830   case Intrinsic::amdgcn_struct_tbuffer_store:
4831     return legalizeBufferStore(MI, MRI, B, true, true);
4832   case Intrinsic::amdgcn_raw_buffer_load:
4833   case Intrinsic::amdgcn_struct_buffer_load:
4834     return legalizeBufferLoad(MI, MRI, B, false, false);
4835   case Intrinsic::amdgcn_raw_buffer_load_format:
4836   case Intrinsic::amdgcn_struct_buffer_load_format:
4837     return legalizeBufferLoad(MI, MRI, B, true, false);
4838   case Intrinsic::amdgcn_raw_tbuffer_load:
4839   case Intrinsic::amdgcn_struct_tbuffer_load:
4840     return legalizeBufferLoad(MI, MRI, B, true, true);
4841   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4842   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4843   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4844   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4845   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4846   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4847   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4848   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4849   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4850   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4851   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4852   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4853   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4854   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4855   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4856   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4857   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4858   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4859   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4860   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4861   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4862   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4863   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4864   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4865   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4866   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4867   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4868   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4869   case Intrinsic::amdgcn_buffer_atomic_fadd:
4870   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4871   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4872   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4873   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4874     return legalizeBufferAtomic(MI, B, IntrID);
4875   case Intrinsic::amdgcn_atomic_inc:
4876     return legalizeAtomicIncDec(MI, B, true);
4877   case Intrinsic::amdgcn_atomic_dec:
4878     return legalizeAtomicIncDec(MI, B, false);
4879   case Intrinsic::trap:
4880     return legalizeTrapIntrinsic(MI, MRI, B);
4881   case Intrinsic::debugtrap:
4882     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4883   case Intrinsic::amdgcn_rsq_clamp:
4884     return legalizeRsqClampIntrinsic(MI, MRI, B);
4885   case Intrinsic::amdgcn_ds_fadd:
4886   case Intrinsic::amdgcn_ds_fmin:
4887   case Intrinsic::amdgcn_ds_fmax:
4888     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4889   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4890     return legalizeBVHIntrinsic(MI, B);
4891   default: {
4892     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4893             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4894       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4895     return true;
4896   }
4897   }
4898 
4899   return true;
4900 }
4901