1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
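// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.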
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits to the next power of two bits
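// e.g. s48 -> s64, s96 -> s128.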
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
113 // Increase the number of vector elements so that the total size reaches the
114 // next multiple of 32 bits.
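// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits).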
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
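  // e.g. <6 x s16> (96 bits) -> <3 x s32>, <2 x s64> (128 bits) -> <4 x s32>.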
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 // If we have a truncating store or an extending load with a data size larger
226 // than 32-bits, we need to reduce to a 32-bit type.
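// e.g. an s64 load of a 16-bit memory value, or an s64 store truncated to a
// 16-bit memory access, is narrowed to s32 first.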
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249     // Treat constant and global as identical. SMRD loads are sometimes usable for
250     // global loads (ideally constant address space should be eliminated)
251     // depending on the context. Legality cannot be context dependent, but
252     // RegBankSelect can split the load as necessary depending on the pointer
253     // register bank/uniformity and if the memory is invariant or not written in a
254     // kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // Do not handle extending vector loads.
280   if (Ty.isVector() && MemSize != RegSize)
281     return false;
282 
283   // TODO: We should be able to widen loads if the alignment is high enough, but
284   // we also need to modify the memory access size.
285 #if 0
286   // Accept widening loads based on alignment.
287   if (IsLoad && MemSize < RegSize)
288     MemSize = std::max(MemSize, AlignBits);
289 #endif
290 
291   // Only 1-byte and 2-byte to 32-bit extloads are valid.
292   if (MemSize != RegSize && RegSize != 32)
293     return false;
294 
295   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296     return false;
297 
298   switch (MemSize) {
299   case 8:
300   case 16:
301   case 32:
302   case 64:
303   case 128:
304     break;
305   case 96:
306     if (!ST.hasDwordx3LoadStores())
307       return false;
308     break;
309   case 256:
310   case 512:
311     // These may contextually need to be broken down.
312     break;
313   default:
314     return false;
315   }
316 
317   assert(RegSize >= MemSize);
318 
319   if (AlignBits < MemSize) {
320     const SITargetLowering *TLI = ST.getTargetLowering();
321     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322                                                  Align(AlignBits / 8)))
323       return false;
324   }
325 
326   return true;
327 }
328 
329 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
330 // work around this. Eventually it should ignore the type for loads and only
331 // care about the size. Return true in cases where we will work around this for
332 // now by bitcasting.
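// e.g. an <8 x s16> load is handled as a <4 x s32> load plus a bitcast.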
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334   if (EnableNewLegality)
335     return false;
336 
337   const unsigned Size = Ty.getSizeInBits();
338   if (Size <= 64)
339     return false;
340   if (!Ty.isVector())
341     return true;
342 
343   LLT EltTy = Ty.getElementType();
344   if (EltTy.isPointer())
345     return true;
346 
347   unsigned EltSize = EltTy.getSizeInBits();
348   return EltSize != 32 && EltSize != 64;
349 }
350 
351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352   const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360                                        const unsigned MemSizeInBits) {
361   const unsigned Size = Ty.getSizeInBits();
362   if (Size != MemSizeInBits)
363     return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
371 /// Return true if we should legalize a load by widening an odd sized memory
372 /// access up to the alignment. Note this is the case where the memory access
373 /// itself changes, not the size of the result register.
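// e.g. on subtargets without 96-bit memory operations, a 96-bit load that is
// known to be 128-bit aligned can be widened to a 128-bit load.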
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375                             unsigned AlignInBits, unsigned AddrSpace,
376                             unsigned Opcode) {
377   // We don't want to widen cases that are naturally legal.
378   if (isPowerOf2_32(SizeInBits))
379     return false;
380 
381   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382   // end up widening these for a scalar load during RegBankSelect, since there
383   // aren't 96-bit scalar loads.
384   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385     return false;
386 
387   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388     return false;
389 
390   // A load is known dereferenceable up to the alignment, so it's legal to widen
391   // to it.
392   //
393   // TODO: Could check dereferenceable for less aligned cases.
394   unsigned RoundedSize = NextPowerOf2(SizeInBits);
395   if (AlignInBits < RoundedSize)
396     return false;
397 
398   // Do not widen if it would introduce a slow unaligned load.
399   const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402              RoundedSize, AddrSpace, Align(AlignInBits / 8),
403              MachineMemOperand::MOLoad, &Fast) &&
404          Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408                             unsigned Opcode) {
409   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410     return false;
411 
412   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413                          Query.MMODescrs[0].AlignInBits,
414                          Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
419   :  ST(ST_) {
420   using namespace TargetOpcode;
421 
422   auto GetAddrSpacePtr = [&TM](unsigned AS) {
423     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424   };
425 
426   const LLT S1 = LLT::scalar(1);
427   const LLT S8 = LLT::scalar(8);
428   const LLT S16 = LLT::scalar(16);
429   const LLT S32 = LLT::scalar(32);
430   const LLT S64 = LLT::scalar(64);
431   const LLT S128 = LLT::scalar(128);
432   const LLT S256 = LLT::scalar(256);
433   const LLT S512 = LLT::scalar(512);
434   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436   const LLT V2S8 = LLT::vector(2, 8);
437   const LLT V2S16 = LLT::vector(2, 16);
438   const LLT V4S16 = LLT::vector(4, 16);
439 
440   const LLT V2S32 = LLT::vector(2, 32);
441   const LLT V3S32 = LLT::vector(3, 32);
442   const LLT V4S32 = LLT::vector(4, 32);
443   const LLT V5S32 = LLT::vector(5, 32);
444   const LLT V6S32 = LLT::vector(6, 32);
445   const LLT V7S32 = LLT::vector(7, 32);
446   const LLT V8S32 = LLT::vector(8, 32);
447   const LLT V9S32 = LLT::vector(9, 32);
448   const LLT V10S32 = LLT::vector(10, 32);
449   const LLT V11S32 = LLT::vector(11, 32);
450   const LLT V12S32 = LLT::vector(12, 32);
451   const LLT V13S32 = LLT::vector(13, 32);
452   const LLT V14S32 = LLT::vector(14, 32);
453   const LLT V15S32 = LLT::vector(15, 32);
454   const LLT V16S32 = LLT::vector(16, 32);
455   const LLT V32S32 = LLT::vector(32, 32);
456 
457   const LLT V2S64 = LLT::vector(2, 64);
458   const LLT V3S64 = LLT::vector(3, 64);
459   const LLT V4S64 = LLT::vector(4, 64);
460   const LLT V5S64 = LLT::vector(5, 64);
461   const LLT V6S64 = LLT::vector(6, 64);
462   const LLT V7S64 = LLT::vector(7, 64);
463   const LLT V8S64 = LLT::vector(8, 64);
464   const LLT V16S64 = LLT::vector(16, 64);
465 
466   std::initializer_list<LLT> AllS32Vectors =
467     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469   std::initializer_list<LLT> AllS64Vectors =
470     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
480   const LLT CodePtr = FlatPtr;
481 
482   const std::initializer_list<LLT> AddrSpaces64 = {
483     GlobalPtr, ConstantPtr, FlatPtr
484   };
485 
486   const std::initializer_list<LLT> AddrSpaces32 = {
487     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488   };
489 
490   const std::initializer_list<LLT> FPTypesBase = {
491     S32, S64
492   };
493 
494   const std::initializer_list<LLT> FPTypes16 = {
495     S32, S64, S16
496   };
497 
498   const std::initializer_list<LLT> FPTypesPK16 = {
499     S32, S64, S16, V2S16
500   };
501 
502   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504   // s1 for VCC branches, s32 for SCC branches.
505   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
506 
507   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
508   // elements for v3s16
509   getActionDefinitionsBuilder(G_PHI)
510     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
511     .legalFor(AllS32Vectors)
512     .legalFor(AllS64Vectors)
513     .legalFor(AddrSpaces64)
514     .legalFor(AddrSpaces32)
515     .legalIf(isPointer(0))
516     .clampScalar(0, S16, S256)
517     .widenScalarToNextPow2(0, 32)
518     .clampMaxNumElements(0, S32, 16)
519     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
520     .scalarize(0);
521 
522   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
523     // Full set of gfx9 features.
524     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
525       .legalFor({S32, S16, V2S16})
526       .clampScalar(0, S16, S32)
527       .clampMaxNumElements(0, S16, 2)
528       .scalarize(0)
529       .widenScalarToNextPow2(0, 32);
530 
531     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
532       .legalFor({S32, S16, V2S16}) // Clamp modifier
533       .minScalarOrElt(0, S16)
534       .clampMaxNumElements(0, S16, 2)
535       .scalarize(0)
536       .widenScalarToNextPow2(0, 32)
537       .lower();
538   } else if (ST.has16BitInsts()) {
539     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
540       .legalFor({S32, S16})
541       .clampScalar(0, S16, S32)
542       .scalarize(0)
543       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
544 
545     // Technically the saturating operations require clamp bit support, but this
546     // was introduced at the same time as 16-bit operations.
547     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
548       .legalFor({S32, S16}) // Clamp modifier
549       .minScalar(0, S16)
550       .scalarize(0)
551       .widenScalarToNextPow2(0, 16)
552       .lower();
553 
554     // We're just lowering this, but it helps get a better result to try to
555     // coerce to the desired type first.
556     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
557       .minScalar(0, S16)
558       .scalarize(0)
559       .lower();
560   } else {
561     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
562       .legalFor({S32})
563       .clampScalar(0, S32, S32)
564       .scalarize(0);
565 
566     if (ST.hasIntClamp()) {
567       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
568         .legalFor({S32}) // Clamp modifier.
569         .scalarize(0)
570         .minScalarOrElt(0, S32)
571         .lower();
572     } else {
573       // Clamp bit support was added in VI, along with 16-bit operations.
574       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
575         .minScalar(0, S32)
576         .scalarize(0)
577         .lower();
578     }
579 
580     // FIXME: DAG expansion gets better results. The widening uses the smaller
581     // range values and goes for the min/max lowering directly.
582     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
583       .minScalar(0, S32)
584       .scalarize(0)
585       .lower();
586   }
587 
588   getActionDefinitionsBuilder(
589       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
590       .customFor({S32, S64})
591       .clampScalar(0, S32, S64)
592       .widenScalarToNextPow2(0, 32)
593       .scalarize(0);
594 
595   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
596                    .legalFor({S32})
597                    .maxScalarOrElt(0, S32);
598 
599   if (ST.hasVOP3PInsts()) {
600     Mulh
601       .clampMaxNumElements(0, S8, 2)
602       .lowerFor({V2S8});
603   }
604 
605   Mulh
606     .scalarize(0)
607     .lower();
608 
609   // Report legal for any types we can handle anywhere. For the cases only legal
610   // on the SALU, RegBankSelect will be able to re-legalize.
611   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
612     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
613     .clampScalar(0, S32, S64)
614     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
615     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
616     .widenScalarToNextPow2(0)
617     .scalarize(0);
618 
619   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
620                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
621     .legalFor({{S32, S1}, {S32, S32}})
622     .minScalar(0, S32)
623     // TODO: .scalarize(0)
624     .lower();
625 
626   getActionDefinitionsBuilder(G_BITCAST)
627     // Don't worry about the size constraint.
628     .legalIf(all(isRegisterType(0), isRegisterType(1)))
629     .lower();
630 
631 
632   getActionDefinitionsBuilder(G_CONSTANT)
633     .legalFor({S1, S32, S64, S16, GlobalPtr,
634                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
635     .legalIf(isPointer(0))
636     .clampScalar(0, S32, S64)
637     .widenScalarToNextPow2(0);
638 
639   getActionDefinitionsBuilder(G_FCONSTANT)
640     .legalFor({S32, S64, S16})
641     .clampScalar(0, S16, S64);
642 
643   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
644       .legalIf(isRegisterType(0))
645       // s1 and s16 are special cases because they have legal operations on
646       // them, but don't really occupy registers in the normal way.
647       .legalFor({S1, S16})
648       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
649       .clampScalarOrElt(0, S32, MaxScalar)
650       .widenScalarToNextPow2(0, 32)
651       .clampMaxNumElements(0, S32, 16);
652 
653   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
654 
655   // If the amount is divergent, we have to do a wave reduction to get the
656   // maximum value, so this is expanded during RegBankSelect.
657   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
658     .legalFor({{PrivatePtr, S32}});
659 
660   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
661     .customIf(typeIsNot(0, PrivatePtr));
662 
663   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
664 
665   auto &FPOpActions = getActionDefinitionsBuilder(
666     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
667     .legalFor({S32, S64});
668   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
669     .customFor({S32, S64});
670   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
671     .customFor({S32, S64});
672 
673   if (ST.has16BitInsts()) {
674     if (ST.hasVOP3PInsts())
675       FPOpActions.legalFor({S16, V2S16});
676     else
677       FPOpActions.legalFor({S16});
678 
679     TrigActions.customFor({S16});
680     FDIVActions.customFor({S16});
681   }
682 
683   auto &MinNumMaxNum = getActionDefinitionsBuilder({
684       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
685 
686   if (ST.hasVOP3PInsts()) {
687     MinNumMaxNum.customFor(FPTypesPK16)
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampMaxNumElements(0, S16, 2)
690       .clampScalar(0, S16, S64)
691       .scalarize(0);
692   } else if (ST.has16BitInsts()) {
693     MinNumMaxNum.customFor(FPTypes16)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else {
697     MinNumMaxNum.customFor(FPTypesBase)
698       .clampScalar(0, S32, S64)
699       .scalarize(0);
700   }
701 
702   if (ST.hasVOP3PInsts())
703     FPOpActions.clampMaxNumElements(0, S16, 2);
704 
705   FPOpActions
706     .scalarize(0)
707     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
708 
709   TrigActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   FDIVActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   getActionDefinitionsBuilder({G_FNEG, G_FABS})
718     .legalFor(FPTypesPK16)
719     .clampMaxNumElements(0, S16, 2)
720     .scalarize(0)
721     .clampScalar(0, S16, S64);
722 
723   if (ST.has16BitInsts()) {
724     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
725       .legalFor({S32, S64, S16})
726       .scalarize(0)
727       .clampScalar(0, S16, S64);
728   } else {
729     getActionDefinitionsBuilder(G_FSQRT)
730       .legalFor({S32, S64})
731       .scalarize(0)
732       .clampScalar(0, S32, S64);
733 
734     if (ST.hasFractBug()) {
735       getActionDefinitionsBuilder(G_FFLOOR)
736         .customFor({S64})
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     } else {
741       getActionDefinitionsBuilder(G_FFLOOR)
742         .legalFor({S32, S64})
743         .scalarize(0)
744         .clampScalar(0, S32, S64);
745     }
746   }
747 
748   getActionDefinitionsBuilder(G_FPTRUNC)
749     .legalFor({{S32, S64}, {S16, S32}})
750     .scalarize(0)
751     .lower();
752 
753   getActionDefinitionsBuilder(G_FPEXT)
754     .legalFor({{S64, S32}, {S32, S16}})
755     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
756     .scalarize(0);
757 
758   getActionDefinitionsBuilder(G_FSUB)
759       // Use actual fsub instruction
760       .legalFor({S32})
761       // Must use fadd + fneg
762       .lowerFor({S64, S16, V2S16})
763       .scalarize(0)
764       .clampScalar(0, S32, S64);
765 
766   // Whether this is legal depends on the floating point mode for the function.
767   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
768   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
769     FMad.customFor({S32, S16});
770   else if (ST.hasMadMacF32Insts())
771     FMad.customFor({S32});
772   else if (ST.hasMadF16())
773     FMad.customFor({S16});
774   FMad.scalarize(0)
775       .lower();
776 
777   auto &FRem = getActionDefinitionsBuilder(G_FREM);
778   if (ST.has16BitInsts()) {
779     FRem.customFor({S16, S32, S64});
780   } else {
781     FRem.minScalar(0, S32)
782         .customFor({S32, S64});
783   }
784   FRem.scalarize(0);
785 
786   // TODO: Do we need to clamp maximum bitwidth?
787   getActionDefinitionsBuilder(G_TRUNC)
788     .legalIf(isScalar(0))
789     .legalFor({{V2S16, V2S32}})
790     .clampMaxNumElements(0, S16, 2)
791     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
792     // situations (like an invalid implicit use), we don't want to infinite loop
793     // in the legalizer.
794     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
795     .alwaysLegal();
796 
797   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
798     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
799                {S32, S1}, {S64, S1}, {S16, S1}})
800     .scalarize(0)
801     .clampScalar(0, S32, S64)
802     .widenScalarToNextPow2(1, 32);
803 
804   // TODO: Split s1->s64 during regbankselect for VALU.
805   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
806     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
807     .lowerFor({{S32, S64}})
808     .lowerIf(typeIs(1, S1))
809     .customFor({{S64, S64}});
810   if (ST.has16BitInsts())
811     IToFP.legalFor({{S16, S16}});
812   IToFP.clampScalar(1, S32, S64)
813        .minScalar(0, S32)
814        .scalarize(0)
815        .widenScalarToNextPow2(1);
816 
817   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
818     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
819     .customFor({{S64, S32}, {S64, S64}})
820     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
821   if (ST.has16BitInsts())
822     FPToI.legalFor({{S16, S16}});
823   else
824     FPToI.minScalar(1, S32);
825 
826   FPToI.minScalar(0, S32)
827        .widenScalarToNextPow2(0, 32)
828        .scalarize(0)
829        .lower();
830 
831   // Lower roundeven into G_FRINT
832   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
833     .scalarize(0)
834     .lower();
835 
836   if (ST.has16BitInsts()) {
837     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838       .legalFor({S16, S32, S64})
839       .clampScalar(0, S16, S64)
840       .scalarize(0);
841   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
842     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843       .legalFor({S32, S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   } else {
847     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
848       .legalFor({S32})
849       .customFor({S64})
850       .clampScalar(0, S32, S64)
851       .scalarize(0);
852   }
853 
854   getActionDefinitionsBuilder(G_PTR_ADD)
855     .legalIf(all(isPointer(0), sameSize(0, 1)))
856     .scalarize(0)
857     .scalarSameSizeAs(1, 0);
858 
859   getActionDefinitionsBuilder(G_PTRMASK)
860     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
861     .scalarSameSizeAs(1, 0)
862     .scalarize(0);
863 
864   auto &CmpBuilder =
865     getActionDefinitionsBuilder(G_ICMP)
866     // The compare output type differs based on the register bank of the output,
867     // so make both s1 and s32 legal.
868     //
869     // Scalar compares producing output in scc will be promoted to s32, as that
870     // is the allocatable register type that will be needed for the copy from
871     // scc. This will be promoted during RegBankSelect, and we assume something
872     // before that won't try to use s32 result types.
873     //
874     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
875     // bank.
876     .legalForCartesianProduct(
877       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
878     .legalForCartesianProduct(
879       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
880   if (ST.has16BitInsts()) {
881     CmpBuilder.legalFor({{S1, S16}});
882   }
883 
884   CmpBuilder
885     .widenScalarToNextPow2(1)
886     .clampScalar(1, S32, S64)
887     .scalarize(0)
888     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
889 
890   getActionDefinitionsBuilder(G_FCMP)
891     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
892     .widenScalarToNextPow2(1)
893     .clampScalar(1, S32, S64)
894     .scalarize(0);
895 
896   // FIXME: fpow has a selection pattern that should move to custom lowering.
897   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
898   if (ST.has16BitInsts())
899     Exp2Ops.legalFor({S32, S16});
900   else
901     Exp2Ops.legalFor({S32});
902   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
903   Exp2Ops.scalarize(0);
904 
905   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
906   if (ST.has16BitInsts())
907     ExpOps.customFor({{S32}, {S16}});
908   else
909     ExpOps.customFor({S32});
910   ExpOps.clampScalar(0, MinScalarFPTy, S32)
911         .scalarize(0);
912 
913   getActionDefinitionsBuilder(G_FPOWI)
914     .clampScalar(0, MinScalarFPTy, S32)
915     .lower();
916 
917   // The 64-bit versions produce 32-bit results, but only on the SALU.
918   getActionDefinitionsBuilder(G_CTPOP)
919     .legalFor({{S32, S32}, {S32, S64}})
920     .clampScalar(0, S32, S32)
921     .clampScalar(1, S32, S64)
922     .scalarize(0)
923     .widenScalarToNextPow2(0, 32)
924     .widenScalarToNextPow2(1, 32);
925 
926   // The hardware instructions return a different result on 0 than the generic
927   // instructions expect. The hardware produces -1, but these produce the
928   // bitwidth.
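// e.g. an s32 G_CTLZ of zero must produce 32, while the hardware produces -1.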
929   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
930     .scalarize(0)
931     .clampScalar(0, S32, S32)
932     .clampScalar(1, S32, S64)
933     .widenScalarToNextPow2(0, 32)
934     .widenScalarToNextPow2(1, 32)
935     .lower();
936 
937   // The 64-bit versions produce 32-bit results, but only on the SALU.
938   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
939     .legalFor({{S32, S32}, {S32, S64}})
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .scalarize(0)
943     .widenScalarToNextPow2(0, 32)
944     .widenScalarToNextPow2(1, 32);
945 
946   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
947   // RegBankSelect.
948   getActionDefinitionsBuilder(G_BITREVERSE)
949     .legalFor({S32, S64})
950     .clampScalar(0, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0);
953 
954   if (ST.has16BitInsts()) {
955     getActionDefinitionsBuilder(G_BSWAP)
956       .legalFor({S16, S32, V2S16})
957       .clampMaxNumElements(0, S16, 2)
958       // FIXME: Fixing non-power-of-2 before clamp is workaround for
959       // narrowScalar limitation.
960       .widenScalarToNextPow2(0)
961       .clampScalar(0, S16, S32)
962       .scalarize(0);
963 
964     if (ST.hasVOP3PInsts()) {
965       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
966         .legalFor({S32, S16, V2S16})
967         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
968         .clampMaxNumElements(0, S16, 2)
969         .minScalar(0, S16)
970         .widenScalarToNextPow2(0)
971         .scalarize(0)
972         .lower();
973     } else {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975         .legalFor({S32, S16})
976         .widenScalarToNextPow2(0)
977         .minScalar(0, S16)
978         .scalarize(0)
979         .lower();
980     }
981   } else {
982     // TODO: Should have same legality without v_perm_b32
983     getActionDefinitionsBuilder(G_BSWAP)
984       .legalFor({S32})
985       .lowerIf(scalarNarrowerThan(0, 32))
986       // FIXME: Fixing non-power-of-2 before clamp is workaround for
987       // narrowScalar limitation.
988       .widenScalarToNextPow2(0)
989       .maxScalar(0, S32)
990       .scalarize(0)
991       .lower();
992 
993     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
994       .legalFor({S32})
995       .minScalar(0, S32)
996       .widenScalarToNextPow2(0)
997       .scalarize(0)
998       .lower();
999   }
1000 
1001   getActionDefinitionsBuilder(G_INTTOPTR)
1002     // List the common cases
1003     .legalForCartesianProduct(AddrSpaces64, {S64})
1004     .legalForCartesianProduct(AddrSpaces32, {S32})
1005     .scalarize(0)
1006     // Accept any address space as long as the size matches
1007     .legalIf(sameSize(0, 1))
1008     .widenScalarIf(smallerThan(1, 0),
1009       [](const LegalityQuery &Query) {
1010         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1011       })
1012     .narrowScalarIf(largerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       });
1016 
1017   getActionDefinitionsBuilder(G_PTRTOINT)
1018     // List the common cases
1019     .legalForCartesianProduct(AddrSpaces64, {S64})
1020     .legalForCartesianProduct(AddrSpaces32, {S32})
1021     .scalarize(0)
1022     // Accept any address space as long as the size matches
1023     .legalIf(sameSize(0, 1))
1024     .widenScalarIf(smallerThan(0, 1),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027       })
1028     .narrowScalarIf(
1029       largerThan(0, 1),
1030       [](const LegalityQuery &Query) {
1031         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1032       });
1033 
1034   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1035     .scalarize(0)
1036     .custom();
1037 
1038   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1039                                     bool IsLoad) -> bool {
1040     const LLT DstTy = Query.Types[0];
1041 
1042     // Split vector extloads.
1043     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1044     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1045 
1046     if (MemSize < DstTy.getSizeInBits())
1047       MemSize = std::max(MemSize, AlignBits);
1048 
1049     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1050       return true;
1051 
1052     const LLT PtrTy = Query.Types[1];
1053     unsigned AS = PtrTy.getAddressSpace();
1054     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1055       return true;
1056 
1057     // Catch weird sized loads that don't evenly divide into the access sizes
1058     // TODO: May be able to widen depending on alignment etc.
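    // e.g. a 96-bit access needs dwordx3 support, and a 160-bit access (five
    // dwords) is not a power-of-2 number of registers and must be split.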
1059     unsigned NumRegs = (MemSize + 31) / 32;
1060     if (NumRegs == 3) {
1061       if (!ST.hasDwordx3LoadStores())
1062         return true;
1063     } else {
1064       // If the alignment allows, these should have been widened.
1065       if (!isPowerOf2_32(NumRegs))
1066         return true;
1067     }
1068 
1069     if (AlignBits < MemSize) {
1070       const SITargetLowering *TLI = ST.getTargetLowering();
1071       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1072                                                       Align(AlignBits / 8));
1073     }
1074 
1075     return false;
1076   };
1077 
1078   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1079   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1080   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1081 
1082   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1083   // LDS
1084   // TODO: Unsupported flat for SI.
1085 
1086   for (unsigned Op : {G_LOAD, G_STORE}) {
1087     const bool IsStore = Op == G_STORE;
1088 
1089     auto &Actions = getActionDefinitionsBuilder(Op);
1090     // Explicitly list some common cases.
1091     // TODO: Does this help compile time at all?
1092     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1093                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1094                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1095                                       {S64, GlobalPtr, 64, GlobalAlign32},
1096                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1097                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1098                                       {S32, GlobalPtr, 8, GlobalAlign8},
1099                                       {S32, GlobalPtr, 16, GlobalAlign16},
1100 
1101                                       {S32, LocalPtr, 32, 32},
1102                                       {S64, LocalPtr, 64, 32},
1103                                       {V2S32, LocalPtr, 64, 32},
1104                                       {S32, LocalPtr, 8, 8},
1105                                       {S32, LocalPtr, 16, 16},
1106                                       {V2S16, LocalPtr, 32, 32},
1107 
1108                                       {S32, PrivatePtr, 32, 32},
1109                                       {S32, PrivatePtr, 8, 8},
1110                                       {S32, PrivatePtr, 16, 16},
1111                                       {V2S16, PrivatePtr, 32, 32},
1112 
1113                                       {S32, ConstantPtr, 32, GlobalAlign32},
1114                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1115                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1116                                       {S64, ConstantPtr, 64, GlobalAlign32},
1117                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1118     Actions.legalIf(
1119       [=](const LegalityQuery &Query) -> bool {
1120         return isLoadStoreLegal(ST, Query);
1121       });
1122 
1123     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1124     // 64-bits.
1125     //
1126     // TODO: Should generalize bitcast action into coerce, which will also cover
1127     // inserting addrspacecasts.
1128     Actions.customIf(typeIs(1, Constant32Ptr));
1129 
1130     // Turn any illegal element vectors into something easier to deal
1131     // with. These will ultimately produce 32-bit scalar shifts to extract the
1132     // parts anyway.
1133     //
1134     // For odd 16-bit element vectors, prefer to split those into pieces with
1135     // 16-bit vector parts.
1136     Actions.bitcastIf(
1137       [=](const LegalityQuery &Query) -> bool {
1138         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1139                                           Query.MMODescrs[0].SizeInBits);
1140       }, bitcastToRegisterType(0));
1141 
1142     if (!IsStore) {
1143       // Widen suitably aligned loads by loading extra bytes. The standard
1144       // legalization actions can't properly express widening memory operands.
1145       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1146         return shouldWidenLoad(ST, Query, G_LOAD);
1147       });
1148     }
1149 
1150     // FIXME: load/store narrowing should be moved to lower action
1151     Actions
1152         .narrowScalarIf(
1153             [=](const LegalityQuery &Query) -> bool {
1154               return !Query.Types[0].isVector() &&
1155                      needToSplitMemOp(Query, Op == G_LOAD);
1156             },
1157             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1158               const LLT DstTy = Query.Types[0];
1159               const LLT PtrTy = Query.Types[1];
1160 
1161               const unsigned DstSize = DstTy.getSizeInBits();
1162               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1163 
1164               // Split extloads.
1165               if (DstSize > MemSize)
1166                 return std::make_pair(0, LLT::scalar(MemSize));
1167 
1168               if (!isPowerOf2_32(DstSize)) {
1169                 // We're probably decomposing an odd sized store. Try to split
1170                 // to the widest type. TODO: Account for alignment. As-is it
1171                 // should be OK, since the new parts will be further legalized.
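                // e.g. an s96 store is first narrowed to s64; the remaining s32
                // piece is handled when the new parts are legalized.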
1172                 unsigned FloorSize = PowerOf2Floor(DstSize);
1173                 return std::make_pair(0, LLT::scalar(FloorSize));
1174               }
1175 
1176               if (DstSize > 32 && (DstSize % 32 != 0)) {
1177                 // FIXME: Need a way to specify non-extload of larger size if
1178                 // suitably aligned.
1179                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1180               }
1181 
1182               unsigned MaxSize = maxSizeForAddrSpace(ST,
1183                                                      PtrTy.getAddressSpace(),
1184                                                      Op == G_LOAD);
1185               if (MemSize > MaxSize)
1186                 return std::make_pair(0, LLT::scalar(MaxSize));
1187 
1188               unsigned Align = Query.MMODescrs[0].AlignInBits;
1189               return std::make_pair(0, LLT::scalar(Align));
1190             })
1191         .fewerElementsIf(
1192             [=](const LegalityQuery &Query) -> bool {
1193               return Query.Types[0].isVector() &&
1194                      needToSplitMemOp(Query, Op == G_LOAD);
1195             },
1196             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1197               const LLT DstTy = Query.Types[0];
1198               const LLT PtrTy = Query.Types[1];
1199 
1200               LLT EltTy = DstTy.getElementType();
1201               unsigned MaxSize = maxSizeForAddrSpace(ST,
1202                                                      PtrTy.getAddressSpace(),
1203                                                      Op == G_LOAD);
1204 
1205               // FIXME: Handle widened to power of 2 results better. This ends
1206               // up scalarizing.
1207               // FIXME: 3 element stores scalarized on SI
1208 
1209               // Split if it's too large for the address space.
1210               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1211                 unsigned NumElts = DstTy.getNumElements();
1212                 unsigned EltSize = EltTy.getSizeInBits();
1213 
1214                 if (MaxSize % EltSize == 0) {
1215                   return std::make_pair(
1216                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1217                 }
1218 
1219                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1220 
1221                 // FIXME: Refine when odd breakdowns handled
1222                 // The scalars will need to be re-legalized.
1223                 if (NumPieces == 1 || NumPieces >= NumElts ||
1224                     NumElts % NumPieces != 0)
1225                   return std::make_pair(0, EltTy);
1226 
1227                 return std::make_pair(0,
1228                                       LLT::vector(NumElts / NumPieces, EltTy));
1229               }
1230 
1231               // FIXME: We could probably handle weird extending loads better.
1232               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1233               if (DstTy.getSizeInBits() > MemSize)
1234                 return std::make_pair(0, EltTy);
1235 
1236               unsigned EltSize = EltTy.getSizeInBits();
1237               unsigned DstSize = DstTy.getSizeInBits();
1238               if (!isPowerOf2_32(DstSize)) {
1239                 // We're probably decomposing an odd sized store. Try to split
1240                 // to the widest type. TODO: Account for alignment. As-is it
1241                 // should be OK, since the new parts will be further legalized.
1242                 unsigned FloorSize = PowerOf2Floor(DstSize);
1243                 return std::make_pair(
1244                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1245               }
1246 
1247               // Need to split because of alignment.
1248               unsigned Align = Query.MMODescrs[0].AlignInBits;
1249               if (EltSize > Align &&
1250                   (EltSize / Align < DstTy.getNumElements())) {
1251                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1252               }
1253 
1254               // May need relegalization for the scalars.
1255               return std::make_pair(0, EltTy);
1256             })
1257     .lowerIfMemSizeNotPow2()
1258     .minScalar(0, S32)
1259     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1260     .widenScalarToNextPow2(0)
1261     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1262     .lower();
1263   }
1264 
1265   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1266                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1267                                                   {S32, GlobalPtr, 16, 2 * 8},
1268                                                   {S32, LocalPtr, 8, 8},
1269                                                   {S32, LocalPtr, 16, 16},
1270                                                   {S32, PrivatePtr, 8, 8},
1271                                                   {S32, PrivatePtr, 16, 16},
1272                                                   {S32, ConstantPtr, 8, 8},
1273                                                   {S32, ConstantPtr, 16, 2 * 8}})
1274                        .legalIf(
1275                          [=](const LegalityQuery &Query) -> bool {
1276                            return isLoadStoreLegal(ST, Query);
1277                          });
1278 
1279   if (ST.hasFlatAddressSpace()) {
1280     ExtLoads.legalForTypesWithMemDesc(
1281         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1282   }
1283 
1284   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1285   // 64-bits.
1286   //
1287   // TODO: Should generalize bitcast action into coerce, which will also cover
1288   // inserting addrspacecasts.
1289   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1290 
1291   ExtLoads.clampScalar(0, S32, S32)
1292           .widenScalarToNextPow2(0)
1293           .unsupportedIfMemSizeNotPow2()
1294           .lower();
1295 
1296   auto &Atomics = getActionDefinitionsBuilder(
1297     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1298      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1299      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1300      G_ATOMICRMW_UMIN})
1301     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1302                {S64, GlobalPtr}, {S64, LocalPtr},
1303                {S32, RegionPtr}, {S64, RegionPtr}});
1304   if (ST.hasFlatAddressSpace()) {
1305     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1306   }
1307 
1308   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1309   if (ST.hasLDSFPAtomics()) {
1310     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1311     if (ST.hasGFX90AInsts())
1312       Atomic.legalFor({{S64, LocalPtr}});
1313   }
1314   if (ST.hasAtomicFaddInsts())
1315     Atomic.legalFor({{S32, GlobalPtr}});
1316 
1317   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1318   // demarshalling
1319   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1320     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1321                 {S32, FlatPtr}, {S64, FlatPtr}})
1322     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1323                {S32, RegionPtr}, {S64, RegionPtr}});
1324   // TODO: Pointer types, any 32-bit or 64-bit vector
1325 
1326   // Condition should be s32 for scalar, s1 for vector.
1327   getActionDefinitionsBuilder(G_SELECT)
1328     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1329           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1330           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1331     .clampScalar(0, S16, S64)
1332     .scalarize(1)
1333     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1334     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1335     .clampMaxNumElements(0, S32, 2)
1336     .clampMaxNumElements(0, LocalPtr, 2)
1337     .clampMaxNumElements(0, PrivatePtr, 2)
1338     .scalarize(0)
1339     .widenScalarToNextPow2(0)
1340     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1341 
1342   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1343   // be more flexible with the shift amount type.
1344   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1345     .legalFor({{S32, S32}, {S64, S32}});
1346   if (ST.has16BitInsts()) {
1347     if (ST.hasVOP3PInsts()) {
1348       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1349             .clampMaxNumElements(0, S16, 2);
1350     } else
1351       Shifts.legalFor({{S16, S16}});
1352 
1353     // TODO: Support 16-bit shift amounts for all types
1354     Shifts.widenScalarIf(
1355       [=](const LegalityQuery &Query) {
1356         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1357         // 32-bit amount.
1358         const LLT ValTy = Query.Types[0];
1359         const LLT AmountTy = Query.Types[1];
1360         return ValTy.getSizeInBits() <= 16 &&
1361                AmountTy.getSizeInBits() < 16;
1362       }, changeTo(1, S16));
1363     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1364     Shifts.clampScalar(1, S32, S32);
1365     Shifts.clampScalar(0, S16, S64);
1366     Shifts.widenScalarToNextPow2(0, 16);
1367 
1368     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1369       .minScalar(0, S16)
1370       .scalarize(0)
1371       .lower();
1372   } else {
1373     // Make sure we legalize the shift amount type first, as the general
1374     // expansion for the shifted type will produce much worse code if it hasn't
1375     // been truncated already.
1376     Shifts.clampScalar(1, S32, S32);
1377     Shifts.clampScalar(0, S32, S64);
1378     Shifts.widenScalarToNextPow2(0, 32);
1379 
1380     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381       .minScalar(0, S32)
1382       .scalarize(0)
1383       .lower();
1384   }
1385   Shifts.scalarize(0);
1386 
1387   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1388     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1389     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1390     unsigned IdxTypeIdx = 2;
1391 
1392     getActionDefinitionsBuilder(Op)
1393       .customIf([=](const LegalityQuery &Query) {
1394           const LLT EltTy = Query.Types[EltTypeIdx];
1395           const LLT VecTy = Query.Types[VecTypeIdx];
1396           const LLT IdxTy = Query.Types[IdxTypeIdx];
1397           const unsigned EltSize = EltTy.getSizeInBits();
1398           return (EltSize == 32 || EltSize == 64) &&
1399                   VecTy.getSizeInBits() % 32 == 0 &&
1400                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1401                   IdxTy.getSizeInBits() == 32;
1402         })
1403       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1404                  bitcastToVectorElement32(VecTypeIdx))
1405       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1406       .bitcastIf(
1407         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1408         [=](const LegalityQuery &Query) {
1409           // For > 64-bit element types, try to turn this into a 64-bit
1410           // element vector since we may be able to do better indexing
1411           // if this is scalar. If not, fall back to 32.
1412           const LLT EltTy = Query.Types[EltTypeIdx];
1413           const LLT VecTy = Query.Types[VecTypeIdx];
1414           const unsigned DstEltSize = EltTy.getSizeInBits();
1415           const unsigned VecSize = VecTy.getSizeInBits();
1416 
1417           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1418           return std::make_pair(
1419             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1420         })
1421       .clampScalar(EltTypeIdx, S32, S64)
1422       .clampScalar(VecTypeIdx, S32, S64)
1423       .clampScalar(IdxTypeIdx, S32, S32)
1424       .clampMaxNumElements(VecTypeIdx, S32, 32)
1425       // TODO: Clamp elements for 64-bit vectors?
1426       // It should only be necessary with variable indexes.
1427       // As a last resort, lower to the stack
1428       .lower();
1429   }
1430 
1431   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1432     .unsupportedIf([=](const LegalityQuery &Query) {
1433         const LLT &EltTy = Query.Types[1].getElementType();
1434         return Query.Types[0] != EltTy;
1435       });
1436 
1437   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1438     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1439     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1440 
1441     // FIXME: Doesn't handle extract of illegal sizes.
1442     getActionDefinitionsBuilder(Op)
1443       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1444       // FIXME: Multiples of 16 should not be legal.
1445       .legalIf([=](const LegalityQuery &Query) {
1446           const LLT BigTy = Query.Types[BigTyIdx];
1447           const LLT LitTy = Query.Types[LitTyIdx];
1448           return (BigTy.getSizeInBits() % 32 == 0) &&
1449                  (LitTy.getSizeInBits() % 16 == 0);
1450         })
1451       .widenScalarIf(
1452         [=](const LegalityQuery &Query) {
1453           const LLT BigTy = Query.Types[BigTyIdx];
1454           return (BigTy.getScalarSizeInBits() < 16);
1455         },
1456         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1457       .widenScalarIf(
1458         [=](const LegalityQuery &Query) {
1459           const LLT LitTy = Query.Types[LitTyIdx];
1460           return (LitTy.getScalarSizeInBits() < 16);
1461         },
1462         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1463       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
1467 
1468   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1469     .legalForCartesianProduct(AllS32Vectors, {S32})
1470     .legalForCartesianProduct(AllS64Vectors, {S64})
1471     .clampNumElements(0, V16S32, V32S32)
1472     .clampNumElements(0, V2S64, V16S64)
1473     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1474 
1475   if (ST.hasScalarPackInsts()) {
1476     BuildVector
1477       // FIXME: Should probably widen s1 vectors straight to s32
1478       .minScalarOrElt(0, S16)
1479       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1480       .minScalar(1, S32);
1481 
1482     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1483       .legalFor({V2S16, S32})
1484       .lower();
1485     BuildVector.minScalarOrElt(0, S32);
1486   } else {
1487     BuildVector.customFor({V2S16, S16});
1488     BuildVector.minScalarOrElt(0, S32);
1489 
1490     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1491       .customFor({V2S16, S32})
1492       .lower();
1493   }
1494 
1495   BuildVector.legalIf(isRegisterType(0));
1496 
1497   // FIXME: Clamp maximum size
1498   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1499     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1500     .clampMaxNumElements(0, S32, 32)
1501     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1502     .clampMaxNumElements(0, S16, 64);
1503 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1505   // pre-legalize.
1506   if (ST.hasVOP3PInsts()) {
1507     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1508       .customFor({V2S16, V2S16})
1509       .lower();
1510   } else
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1512 
1513   // Merge/Unmerge
1514   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1515     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1516     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1517 
1518     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1519       const LLT Ty = Query.Types[TypeIdx];
1520       if (Ty.isVector()) {
1521         const LLT &EltTy = Ty.getElementType();
1522         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1523           return true;
1524         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1525           return true;
1526       }
1527       return false;
1528     };
1529 
1530     auto &Builder = getActionDefinitionsBuilder(Op)
1531       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1532       .lowerFor({{S16, V2S16}})
1533       .lowerIf([=](const LegalityQuery &Query) {
1534           const LLT BigTy = Query.Types[BigTyIdx];
1535           return BigTy.getSizeInBits() == 32;
1536         })
1537       // Try to widen to s16 first for small types.
1538       // TODO: Only do this on targets with legal s16 shifts
1539       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1540       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1541       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1542       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1543                            elementTypeIs(1, S16)),
1544                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1546       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1547       // valid.
1548       .clampScalar(LitTyIdx, S32, S512)
1549       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1550       // Break up vectors with weird elements into scalars
1551       .fewerElementsIf(
1552         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1553         scalarize(0))
1554       .fewerElementsIf(
1555         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1556         scalarize(1))
1557       .clampScalar(BigTyIdx, S32, MaxScalar);
1558 
1559     if (Op == G_MERGE_VALUES) {
1560       Builder.widenScalarIf(
1561         // TODO: Use 16-bit shifts if legal for 8-bit values?
1562         [=](const LegalityQuery &Query) {
1563           const LLT Ty = Query.Types[LitTyIdx];
1564           return Ty.getSizeInBits() < 32;
1565         },
1566         changeTo(LitTyIdx, S32));
1567     }
1568 
1569     Builder.widenScalarIf(
1570       [=](const LegalityQuery &Query) {
1571         const LLT Ty = Query.Types[BigTyIdx];
1572         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1573           Ty.getSizeInBits() % 16 != 0;
1574       },
1575       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
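        // E.g. 72 bits widens to the next power of 2 (128), while 168 bits
        // widens to 192 (the next multiple of 64) rather than to 256.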
1578         const LLT &Ty = Query.Types[BigTyIdx];
1579         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1580         if (NewSizeInBits >= 256) {
1581           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1582           if (RoundedTo < NewSizeInBits)
1583             NewSizeInBits = RoundedTo;
1584         }
1585         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1586       })
1587       // Any vectors left are the wrong size. Scalarize them.
1588       .scalarize(0)
1589       .scalarize(1);
1590   }
1591 
1592   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1593   // RegBankSelect.
1594   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1595     .legalFor({{S32}, {S64}});
1596 
1597   if (ST.hasVOP3PInsts()) {
1598     SextInReg.lowerFor({{V2S16}})
1599       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1600       // get more vector shift opportunities, since we'll get those when
1601       // expanded.
1602       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1603   } else if (ST.has16BitInsts()) {
1604     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1605   } else {
1606     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1608     SextInReg.lowerFor({{S32}, {S64}});
1609   }
1610 
1611   SextInReg
1612     .scalarize(0)
1613     .clampScalar(0, S32, S64)
1614     .lower();
1615 
  // TODO: Only try to form v2s16 with legal packed instructions.
1617   getActionDefinitionsBuilder(G_FSHR)
1618     .legalFor({{S32, S32}})
1619     .lowerFor({{V2S16, V2S16}})
1620     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1621     .scalarize(0)
1622     .lower();
1623 
1624   if (ST.hasVOP3PInsts()) {
1625     getActionDefinitionsBuilder(G_FSHL)
1626       .lowerFor({{V2S16, V2S16}})
1627       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1628       .scalarize(0)
1629       .lower();
1630   } else {
1631     getActionDefinitionsBuilder(G_FSHL)
1632       .scalarize(0)
1633       .lower();
1634   }
1635 
1636   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1637     .legalFor({S64});
1638 
1639   getActionDefinitionsBuilder(G_FENCE)
1640     .alwaysLegal();
1641 
1642   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1643       .scalarize(0)
1644       .minScalar(0, S32)
1645       .lower();
1646 
1647   getActionDefinitionsBuilder({
1648       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1649       G_FCOPYSIGN,
1650 
1651       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1652       G_ATOMICRMW_NAND,
1653       G_ATOMICRMW_FSUB,
1654       G_READ_REGISTER,
1655       G_WRITE_REGISTER,
1656 
1657       G_SADDO, G_SSUBO,
1658 
1659        // TODO: Implement
1660       G_FMINIMUM, G_FMAXIMUM}).lower();
1661 
1662   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1663         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1664         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1665     .unsupported();
1666 
1667   getLegacyLegalizerInfo().computeTables();
1668   verify(*ST.getInstrInfo());
1669 }
1670 
1671 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1672                                          MachineInstr &MI) const {
1673   MachineIRBuilder &B = Helper.MIRBuilder;
1674   MachineRegisterInfo &MRI = *B.getMRI();
1675 
1676   switch (MI.getOpcode()) {
1677   case TargetOpcode::G_ADDRSPACE_CAST:
1678     return legalizeAddrSpaceCast(MI, MRI, B);
1679   case TargetOpcode::G_FRINT:
1680     return legalizeFrint(MI, MRI, B);
1681   case TargetOpcode::G_FCEIL:
1682     return legalizeFceil(MI, MRI, B);
1683   case TargetOpcode::G_FREM:
1684     return legalizeFrem(MI, MRI, B);
1685   case TargetOpcode::G_INTRINSIC_TRUNC:
1686     return legalizeIntrinsicTrunc(MI, MRI, B);
1687   case TargetOpcode::G_SITOFP:
1688     return legalizeITOFP(MI, MRI, B, true);
1689   case TargetOpcode::G_UITOFP:
1690     return legalizeITOFP(MI, MRI, B, false);
1691   case TargetOpcode::G_FPTOSI:
1692     return legalizeFPTOI(MI, MRI, B, true);
1693   case TargetOpcode::G_FPTOUI:
1694     return legalizeFPTOI(MI, MRI, B, false);
1695   case TargetOpcode::G_FMINNUM:
1696   case TargetOpcode::G_FMAXNUM:
1697   case TargetOpcode::G_FMINNUM_IEEE:
1698   case TargetOpcode::G_FMAXNUM_IEEE:
1699     return legalizeMinNumMaxNum(Helper, MI);
1700   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1701     return legalizeExtractVectorElt(MI, MRI, B);
1702   case TargetOpcode::G_INSERT_VECTOR_ELT:
1703     return legalizeInsertVectorElt(MI, MRI, B);
1704   case TargetOpcode::G_SHUFFLE_VECTOR:
1705     return legalizeShuffleVector(MI, MRI, B);
1706   case TargetOpcode::G_FSIN:
1707   case TargetOpcode::G_FCOS:
1708     return legalizeSinCos(MI, MRI, B);
1709   case TargetOpcode::G_GLOBAL_VALUE:
1710     return legalizeGlobalValue(MI, MRI, B);
1711   case TargetOpcode::G_LOAD:
1712   case TargetOpcode::G_SEXTLOAD:
1713   case TargetOpcode::G_ZEXTLOAD:
1714     return legalizeLoad(Helper, MI);
1715   case TargetOpcode::G_FMAD:
1716     return legalizeFMad(MI, MRI, B);
1717   case TargetOpcode::G_FDIV:
1718     return legalizeFDIV(MI, MRI, B);
1719   case TargetOpcode::G_UDIV:
1720   case TargetOpcode::G_UREM:
1721   case TargetOpcode::G_UDIVREM:
1722     return legalizeUnsignedDIV_REM(MI, MRI, B);
1723   case TargetOpcode::G_SDIV:
1724   case TargetOpcode::G_SREM:
1725   case TargetOpcode::G_SDIVREM:
1726     return legalizeSignedDIV_REM(MI, MRI, B);
1727   case TargetOpcode::G_ATOMIC_CMPXCHG:
1728     return legalizeAtomicCmpXChg(MI, MRI, B);
1729   case TargetOpcode::G_FLOG:
1730     return legalizeFlog(MI, B, numbers::ln2f);
1731   case TargetOpcode::G_FLOG10:
1732     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1733   case TargetOpcode::G_FEXP:
1734     return legalizeFExp(MI, B);
1735   case TargetOpcode::G_FPOW:
1736     return legalizeFPow(MI, B);
1737   case TargetOpcode::G_FFLOOR:
1738     return legalizeFFloor(MI, MRI, B);
1739   case TargetOpcode::G_BUILD_VECTOR:
1740     return legalizeBuildVector(MI, MRI, B);
1741   default:
1742     return false;
1743   }
1744 
1745   llvm_unreachable("expected switch to return");
1746 }
1747 
1748 Register AMDGPULegalizerInfo::getSegmentAperture(
1749   unsigned AS,
1750   MachineRegisterInfo &MRI,
1751   MachineIRBuilder &B) const {
1752   MachineFunction &MF = B.getMF();
1753   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1754   const LLT S32 = LLT::scalar(32);
1755 
1756   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1757 
1758   if (ST.hasApertureRegs()) {
1759     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1760     // getreg.
1761     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1762         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1763         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1764     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1765         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1766         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1767     unsigned Encoding =
1768         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1769         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1770         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1771 
1772     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1773 
1774     B.buildInstr(AMDGPU::S_GETREG_B32)
1775       .addDef(GetReg)
1776       .addImm(Encoding);
1777     MRI.setType(GetReg, S32);
1778 
1779     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1780     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1781   }
1782 
1783   Register QueuePtr = MRI.createGenericVirtualRegister(
1784     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1785 
1786   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1787     return Register();
1788 
1789   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1790   // private_segment_aperture_base_hi.
1791   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1792 
1793   // TODO: can we be smarter about machine pointer info?
1794   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1795   MachineMemOperand *MMO = MF.getMachineMemOperand(
1796       PtrInfo,
1797       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1798           MachineMemOperand::MOInvariant,
1799       4, commonAlignment(Align(64), StructOffset));
1800 
1801   Register LoadAddr;
1802 
1803   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1804   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1805 }
1806 
1807 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1808   MachineInstr &MI, MachineRegisterInfo &MRI,
1809   MachineIRBuilder &B) const {
1810   MachineFunction &MF = B.getMF();
1811 
1812   const LLT S32 = LLT::scalar(32);
1813   Register Dst = MI.getOperand(0).getReg();
1814   Register Src = MI.getOperand(1).getReg();
1815 
1816   LLT DstTy = MRI.getType(Dst);
1817   LLT SrcTy = MRI.getType(Src);
1818   unsigned DestAS = DstTy.getAddressSpace();
1819   unsigned SrcAS = SrcTy.getAddressSpace();
1820 
1821   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1822   // vector element.
1823   assert(!DstTy.isVector());
1824 
1825   const AMDGPUTargetMachine &TM
1826     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1827 
1828   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1829     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1830     return true;
1831   }
1832 
1833   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1834     // Truncate.
1835     B.buildExtract(Dst, Src, 0);
1836     MI.eraseFromParent();
1837     return true;
1838   }
1839 
1840   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1841     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1842     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1843 
1844     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1845     // another. Merge operands are required to be the same type, but creating an
1846     // extra ptrtoint would be kind of pointless.
1847     auto HighAddr = B.buildConstant(
1848       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1849     B.buildMerge(Dst, {Src, HighAddr});
1850     MI.eraseFromParent();
1851     return true;
1852   }
1853 
1854   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1855     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1856            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1857     unsigned NullVal = TM.getNullPointerValue(DestAS);
1858 
1859     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1860     auto FlatNull = B.buildConstant(SrcTy, 0);
1861 
1862     // Extract low 32-bits of the pointer.
1863     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1864 
1865     auto CmpRes =
1866         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1867     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1868 
1869     MI.eraseFromParent();
1870     return true;
1871   }
1872 
1873   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1874     return false;
1875 
1876   if (!ST.hasFlatAddressSpace())
1877     return false;
1878 
1879   auto SegmentNull =
1880       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1881   auto FlatNull =
1882       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1883 
1884   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1885   if (!ApertureReg.isValid())
1886     return false;
1887 
1888   auto CmpRes =
1889       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1890 
1891   // Coerce the type of the low half of the result so we can use merge_values.
1892   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1893 
1894   // TODO: Should we allow mismatched types but matching sizes in merges to
1895   // avoid the ptrtoint?
1896   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1897   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1898 
1899   MI.eraseFromParent();
1900   return true;
1901 }
1902 
1903 bool AMDGPULegalizerInfo::legalizeFrint(
1904   MachineInstr &MI, MachineRegisterInfo &MRI,
1905   MachineIRBuilder &B) const {
1906   Register Src = MI.getOperand(1).getReg();
1907   LLT Ty = MRI.getType(Src);
1908   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1909 
1910   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1911   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1912 
1913   auto C1 = B.buildFConstant(Ty, C1Val);
1914   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1915 
1916   // TODO: Should this propagate fast-math-flags?
1917   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1918   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1919 
1920   auto C2 = B.buildFConstant(Ty, C2Val);
1921   auto Fabs = B.buildFAbs(Ty, Src);
1922 
1923   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1924   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1925   MI.eraseFromParent();
1926   return true;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeFceil(
1930   MachineInstr &MI, MachineRegisterInfo &MRI,
1931   MachineIRBuilder &B) const {
1932 
1933   const LLT S1 = LLT::scalar(1);
1934   const LLT S64 = LLT::scalar(64);
1935 
1936   Register Src = MI.getOperand(1).getReg();
1937   assert(MRI.getType(Src) == S64);
1938 
1939   // result = trunc(src)
1940   // if (src > 0.0 && src != result)
1941   //   result += 1.0
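  //
  // E.g. src = 2.5 gives trunc = 2.0 and the condition holds, so the result is
  // 3.0; src = -2.5 gives trunc = -2.0 and the condition fails, so the result
  // stays -2.0.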
1942 
1943   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1944 
1945   const auto Zero = B.buildFConstant(S64, 0.0);
1946   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1950   auto Add = B.buildSelect(S64, And, One, Zero);
1951 
1952   // TODO: Should this propagate fast-math-flags?
1953   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeFrem(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960     Register DstReg = MI.getOperand(0).getReg();
1961     Register Src0Reg = MI.getOperand(1).getReg();
1962     Register Src1Reg = MI.getOperand(2).getReg();
1963     auto Flags = MI.getFlags();
1964     LLT Ty = MRI.getType(DstReg);
1965 
1966     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1967     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1968     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1969     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1970     MI.eraseFromParent();
1971     return true;
1972 }
1973 
1974 static MachineInstrBuilder extractF64Exponent(Register Hi,
1975                                               MachineIRBuilder &B) {
1976   const unsigned FractBits = 52;
1977   const unsigned ExpBits = 11;
1978   LLT S32 = LLT::scalar(32);
1979 
1980   auto Const0 = B.buildConstant(S32, FractBits - 32);
1981   auto Const1 = B.buildConstant(S32, ExpBits);
1982 
1983   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1984     .addUse(Hi)
1985     .addUse(Const0.getReg(0))
1986     .addUse(Const1.getReg(0));
1987 
1988   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B) const {
1994   const LLT S1 = LLT::scalar(1);
1995   const LLT S32 = LLT::scalar(32);
1996   const LLT S64 = LLT::scalar(64);
1997 
1998   Register Src = MI.getOperand(1).getReg();
1999   assert(MRI.getType(Src) == S64);
2000 
2001   // TODO: Should this use extract since the low half is unused?
2002   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003   Register Hi = Unmerge.getReg(1);
2004 
2005   // Extract the upper half, since this is where we will find the sign and
2006   // exponent.
2007   auto Exp = extractF64Exponent(Hi, B);
2008 
2009   const unsigned FractBits = 52;
2010 
2011   // Extract the sign bit.
2012   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2013   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2014 
2015   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2016 
2017   const auto Zero32 = B.buildConstant(S32, 0);
2018 
2019   // Extend back to 64-bits.
2020   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2021 
2022   auto Shr = B.buildAShr(S64, FractMask, Exp);
2023   auto Not = B.buildNot(S64, Shr);
2024   auto Tmp0 = B.buildAnd(S64, Src, Not);
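  // Tmp0 is Src with the low (52 - Exp) fraction bits cleared, which truncates
  // the magnitude toward zero; the selects below handle Exp < 0 (the result is
  // a signed zero) and Exp > 51 (the value is already integral).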
2025   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2026 
2027   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2028   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2029 
2030   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2031   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2032   MI.eraseFromParent();
2033   return true;
2034 }
2035 
2036 bool AMDGPULegalizerInfo::legalizeITOFP(
2037   MachineInstr &MI, MachineRegisterInfo &MRI,
2038   MachineIRBuilder &B, bool Signed) const {
2039 
2040   Register Dst = MI.getOperand(0).getReg();
2041   Register Src = MI.getOperand(1).getReg();
2042 
2043   const LLT S64 = LLT::scalar(64);
2044   const LLT S32 = LLT::scalar(32);
2045 
2046   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2047 
2048   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2049 
2050   auto CvtHi = Signed ?
2051     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2052     B.buildUITOFP(S64, Unmerge.getReg(1));
2053 
2054   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2055 
2056   auto ThirtyTwo = B.buildConstant(S32, 32);
2057   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2058     .addUse(CvtHi.getReg(0))
2059     .addUse(ThirtyTwo.getReg(0));
2060 
2061   // TODO: Should this propagate fast-math-flags?
2062   B.buildFAdd(Dst, LdExp, CvtLo);
2063   MI.eraseFromParent();
2064   return true;
2065 }
2066 
2067 // TODO: Copied from DAG implementation. Verify logic and document how this
2068 // actually works.
2069 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2070                                         MachineRegisterInfo &MRI,
2071                                         MachineIRBuilder &B,
2072                                         bool Signed) const {
2073 
2074   Register Dst = MI.getOperand(0).getReg();
2075   Register Src = MI.getOperand(1).getReg();
2076 
2077   const LLT S64 = LLT::scalar(64);
2078   const LLT S32 = LLT::scalar(32);
2079 
2080   const LLT SrcLT = MRI.getType(Src);
2081   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2082 
2083   unsigned Flags = MI.getFlags();
2084 
2085   // The basic idea of converting a floating point number into a pair of 32-bit
2086   // integers is illustrated as follows:
2087   //
2088   //     tf := trunc(val);
2089   //    hif := floor(tf * 2^-32);
2090   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2091   //     hi := fptoi(hif);
2092   //     lo := fptoi(lof);
2093   //
2094   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2095   MachineInstrBuilder Sign;
2096   if (Signed && SrcLT == S32) {
2097     // However, a 32-bit floating point number has only 23 bits mantissa and
2098     // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
2100     // value after truncating and flip the result back based on the original
2101     // signedness.
2102     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2103     Trunc = B.buildFAbs(S32, Trunc, Flags);
2104   }
2105   MachineInstrBuilder K0, K1;
2106   if (SrcLT == S64) {
2107     K0 = B.buildFConstant(S64,
2108                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2109     K1 = B.buildFConstant(S64,
2110                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2111   } else {
2112     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2113     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2114   }
2115 
2116   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2117   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2118   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2119 
2120   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2121                                      : B.buildFPTOUI(S32, FloorMul);
2122   auto Lo = B.buildFPTOUI(S32, Fma);
2123 
2124   if (Signed && SrcLT == S32) {
2125     // Flip the result based on the signedness, which is either all 0s or 1s.
2126     Sign = B.buildMerge(S64, {Sign, Sign});
2127     // r := xor({lo, hi}, sign) - sign;
2128     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2129   } else
2130     B.buildMerge(Dst, {Lo, Hi});
2131   MI.eraseFromParent();
2132 
2133   return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2137                                                MachineInstr &MI) const {
2138   MachineFunction &MF = Helper.MIRBuilder.getMF();
2139   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2140 
2141   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2142                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2143 
2144   // With ieee_mode disabled, the instructions have the correct behavior
2145   // already for G_FMINNUM/G_FMAXNUM
2146   if (!MFI->getMode().IEEE)
2147     return !IsIEEEOp;
2148 
2149   if (IsIEEEOp)
2150     return true;
2151 
2152   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2153 }
2154 
2155 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2156   MachineInstr &MI, MachineRegisterInfo &MRI,
2157   MachineIRBuilder &B) const {
2158   // TODO: Should move some of this into LegalizerHelper.
2159 
2160   // TODO: Promote dynamic indexing of s16 to s32
2161 
2162   // FIXME: Artifact combiner probably should have replaced the truncated
2163   // constant before this, so we shouldn't need
2164   // getConstantVRegValWithLookThrough.
2165   Optional<ValueAndVReg> MaybeIdxVal =
2166       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2167   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2168     return true;
2169   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2170 
2171   Register Dst = MI.getOperand(0).getReg();
2172   Register Vec = MI.getOperand(1).getReg();
2173 
2174   LLT VecTy = MRI.getType(Vec);
2175   LLT EltTy = VecTy.getElementType();
2176   assert(EltTy == MRI.getType(Dst));
2177 
2178   if (IdxVal < VecTy.getNumElements())
2179     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2180   else
2181     B.buildUndef(Dst);
2182 
2183   MI.eraseFromParent();
2184   return true;
2185 }
2186 
2187 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2188   MachineInstr &MI, MachineRegisterInfo &MRI,
2189   MachineIRBuilder &B) const {
2190   // TODO: Should move some of this into LegalizerHelper.
2191 
2192   // TODO: Promote dynamic indexing of s16 to s32
2193 
2194   // FIXME: Artifact combiner probably should have replaced the truncated
2195   // constant before this, so we shouldn't need
2196   // getConstantVRegValWithLookThrough.
2197   Optional<ValueAndVReg> MaybeIdxVal =
2198       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2199   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2200     return true;
2201 
2202   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2203   Register Dst = MI.getOperand(0).getReg();
2204   Register Vec = MI.getOperand(1).getReg();
2205   Register Ins = MI.getOperand(2).getReg();
2206 
2207   LLT VecTy = MRI.getType(Vec);
2208   LLT EltTy = VecTy.getElementType();
2209   assert(EltTy == MRI.getType(Ins));
2210 
2211   if (IdxVal < VecTy.getNumElements())
2212     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2213   else
2214     B.buildUndef(Dst);
2215 
2216   MI.eraseFromParent();
2217   return true;
2218 }
2219 
2220 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2221   MachineInstr &MI, MachineRegisterInfo &MRI,
2222   MachineIRBuilder &B) const {
2223   const LLT V2S16 = LLT::vector(2, 16);
2224 
2225   Register Dst = MI.getOperand(0).getReg();
2226   Register Src0 = MI.getOperand(1).getReg();
2227   LLT DstTy = MRI.getType(Dst);
2228   LLT SrcTy = MRI.getType(Src0);
2229 
2230   if (SrcTy == V2S16 && DstTy == V2S16 &&
2231       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2232     return true;
2233 
2234   MachineIRBuilder HelperBuilder(MI);
2235   GISelObserverWrapper DummyObserver;
2236   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2237   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeSinCos(
2241   MachineInstr &MI, MachineRegisterInfo &MRI,
2242   MachineIRBuilder &B) const {
2243 
2244   Register DstReg = MI.getOperand(0).getReg();
2245   Register SrcReg = MI.getOperand(1).getReg();
2246   LLT Ty = MRI.getType(DstReg);
2247   unsigned Flags = MI.getFlags();
2248 
2249   Register TrigVal;
2250   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2251   if (ST.hasTrigReducedRange()) {
2252     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2253     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2254       .addUse(MulVal.getReg(0))
2255       .setMIFlags(Flags).getReg(0);
2256   } else
2257     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2258 
2259   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2260     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2261   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2262     .addUse(TrigVal)
2263     .setMIFlags(Flags);
2264   MI.eraseFromParent();
2265   return true;
2266 }
2267 
2268 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2269                                                   MachineIRBuilder &B,
2270                                                   const GlobalValue *GV,
2271                                                   int64_t Offset,
2272                                                   unsigned GAFlags) const {
2273   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2274   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2275   // to the following code sequence:
2276   //
2277   // For constant address space:
2278   //   s_getpc_b64 s[0:1]
2279   //   s_add_u32 s0, s0, $symbol
2280   //   s_addc_u32 s1, s1, 0
2281   //
2282   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2283   //   a fixup or relocation is emitted to replace $symbol with a literal
2284   //   constant, which is a pc-relative offset from the encoding of the $symbol
2285   //   operand to the global variable.
2286   //
2287   // For global address space:
2288   //   s_getpc_b64 s[0:1]
2289   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2290   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2291   //
2292   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2293   //   fixups or relocations are emitted to replace $symbol@*@lo and
2294   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2295   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2296   //   operand to the global variable.
2297   //
2298   // What we want here is an offset from the value returned by s_getpc
2299   // (which is the address of the s_add_u32 instruction) to the global
2300   // variable, but since the encoding of $symbol starts 4 bytes after the start
2301   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2302   // small. This requires us to add 4 to the global variable offset in order to
2303   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2304   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2305   // instruction.
2306 
2307   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2308 
2309   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2310     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2311 
2312   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2313     .addDef(PCReg);
2314 
2315   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2316   if (GAFlags == SIInstrInfo::MO_NONE)
2317     MIB.addImm(0);
2318   else
2319     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2320 
2321   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2322 
2323   if (PtrTy.getSizeInBits() == 32)
2324     B.buildExtract(DstReg, PCReg, 0);
2325   return true;
2326  }
2327 
2328 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2329   MachineInstr &MI, MachineRegisterInfo &MRI,
2330   MachineIRBuilder &B) const {
2331   Register DstReg = MI.getOperand(0).getReg();
2332   LLT Ty = MRI.getType(DstReg);
2333   unsigned AS = Ty.getAddressSpace();
2334 
2335   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2336   MachineFunction &MF = B.getMF();
2337   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2338 
2339   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2340     if (!MFI->isModuleEntryFunction() &&
2341         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2342       const Function &Fn = MF.getFunction();
2343       DiagnosticInfoUnsupported BadLDSDecl(
2344         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2345         DS_Warning);
2346       Fn.getContext().diagnose(BadLDSDecl);
2347 
2348       // We currently don't have a way to correctly allocate LDS objects that
2349       // aren't directly associated with a kernel. We do force inlining of
2350       // functions that use local objects. However, if these dead functions are
2351       // not eliminated, we don't want a compile time error. Just emit a warning
2352       // and a trap, since there should be no callable path here.
2353       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2354       B.buildUndef(DstReg);
2355       MI.eraseFromParent();
2356       return true;
2357     }
2358 
2359     // TODO: We could emit code to handle the initialization somewhere.
2360     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2361       const SITargetLowering *TLI = ST.getTargetLowering();
2362       if (!TLI->shouldUseLDSConstAddress(GV)) {
2363         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2364         return true; // Leave in place;
2365       }
2366 
2367       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2368         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
        // zero-sized type in other languages) to declare dynamic shared
        // memory whose size is not known at compile time. Such objects are
        // allocated by the runtime and placed directly after the statically
        // allocated ones, so they all share the same offset.
2374         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2375           // Adjust alignment for that dynamic shared memory array.
2376           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2377           LLT S32 = LLT::scalar(32);
2378           auto Sz =
2379               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2380           B.buildIntToPtr(DstReg, Sz);
2381           MI.eraseFromParent();
2382           return true;
2383         }
2384       }
2385 
2386       B.buildConstant(
2387           DstReg,
2388           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2389       MI.eraseFromParent();
2390       return true;
2391     }
2392 
2393     const Function &Fn = MF.getFunction();
2394     DiagnosticInfoUnsupported BadInit(
2395       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2396     Fn.getContext().diagnose(BadInit);
2397     return true;
2398   }
2399 
2400   const SITargetLowering *TLI = ST.getTargetLowering();
2401 
2402   if (TLI->shouldEmitFixup(GV)) {
2403     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2404     MI.eraseFromParent();
2405     return true;
2406   }
2407 
2408   if (TLI->shouldEmitPCReloc(GV)) {
2409     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2410     MI.eraseFromParent();
2411     return true;
2412   }
2413 
2414   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2415   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2416 
2417   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2418       MachinePointerInfo::getGOT(MF),
2419       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2420           MachineMemOperand::MOInvariant,
2421       8 /*Size*/, Align(8));
2422 
2423   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2424 
2425   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2427     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2428     B.buildExtract(DstReg, Load, 0);
2429   } else
2430     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2431 
2432   MI.eraseFromParent();
2433   return true;
2434 }
2435 
2436 static LLT widenToNextPowerOf2(LLT Ty) {
2437   if (Ty.isVector())
2438     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2439   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2443                                        MachineInstr &MI) const {
2444   MachineIRBuilder &B = Helper.MIRBuilder;
2445   MachineRegisterInfo &MRI = *B.getMRI();
2446   GISelChangeObserver &Observer = Helper.Observer;
2447 
2448   Register PtrReg = MI.getOperand(1).getReg();
2449   LLT PtrTy = MRI.getType(PtrReg);
2450   unsigned AddrSpace = PtrTy.getAddressSpace();
2451 
2452   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2453     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2454     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2455     Observer.changingInstr(MI);
2456     MI.getOperand(1).setReg(Cast.getReg(0));
2457     Observer.changedInstr(MI);
2458     return true;
2459   }
2460 
2461   if (MI.getOpcode() != AMDGPU::G_LOAD)
2462     return false;
2463 
2464   Register ValReg = MI.getOperand(0).getReg();
2465   LLT ValTy = MRI.getType(ValReg);
2466 
2467   MachineMemOperand *MMO = *MI.memoperands_begin();
2468   const unsigned ValSize = ValTy.getSizeInBits();
2469   const unsigned MemSize = 8 * MMO->getSize();
2470   const Align MemAlign = MMO->getAlign();
2471   const unsigned AlignInBits = 8 * MemAlign.value();
2472 
2473   // Widen non-power-of-2 loads to the alignment if needed
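  // (e.g. a 96-bit <3 x s32> load may be widened to a 128-bit load when the
  // alignment allows it)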
2474   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2475     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2476 
2477     // This was already the correct extending load result type, so just adjust
2478     // the memory type.
2479     if (WideMemSize == ValSize) {
2480       MachineFunction &MF = B.getMF();
2481 
2482       MachineMemOperand *WideMMO =
2483           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2484       Observer.changingInstr(MI);
2485       MI.setMemRefs(MF, {WideMMO});
2486       Observer.changedInstr(MI);
2487       return true;
2488     }
2489 
2490     // Don't bother handling edge case that should probably never be produced.
2491     if (ValSize > WideMemSize)
2492       return false;
2493 
2494     LLT WideTy = widenToNextPowerOf2(ValTy);
2495 
2496     Register WideLoad;
2497     if (!WideTy.isVector()) {
2498       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2499       B.buildTrunc(ValReg, WideLoad).getReg(0);
2500     } else {
2501       // Extract the subvector.
2502 
2503       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2505         // (e.g. <3 x s32> -> <4 x s32>)
2506         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2507         B.buildExtract(ValReg, WideLoad, 0);
2508       } else {
2509         // For cases where the widened type isn't a nice register value, unmerge
2510         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2511         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2512         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2513         B.setInsertPt(B.getMBB(), MI.getIterator());
2514         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2515       }
2516     }
2517 
2518     MI.eraseFromParent();
2519     return true;
2520   }
2521 
2522   return false;
2523 }
2524 
2525 bool AMDGPULegalizerInfo::legalizeFMad(
2526   MachineInstr &MI, MachineRegisterInfo &MRI,
2527   MachineIRBuilder &B) const {
2528   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2529   assert(Ty.isScalar());
2530 
2531   MachineFunction &MF = B.getMF();
2532   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2533 
2534   // TODO: Always legal with future ftz flag.
2535   // FIXME: Do we need just output?
2536   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2537     return true;
2538   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2539     return true;
2540 
2541   MachineIRBuilder HelperBuilder(MI);
2542   GISelObserverWrapper DummyObserver;
2543   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2544   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2545 }
2546 
2547 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2548   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2549   Register DstReg = MI.getOperand(0).getReg();
2550   Register PtrReg = MI.getOperand(1).getReg();
2551   Register CmpVal = MI.getOperand(2).getReg();
2552   Register NewVal = MI.getOperand(3).getReg();
2553 
2554   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2555          "this should not have been custom lowered");
2556 
2557   LLT ValTy = MRI.getType(CmpVal);
2558   LLT VecTy = LLT::vector(2, ValTy);
2559 
2560   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2561 
2562   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2563     .addDef(DstReg)
2564     .addUse(PtrReg)
2565     .addUse(PackedVal)
2566     .setMemRefs(MI.memoperands());
2567 
2568   MI.eraseFromParent();
2569   return true;
2570 }
2571 
2572 bool AMDGPULegalizerInfo::legalizeFlog(
2573   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2574   Register Dst = MI.getOperand(0).getReg();
2575   Register Src = MI.getOperand(1).getReg();
2576   LLT Ty = B.getMRI()->getType(Dst);
2577   unsigned Flags = MI.getFlags();
2578 
2579   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2580   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2581 
2582   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2583   MI.eraseFromParent();
2584   return true;
2585 }
2586 
2587 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2588                                        MachineIRBuilder &B) const {
2589   Register Dst = MI.getOperand(0).getReg();
2590   Register Src = MI.getOperand(1).getReg();
2591   unsigned Flags = MI.getFlags();
2592   LLT Ty = B.getMRI()->getType(Dst);
2593 
2594   auto K = B.buildFConstant(Ty, numbers::log2e);
2595   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2596   B.buildFExp2(Dst, Mul, Flags);
2597   MI.eraseFromParent();
2598   return true;
2599 }
2600 
2601 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2602                                        MachineIRBuilder &B) const {
2603   Register Dst = MI.getOperand(0).getReg();
2604   Register Src0 = MI.getOperand(1).getReg();
2605   Register Src1 = MI.getOperand(2).getReg();
2606   unsigned Flags = MI.getFlags();
2607   LLT Ty = B.getMRI()->getType(Dst);
2608   const LLT S16 = LLT::scalar(16);
2609   const LLT S32 = LLT::scalar(32);
2610 
2611   if (Ty == S32) {
2612     auto Log = B.buildFLog2(S32, Src0, Flags);
2613     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2614       .addUse(Log.getReg(0))
2615       .addUse(Src1)
2616       .setMIFlags(Flags);
2617     B.buildFExp2(Dst, Mul, Flags);
2618   } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so widen to f32 for the multiply and
    // truncate the result back.
2620     auto Log = B.buildFLog2(S16, Src0, Flags);
2621     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2622     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2623     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2624       .addUse(Ext0.getReg(0))
2625       .addUse(Ext1.getReg(0))
2626       .setMIFlags(Flags);
2627 
2628     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2629   } else
2630     return false;
2631 
2632   MI.eraseFromParent();
2633   return true;
2634 }
2635 
2636 // Find a source register, ignoring any possible source modifiers.
2637 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2638   Register ModSrc = OrigSrc;
2639   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2640     ModSrc = SrcFNeg->getOperand(1).getReg();
2641     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2642       ModSrc = SrcFAbs->getOperand(1).getReg();
2643   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2644     ModSrc = SrcFAbs->getOperand(1).getReg();
2645   return ModSrc;
2646 }
2647 
2648 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2649                                          MachineRegisterInfo &MRI,
2650                                          MachineIRBuilder &B) const {
2651 
2652   const LLT S1 = LLT::scalar(1);
2653   const LLT S64 = LLT::scalar(64);
2654   Register Dst = MI.getOperand(0).getReg();
2655   Register OrigSrc = MI.getOperand(1).getReg();
2656   unsigned Flags = MI.getFlags();
2657   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2658          "this should not have been custom lowered");
2659 
2660   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2661   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2662   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2663   // V_FRACT bug is:
2664   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2665   //
2666   // Convert floor(x) to (x - fract(x))
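  //
  // E.g. x = -1.25: fract(x) = 0.75, so x - fract(x) = -2.0 == floor(-1.25).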
2667 
2668   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2669     .addUse(OrigSrc)
2670     .setMIFlags(Flags);
2671 
2672   // Give source modifier matching some assistance before obscuring a foldable
2673   // pattern.
2674 
2675   // TODO: We can avoid the neg on the fract? The input sign to fract
2676   // shouldn't matter?
2677   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2678 
2679   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2680 
2681   Register Min = MRI.createGenericVirtualRegister(S64);
2682 
2683   // We don't need to concern ourselves with the snan handling difference, so
2684   // use the one which will directly select.
2685   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2686   if (MFI->getMode().IEEE)
2687     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2688   else
2689     B.buildFMinNum(Min, Fract, Const, Flags);
2690 
2691   Register CorrectedFract = Min;
2692   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2694     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2695   }
2696 
2697   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2698   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2699 
2700   MI.eraseFromParent();
2701   return true;
2702 }
2703 
2704 // Turn an illegal packed v2s16 build vector into bit operations.
2705 // TODO: This should probably be a bitcast action in LegalizerHelper.
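// Roughly, v2s16 = G_BUILD_VECTOR s16:a, s16:b becomes
//   s32 m = G_MERGE_VALUES a, b
//   v2s16 = G_BITCAST m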
2706 bool AMDGPULegalizerInfo::legalizeBuildVector(
2707   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2708   Register Dst = MI.getOperand(0).getReg();
2709   const LLT S32 = LLT::scalar(32);
2710   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2711 
2712   Register Src0 = MI.getOperand(1).getReg();
2713   Register Src1 = MI.getOperand(2).getReg();
2714   assert(MRI.getType(Src0) == LLT::scalar(16));
2715 
2716   auto Merge = B.buildMerge(S32, {Src0, Src1});
2717   B.buildBitcast(Dst, Merge);
2718 
2719   MI.eraseFromParent();
2720   return true;
2721 }
2722 
2723 // Check that this is a G_XOR x, -1
2724 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2725   if (MI.getOpcode() != TargetOpcode::G_XOR)
2726     return false;
2727   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2728   return ConstVal && *ConstVal == -1;
2729 }
2730 
2731 // Return the use branch instruction, otherwise null if the usage is invalid.
2732 static MachineInstr *
2733 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2734                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2735   Register CondDef = MI.getOperand(0).getReg();
2736   if (!MRI.hasOneNonDBGUse(CondDef))
2737     return nullptr;
2738 
2739   MachineBasicBlock *Parent = MI.getParent();
2740   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2741 
2742   if (isNot(MRI, *UseMI)) {
2743     Register NegatedCond = UseMI->getOperand(0).getReg();
2744     if (!MRI.hasOneNonDBGUse(NegatedCond))
2745       return nullptr;
2746 
2747     // We're deleting the def of this value, so we need to remove it.
2748     UseMI->eraseFromParent();
2749 
2750     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2751     Negated = true;
2752   }
2753 
2754   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2755     return nullptr;
2756 
2757   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2758   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2759   if (Next == Parent->end()) {
2760     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2761     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2762       return nullptr;
2763     UncondBrTarget = &*NextMBB;
2764   } else {
2765     if (Next->getOpcode() != AMDGPU::G_BR)
2766       return nullptr;
2767     Br = &*Next;
2768     UncondBrTarget = Br->getOperand(0).getMBB();
2769   }
2770 
2771   return UseMI;
2772 }
2773 
2774 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2775                                          const ArgDescriptor *Arg,
2776                                          const TargetRegisterClass *ArgRC,
2777                                          LLT ArgTy) const {
2778   MCRegister SrcReg = Arg->getRegister();
2779   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2780   assert(DstReg.isVirtual() && "Virtual register expected");
2781 
2782   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2783                                              ArgTy);
2784   if (Arg->isMasked()) {
2785     // TODO: Should we try to emit this once in the entry block?
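    // A masked argument occupies a bit field of its register (e.g. the packed
    // workitem IDs), so extract it with a shift and a mask.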
2786     const LLT S32 = LLT::scalar(32);
2787     const unsigned Mask = Arg->getMask();
2788     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2789 
2790     Register AndMaskSrc = LiveIn;
2791 
2792     if (Shift != 0) {
2793       auto ShiftAmt = B.buildConstant(S32, Shift);
2794       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2795     }
2796 
2797     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2798   } else {
2799     B.buildCopy(DstReg, LiveIn);
2800   }
2801 
2802   return true;
2803 }
2804 
2805 bool AMDGPULegalizerInfo::loadInputValue(
2806     Register DstReg, MachineIRBuilder &B,
2807     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2808   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2809   const ArgDescriptor *Arg;
2810   const TargetRegisterClass *ArgRC;
2811   LLT ArgTy;
2812   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2813 
2814   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2815     return false; // TODO: Handle these
2816   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2817 }
2818 
2819 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2820     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2821     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2822   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2823     return false;
2824 
2825   MI.eraseFromParent();
2826   return true;
2827 }
2828 
2829 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2830                                        MachineRegisterInfo &MRI,
2831                                        MachineIRBuilder &B) const {
2832   Register Dst = MI.getOperand(0).getReg();
2833   LLT DstTy = MRI.getType(Dst);
2834   LLT S16 = LLT::scalar(16);
2835   LLT S32 = LLT::scalar(32);
2836   LLT S64 = LLT::scalar(64);
2837 
2838   if (DstTy == S16)
2839     return legalizeFDIV16(MI, MRI, B);
2840   if (DstTy == S32)
2841     return legalizeFDIV32(MI, MRI, B);
2842   if (DstTy == S64)
2843     return legalizeFDIV64(MI, MRI, B);
2844 
2845   return false;
2846 }
2847 
2848 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2849                                                         Register DstDivReg,
2850                                                         Register DstRemReg,
2851                                                         Register X,
2852                                                         Register Y) const {
2853   const LLT S1 = LLT::scalar(1);
2854   const LLT S32 = LLT::scalar(32);
2855 
2856   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2857   // algorithm used here.
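  // In short: form a 32-bit fixed-point estimate of 2**32 / y from
  // V_RCP_IFLAG_F32, refine it with one Newton-Raphson step, derive an initial
  // quotient/remainder estimate, then apply at most two correction steps.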
2858 
2859   // Initial estimate of inv(y).
2860   auto FloatY = B.buildUITOFP(S32, Y);
2861   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2862   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2863   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2864   auto Z = B.buildFPTOUI(S32, ScaledY);
2865 
2866   // One round of UNR.
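  // Z approximates 2**32 / Y; the update Z += umulh(Z, -Y * Z) is a single
  // unsigned Newton-Raphson refinement of that estimate.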
2867   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2868   auto NegYZ = B.buildMul(S32, NegY, Z);
2869   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2870 
2871   // Quotient/remainder estimate.
2872   auto Q = B.buildUMulH(S32, X, Z);
2873   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2874 
2875   // First quotient/remainder refinement.
2876   auto One = B.buildConstant(S32, 1);
2877   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2878   if (DstDivReg)
2879     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2880   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2881 
2882   // Second quotient/remainder refinement.
2883   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2884   if (DstDivReg)
2885     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2886 
2887   if (DstRemReg)
2888     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2889 }
2890 
2891 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2892 //
2893 // Return lo, hi of result
2894 //
2895 // %cvt.lo = G_UITOFP Val.lo
2896 // %cvt.hi = G_UITOFP Val.hi
2897 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2898 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2899 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2900 // %mul2 = G_FMUL %mul1, 2**(-32)
2901 // %trunc = G_INTRINSIC_TRUNC %mul2
2902 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2903 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2904 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2905                                                        Register Val) {
2906   const LLT S32 = LLT::scalar(32);
2907   auto Unmerge = B.buildUnmerge(S32, Val);
2908 
2909   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2910   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2911 
2912   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2913                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2914 
2915   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
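  // 0x5f7ffffc is just below 2**64; scaling the ~1/Val estimate by it yields a
  // 64-bit fixed-point reciprocal, which is split into hi/lo 32-bit words below.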
2916   auto Mul1 =
2917       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2918 
2919   // 2**(-32)
2920   auto Mul2 =
2921       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2922   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2923 
2924   // -(2**32)
2925   auto Mad2 = B.buildFMAD(S32, Trunc,
2926                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2927 
2928   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2929   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2930 
2931   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2932 }
2933 
2934 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2935                                                         Register DstDivReg,
2936                                                         Register DstRemReg,
2937                                                         Register Numer,
2938                                                         Register Denom) const {
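  // Same structure as the 32-bit expansion: build a 64-bit fixed-point
  // reciprocal of the denominator, refine it with two rounds, form a quotient
  // estimate with a 64-bit high multiply, then correct the estimate with up to
  // two conditional add/subtract steps selected at the end.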
2939   const LLT S32 = LLT::scalar(32);
2940   const LLT S64 = LLT::scalar(64);
2941   const LLT S1 = LLT::scalar(1);
2942   Register RcpLo, RcpHi;
2943 
2944   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2945 
2946   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2947 
2948   auto Zero64 = B.buildConstant(S64, 0);
2949   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2950 
2951   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2952   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2953 
2954   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2955   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2956   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2957 
2958   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2959   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2960   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2961   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2962 
2963   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2964   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2965   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2966   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2967   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2968 
2969   auto Zero32 = B.buildConstant(S32, 0);
2970   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2971   auto Add2_HiC =
2972       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2973   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2974   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2975 
2976   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2977   Register NumerLo = UnmergeNumer.getReg(0);
2978   Register NumerHi = UnmergeNumer.getReg(1);
2979 
2980   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2981   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2982   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2983   Register Mul3_Lo = UnmergeMul3.getReg(0);
2984   Register Mul3_Hi = UnmergeMul3.getReg(1);
2985   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2986   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2987   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2988   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2989 
2990   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2991   Register DenomLo = UnmergeDenom.getReg(0);
2992   Register DenomHi = UnmergeDenom.getReg(1);
2993 
2994   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2995   auto C1 = B.buildSExt(S32, CmpHi);
2996 
2997   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2998   auto C2 = B.buildSExt(S32, CmpLo);
2999 
3000   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3001   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3002 
3003   // TODO: Here and below, portions of the code could be enclosed in if/endif
3004   // blocks. Currently control flow is unconditional and we use 4 selects after
3005   // the potential endif to substitute for PHIs.
3006 
3007   // if C3 != 0 ...
3008   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3009   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3010   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3011   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3012 
3013   auto One64 = B.buildConstant(S64, 1);
3014   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3015 
3016   auto C4 =
3017       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3018   auto C5 =
3019       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3020   auto C6 = B.buildSelect(
3021       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3022 
3023   // if (C6 != 0)
3024   auto Add4 = B.buildAdd(S64, Add3, One64);
3025   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3026 
3027   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3028   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3029   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3030 
3031   // endif C6
3032   // endif C3
3033 
3034   if (DstDivReg) {
3035     auto Sel1 = B.buildSelect(
3036         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3037     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3038                   Sel1, MulHi3);
3039   }
3040 
3041   if (DstRemReg) {
3042     auto Sel2 = B.buildSelect(
3043         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3044     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3045                   Sel2, Sub1);
3046   }
3047 }
3048 
3049 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3050                                                   MachineRegisterInfo &MRI,
3051                                                   MachineIRBuilder &B) const {
3052   Register DstDivReg, DstRemReg;
3053   switch (MI.getOpcode()) {
3054   default:
3055     llvm_unreachable("Unexpected opcode!");
3056   case AMDGPU::G_UDIV: {
3057     DstDivReg = MI.getOperand(0).getReg();
3058     break;
3059   }
3060   case AMDGPU::G_UREM: {
3061     DstRemReg = MI.getOperand(0).getReg();
3062     break;
3063   }
3064   case AMDGPU::G_UDIVREM: {
3065     DstDivReg = MI.getOperand(0).getReg();
3066     DstRemReg = MI.getOperand(1).getReg();
3067     break;
3068   }
3069   }
3070 
3071   const LLT S64 = LLT::scalar(64);
3072   const LLT S32 = LLT::scalar(32);
3073   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3074   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3075   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3076   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3077 
3078   if (Ty == S32)
3079     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3080   else if (Ty == S64)
3081     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3082   else
3083     return false;
3084 
3085   MI.eraseFromParent();
3086   return true;
3087 }
3088 
3089 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3090                                                 MachineRegisterInfo &MRI,
3091                                                 MachineIRBuilder &B) const {
3092   const LLT S64 = LLT::scalar(64);
3093   const LLT S32 = LLT::scalar(32);
3094 
3095   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3096   if (Ty != S32 && Ty != S64)
3097     return false;
3098 
3099   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3100   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3101   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3102 
3103   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3104   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3105   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3106 
3107   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3108   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3109 
3110   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3111   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
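  // (x + sign) ^ sign computes |x| when sign is the arithmetic shift of x by
  // bitwidth - 1, so LHS and RHS now hold the operand magnitudes.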
3112 
3113   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3114   switch (MI.getOpcode()) {
3115   default:
3116     llvm_unreachable("Unexpected opcode!");
3117   case AMDGPU::G_SDIV: {
3118     DstDivReg = MI.getOperand(0).getReg();
3119     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3120     break;
3121   }
3122   case AMDGPU::G_SREM: {
3123     DstRemReg = MI.getOperand(0).getReg();
3124     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3125     break;
3126   }
3127   case AMDGPU::G_SDIVREM: {
3128     DstDivReg = MI.getOperand(0).getReg();
3129     DstRemReg = MI.getOperand(1).getReg();
3130     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3131     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3132     break;
3133   }
3134   }
3135 
3136   if (Ty == S32)
3137     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3138   else
3139     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3140 
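  // Restore the signs: the quotient is negative iff the operand signs differ,
  // and the remainder takes the sign of the dividend; (v ^ s) - s negates v
  // when s is all ones and is a no-op when s is zero.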
3141   if (DstDivReg) {
3142     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3143     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3144     B.buildSub(DstDivReg, SignXor, Sign);
3145   }
3146 
3147   if (DstRemReg) {
3148     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3149     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3150     B.buildSub(DstRemReg, SignXor, Sign);
3151   }
3152 
3153   MI.eraseFromParent();
3154   return true;
3155 }
3156 
3157 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3158                                                  MachineRegisterInfo &MRI,
3159                                                  MachineIRBuilder &B) const {
3160   Register Res = MI.getOperand(0).getReg();
3161   Register LHS = MI.getOperand(1).getReg();
3162   Register RHS = MI.getOperand(2).getReg();
3163   uint16_t Flags = MI.getFlags();
3164   LLT ResTy = MRI.getType(Res);
3165 
3166   const MachineFunction &MF = B.getMF();
3167   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3168                             MI.getFlag(MachineInstr::FmAfn);
3169 
3170   if (!AllowInaccurateRcp)
3171     return false;
3172 
3173   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3174     // 1 / x -> RCP(x)
3175     if (CLHS->isExactlyValue(1.0)) {
3176       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3177         .addUse(RHS)
3178         .setMIFlags(Flags);
3179 
3180       MI.eraseFromParent();
3181       return true;
3182     }
3183 
3184     // -1 / x -> RCP( FNEG(x) )
3185     if (CLHS->isExactlyValue(-1.0)) {
3186       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3187       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3188         .addUse(FNeg.getReg(0))
3189         .setMIFlags(Flags);
3190 
3191       MI.eraseFromParent();
3192       return true;
3193     }
3194   }
3195 
3196   // x / y -> x * (1.0 / y)
3197   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3198     .addUse(RHS)
3199     .setMIFlags(Flags);
3200   B.buildFMul(Res, LHS, RCP, Flags);
3201 
3202   MI.eraseFromParent();
3203   return true;
3204 }
3205 
3206 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3207                                                    MachineRegisterInfo &MRI,
3208                                                    MachineIRBuilder &B) const {
3209   Register Res = MI.getOperand(0).getReg();
3210   Register X = MI.getOperand(1).getReg();
3211   Register Y = MI.getOperand(2).getReg();
3212   uint16_t Flags = MI.getFlags();
3213   LLT ResTy = MRI.getType(Res);
3214 
3215   const MachineFunction &MF = B.getMF();
3216   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3217                             MI.getFlag(MachineInstr::FmAfn);
3218 
3219   if (!AllowInaccurateRcp)
3220     return false;
3221 
3222   auto NegY = B.buildFNeg(ResTy, Y);
3223   auto One = B.buildFConstant(ResTy, 1.0);
3224 
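  // r = rcp(y), refined by two Newton-Raphson steps r += r * (1 - y * r); the
  // quotient is x * r with one final fma-based correction.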
3225   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3226     .addUse(Y)
3227     .setMIFlags(Flags);
3228 
3229   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3230   R = B.buildFMA(ResTy, Tmp0, R, R);
3231 
3232   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3233   R = B.buildFMA(ResTy, Tmp1, R, R);
3234 
3235   auto Ret = B.buildFMul(ResTy, X, R);
3236   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3237 
3238   B.buildFMA(Res, Tmp2, R, Ret);
3239   MI.eraseFromParent();
3240   return true;
3241 }
3242 
3243 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3244                                          MachineRegisterInfo &MRI,
3245                                          MachineIRBuilder &B) const {
3246   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3247     return true;
3248 
3249   Register Res = MI.getOperand(0).getReg();
3250   Register LHS = MI.getOperand(1).getReg();
3251   Register RHS = MI.getOperand(2).getReg();
3252 
3253   uint16_t Flags = MI.getFlags();
3254 
3255   LLT S16 = LLT::scalar(16);
3256   LLT S32 = LLT::scalar(32);
3257 
3258   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3259   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3260 
3261   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3262     .addUse(RHSExt.getReg(0))
3263     .setMIFlags(Flags);
3264 
3265   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3266   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3267 
3268   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3269     .addUse(RDst.getReg(0))
3270     .addUse(RHS)
3271     .addUse(LHS)
3272     .setMIFlags(Flags);
3273 
3274   MI.eraseFromParent();
3275   return true;
3276 }
3277 
3278 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3279 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3280 static void toggleSPDenormMode(bool Enable,
3281                                MachineIRBuilder &B,
3282                                const GCNSubtarget &ST,
3283                                AMDGPU::SIModeRegisterDefaults Mode) {
3284   // Set SP denorm mode to this value.
3285   unsigned SPDenormMode =
3286     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3287 
3288   if (ST.hasDenormModeInst()) {
3289     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3290     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3291 
3292     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3293     B.buildInstr(AMDGPU::S_DENORM_MODE)
3294       .addImm(NewDenormModeValue);
3295 
3296   } else {
3297     // Select FP32 bit field in mode register.
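    // This encodes hwreg(HW_REG_MODE, 4, 2); the FP32 denorm controls live in
    // MODE bits [5:4].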
3298     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3299                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3300                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3301 
3302     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3303       .addImm(SPDenormMode)
3304       .addImm(SPDenormModeBitField);
3305   }
3306 }
3307 
3308 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3309                                          MachineRegisterInfo &MRI,
3310                                          MachineIRBuilder &B) const {
3311   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3312     return true;
3313 
3314   Register Res = MI.getOperand(0).getReg();
3315   Register LHS = MI.getOperand(1).getReg();
3316   Register RHS = MI.getOperand(2).getReg();
3317   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3318   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3319 
3320   uint16_t Flags = MI.getFlags();
3321 
3322   LLT S32 = LLT::scalar(32);
3323   LLT S1 = LLT::scalar(1);
3324 
3325   auto One = B.buildFConstant(S32, 1.0f);
3326 
3327   auto DenominatorScaled =
3328     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3329       .addUse(LHS)
3330       .addUse(RHS)
3331       .addImm(0)
3332       .setMIFlags(Flags);
3333   auto NumeratorScaled =
3334     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3335       .addUse(LHS)
3336       .addUse(RHS)
3337       .addImm(1)
3338       .setMIFlags(Flags);
3339 
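  // Standard div_scale/div_fmas/div_fixup expansion: div_scale pre-scales the
  // operands to keep the intermediate Newton-Raphson values in range; div_fmas
  // and div_fixup then produce the correctly scaled final result and handle the
  // special cases.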
3340   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3341     .addUse(DenominatorScaled.getReg(0))
3342     .setMIFlags(Flags);
3343   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3344 
3345   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3346   // aren't modeled as reading it.
3347   if (!Mode.allFP32Denormals())
3348     toggleSPDenormMode(true, B, ST, Mode);
3349 
3350   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3351   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3352   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3353   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3354   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3355   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3356 
3357   if (!Mode.allFP32Denormals())
3358     toggleSPDenormMode(false, B, ST, Mode);
3359 
3360   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3361     .addUse(Fma4.getReg(0))
3362     .addUse(Fma1.getReg(0))
3363     .addUse(Fma3.getReg(0))
3364     .addUse(NumeratorScaled.getReg(1))
3365     .setMIFlags(Flags);
3366 
3367   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3368     .addUse(Fmas.getReg(0))
3369     .addUse(RHS)
3370     .addUse(LHS)
3371     .setMIFlags(Flags);
3372 
3373   MI.eraseFromParent();
3374   return true;
3375 }
3376 
3377 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3378                                          MachineRegisterInfo &MRI,
3379                                          MachineIRBuilder &B) const {
3380   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3381     return true;
3382 
3383   Register Res = MI.getOperand(0).getReg();
3384   Register LHS = MI.getOperand(1).getReg();
3385   Register RHS = MI.getOperand(2).getReg();
3386 
3387   uint16_t Flags = MI.getFlags();
3388 
3389   LLT S64 = LLT::scalar(64);
3390   LLT S1 = LLT::scalar(1);
3391 
3392   auto One = B.buildFConstant(S64, 1.0);
3393 
3394   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3395     .addUse(LHS)
3396     .addUse(RHS)
3397     .addImm(0)
3398     .setMIFlags(Flags);
3399 
3400   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3401 
3402   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3403     .addUse(DivScale0.getReg(0))
3404     .setMIFlags(Flags);
3405 
3406   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3407   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3408   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3409 
3410   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3411     .addUse(LHS)
3412     .addUse(RHS)
3413     .addImm(1)
3414     .setMIFlags(Flags);
3415 
3416   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3417   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3418   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3419 
3420   Register Scale;
3421   if (!ST.hasUsableDivScaleConditionOutput()) {
3422     // Workaround a hardware bug on SI where the condition output from div_scale
3423     // is not usable.
3424 
3425     LLT S32 = LLT::scalar(32);
3426 
3427     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3428     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3429     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3430     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3431 
3432     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3433                               Scale1Unmerge.getReg(1));
3434     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3435                               Scale0Unmerge.getReg(1));
3436     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3437   } else {
3438     Scale = DivScale1.getReg(1);
3439   }
3440 
3441   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3442     .addUse(Fma4.getReg(0))
3443     .addUse(Fma3.getReg(0))
3444     .addUse(Mul.getReg(0))
3445     .addUse(Scale)
3446     .setMIFlags(Flags);
3447 
3448   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3449     .addUse(Fmas.getReg(0))
3450     .addUse(RHS)
3451     .addUse(LHS)
3452     .setMIFlags(Flags);
3453 
3454   MI.eraseFromParent();
3455   return true;
3456 }
3457 
3458 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3459                                                  MachineRegisterInfo &MRI,
3460                                                  MachineIRBuilder &B) const {
3461   Register Res = MI.getOperand(0).getReg();
3462   Register LHS = MI.getOperand(2).getReg();
3463   Register RHS = MI.getOperand(3).getReg();
3464   uint16_t Flags = MI.getFlags();
3465 
3466   LLT S32 = LLT::scalar(32);
3467   LLT S1 = LLT::scalar(1);
3468 
3469   auto Abs = B.buildFAbs(S32, RHS, Flags);
3470   const APFloat C0Val(1.0f);
3471 
3472   auto C0 = B.buildConstant(S32, 0x6f800000);
3473   auto C1 = B.buildConstant(S32, 0x2f800000);
3474   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
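  // C0 is 2**96 and C1 is 2**-32: if |rhs| is very large, pre-scale it by
  // 2**-32 before taking the reciprocal, then multiply the quotient by the same
  // scale (Sel) to compensate.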
3475 
3476   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3477   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3478 
3479   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3480 
3481   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3482     .addUse(Mul0.getReg(0))
3483     .setMIFlags(Flags);
3484 
3485   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3486 
3487   B.buildFMul(Res, Sel, Mul1, Flags);
3488 
3489   MI.eraseFromParent();
3490   return true;
3491 }
3492 
3493 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3494 // FIXME: Why do we handle this one but not other removed instructions?
3495 //
3496 // Reciprocal square root.  The clamp prevents infinite results, clamping
3497 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3498 // +-max_float.
3499 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3500                                                     MachineRegisterInfo &MRI,
3501                                                     MachineIRBuilder &B) const {
3502   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3503     return true;
3504 
3505   Register Dst = MI.getOperand(0).getReg();
3506   Register Src = MI.getOperand(2).getReg();
3507   auto Flags = MI.getFlags();
3508 
3509   LLT Ty = MRI.getType(Dst);
3510 
3511   const fltSemantics *FltSemantics;
3512   if (Ty == LLT::scalar(32))
3513     FltSemantics = &APFloat::IEEEsingle();
3514   else if (Ty == LLT::scalar(64))
3515     FltSemantics = &APFloat::IEEEdouble();
3516   else
3517     return false;
3518 
3519   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3520     .addUse(Src)
3521     .setMIFlags(Flags);
3522 
3523   // We don't need to concern ourselves with the snan handling difference; the
3524   // rsq already quieted (or not) its input, so use the variant that selects directly.
3525   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3526   const bool UseIEEE = MFI->getMode().IEEE;
3527 
3528   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3529   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3530                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3531 
3532   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3533 
3534   if (UseIEEE)
3535     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3536   else
3537     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3538   MI.eraseFromParent();
3539   return true;
3540 }
3541 
3542 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3543   switch (IID) {
3544   case Intrinsic::amdgcn_ds_fadd:
3545     return AMDGPU::G_ATOMICRMW_FADD;
3546   case Intrinsic::amdgcn_ds_fmin:
3547     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3548   case Intrinsic::amdgcn_ds_fmax:
3549     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3550   default:
3551     llvm_unreachable("not a DS FP intrinsic");
3552   }
3553 }
3554 
3555 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3556                                                       MachineInstr &MI,
3557                                                       Intrinsic::ID IID) const {
3558   GISelChangeObserver &Observer = Helper.Observer;
3559   Observer.changingInstr(MI);
3560 
3561   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3562 
3563   // The remaining operands were used to set fields in the MemOperand on
3564   // construction.
3565   for (int I = 6; I > 3; --I)
3566     MI.RemoveOperand(I);
3567 
3568   MI.RemoveOperand(1); // Remove the intrinsic ID.
3569   Observer.changedInstr(MI);
3570   return true;
3571 }
3572 
3573 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3574                                             MachineRegisterInfo &MRI,
3575                                             MachineIRBuilder &B) const {
3576   uint64_t Offset =
3577     ST.getTargetLowering()->getImplicitParameterOffset(
3578       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3579   LLT DstTy = MRI.getType(DstReg);
3580   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3581 
3582   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3583   if (!loadInputValue(KernargPtrReg, B,
3584                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3585     return false;
3586 
3587   // FIXME: This should be nuw
3588   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3589   return true;
3590 }
3591 
3592 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3593                                                  MachineRegisterInfo &MRI,
3594                                                  MachineIRBuilder &B) const {
3595   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3596   if (!MFI->isEntryFunction()) {
3597     return legalizePreloadedArgIntrin(MI, MRI, B,
3598                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3599   }
3600 
3601   Register DstReg = MI.getOperand(0).getReg();
3602   if (!getImplicitArgPtr(DstReg, MRI, B))
3603     return false;
3604 
3605   MI.eraseFromParent();
3606   return true;
3607 }
3608 
3609 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3610                                               MachineRegisterInfo &MRI,
3611                                               MachineIRBuilder &B,
3612                                               unsigned AddrSpace) const {
3613   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3614   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3615   Register Hi32 = Unmerge.getReg(1);
3616 
3617   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3618   MI.eraseFromParent();
3619   return true;
3620 }
3621 
3622 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3623 // offset (the offset that is included in bounds checking and swizzling, to be
3624 // split between the instruction's voffset and immoffset fields) and soffset
3625 // (the offset that is excluded from bounds checking and swizzling, to go in
3626 // the instruction's soffset field).  This function takes the first kind of
3627 // offset and figures out how to split it between voffset and immoffset.
3628 std::tuple<Register, unsigned, unsigned>
3629 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3630                                         Register OrigOffset) const {
3631   const unsigned MaxImm = 4095;
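  // 4095 is the largest value representable in the buffer instruction's 12-bit
  // unsigned immediate offset field.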
3632   Register BaseReg;
3633   unsigned TotalConstOffset;
3634   const LLT S32 = LLT::scalar(32);
3635   MachineRegisterInfo &MRI = *B.getMRI();
3636 
3637   std::tie(BaseReg, TotalConstOffset) =
3638       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3639 
3640   unsigned ImmOffset = TotalConstOffset;
3641 
3642   // If BaseReg is a pointer, convert it to int.
3643   if (MRI.getType(BaseReg).isPointer())
3644     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3645 
3646   // If the immediate value is too big for the immoffset field, keep only its
3647   // low 12 bits in the immoffset field so that the value that is copied/added
3648   // for the voffset field is a multiple of 4096, and it stands a better chance
3649   // of being CSEd with the copy/add for another similar load/store.
3650   // However, do not do that rounding down to a multiple of 4096 if that is a
3651   // negative number, as it appears to be illegal to have a negative offset
3652   // in the vgpr, even if adding the immediate offset makes it positive.
3653   unsigned Overflow = ImmOffset & ~MaxImm;
3654   ImmOffset -= Overflow;
3655   if ((int32_t)Overflow < 0) {
3656     Overflow += ImmOffset;
3657     ImmOffset = 0;
3658   }
3659 
3660   if (Overflow != 0) {
3661     if (!BaseReg) {
3662       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3663     } else {
3664       auto OverflowVal = B.buildConstant(S32, Overflow);
3665       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3666     }
3667   }
3668 
3669   if (!BaseReg)
3670     BaseReg = B.buildConstant(S32, 0).getReg(0);
3671 
3672   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3673 }
3674 
3675 /// Handle register layout difference for f16 images for some subtargets.
3676 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3677                                              MachineRegisterInfo &MRI,
3678                                              Register Reg,
3679                                              bool ImageStore) const {
3680   const LLT S16 = LLT::scalar(16);
3681   const LLT S32 = LLT::scalar(32);
3682   LLT StoreVT = MRI.getType(Reg);
3683   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3684 
3685   if (ST.hasUnpackedD16VMem()) {
3686     auto Unmerge = B.buildUnmerge(S16, Reg);
3687 
3688     SmallVector<Register, 4> WideRegs;
3689     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3690       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3691 
3692     int NumElts = StoreVT.getNumElements();
3693 
3694     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3695   }
3696 
3697   if (ImageStore && ST.hasImageStoreD16Bug()) {
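    // Work around the bug by widening the packed d16 data so it occupies as
    // many dwords as there are components, padding the extra lanes with undef.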
3698     if (StoreVT.getNumElements() == 2) {
3699       SmallVector<Register, 4> PackedRegs;
3700       Reg = B.buildBitcast(S32, Reg).getReg(0);
3701       PackedRegs.push_back(Reg);
3702       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3703       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3704     }
3705 
3706     if (StoreVT.getNumElements() == 3) {
3707       SmallVector<Register, 4> PackedRegs;
3708       auto Unmerge = B.buildUnmerge(S16, Reg);
3709       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3710         PackedRegs.push_back(Unmerge.getReg(I));
3711       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3712       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3713       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3714     }
3715 
3716     if (StoreVT.getNumElements() == 4) {
3717       SmallVector<Register, 4> PackedRegs;
3718       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3719       auto Unmerge = B.buildUnmerge(S32, Reg);
3720       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3721         PackedRegs.push_back(Unmerge.getReg(I));
3722       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3723       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3724     }
3725 
3726     llvm_unreachable("invalid data type");
3727   }
3728 
3729   return Reg;
3730 }
3731 
3732 Register AMDGPULegalizerInfo::fixStoreSourceType(
3733   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3734   MachineRegisterInfo *MRI = B.getMRI();
3735   LLT Ty = MRI->getType(VData);
3736 
3737   const LLT S16 = LLT::scalar(16);
3738 
3739   // Fixup illegal register types for i8 stores.
3740   if (Ty == LLT::scalar(8) || Ty == S16) {
3741     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3742     return AnyExt;
3743   }
3744 
3745   if (Ty.isVector()) {
3746     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3747       if (IsFormat)
3748         return handleD16VData(B, *MRI, VData);
3749     }
3750   }
3751 
3752   return VData;
3753 }
3754 
3755 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3756                                               MachineRegisterInfo &MRI,
3757                                               MachineIRBuilder &B,
3758                                               bool IsTyped,
3759                                               bool IsFormat) const {
3760   Register VData = MI.getOperand(1).getReg();
3761   LLT Ty = MRI.getType(VData);
3762   LLT EltTy = Ty.getScalarType();
3763   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3764   const LLT S32 = LLT::scalar(32);
3765 
3766   VData = fixStoreSourceType(B, VData, IsFormat);
3767   Register RSrc = MI.getOperand(2).getReg();
3768 
3769   MachineMemOperand *MMO = *MI.memoperands_begin();
3770   const int MemSize = MMO->getSize();
3771 
3772   unsigned ImmOffset;
3773   unsigned TotalOffset;
3774 
3775   // The typed intrinsics add an immediate after the registers.
3776   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3777 
3778   // The struct intrinsic variants add one additional operand over raw.
3779   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3780   Register VIndex;
3781   int OpOffset = 0;
3782   if (HasVIndex) {
3783     VIndex = MI.getOperand(3).getReg();
3784     OpOffset = 1;
3785   }
3786 
3787   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3788   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3789 
3790   unsigned Format = 0;
3791   if (IsTyped) {
3792     Format = MI.getOperand(5 + OpOffset).getImm();
3793     ++OpOffset;
3794   }
3795 
3796   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3797 
3798   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3799   if (TotalOffset != 0)
3800     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3801 
3802   unsigned Opc;
3803   if (IsTyped) {
3804     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3805                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3806   } else if (IsFormat) {
3807     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3808                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3809   } else {
3810     switch (MemSize) {
3811     case 1:
3812       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3813       break;
3814     case 2:
3815       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3816       break;
3817     default:
3818       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3819       break;
3820     }
3821   }
3822 
3823   if (!VIndex)
3824     VIndex = B.buildConstant(S32, 0).getReg(0);
3825 
3826   auto MIB = B.buildInstr(Opc)
3827     .addUse(VData)              // vdata
3828     .addUse(RSrc)               // rsrc
3829     .addUse(VIndex)             // vindex
3830     .addUse(VOffset)            // voffset
3831     .addUse(SOffset)            // soffset
3832     .addImm(ImmOffset);         // offset(imm)
3833 
3834   if (IsTyped)
3835     MIB.addImm(Format);
3836 
3837   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3838      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3839      .addMemOperand(MMO);
3840 
3841   MI.eraseFromParent();
3842   return true;
3843 }
3844 
3845 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3846                                              MachineRegisterInfo &MRI,
3847                                              MachineIRBuilder &B,
3848                                              bool IsFormat,
3849                                              bool IsTyped) const {
3850   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3851   MachineMemOperand *MMO = *MI.memoperands_begin();
3852   const int MemSize = MMO->getSize();
3853   const LLT S32 = LLT::scalar(32);
3854 
3855   Register Dst = MI.getOperand(0).getReg();
3856   Register RSrc = MI.getOperand(2).getReg();
3857 
3858   // The typed intrinsics add an immediate after the registers.
3859   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3860 
3861   // The struct intrinsic variants add one additional operand over raw.
3862   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3863   Register VIndex;
3864   int OpOffset = 0;
3865   if (HasVIndex) {
3866     VIndex = MI.getOperand(3).getReg();
3867     OpOffset = 1;
3868   }
3869 
3870   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3871   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3872 
3873   unsigned Format = 0;
3874   if (IsTyped) {
3875     Format = MI.getOperand(5 + OpOffset).getImm();
3876     ++OpOffset;
3877   }
3878 
3879   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3880   unsigned ImmOffset;
3881   unsigned TotalOffset;
3882 
3883   LLT Ty = MRI.getType(Dst);
3884   LLT EltTy = Ty.getScalarType();
3885   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3886   const bool Unpacked = ST.hasUnpackedD16VMem();
3887 
3888   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3889   if (TotalOffset != 0)
3890     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3891 
3892   unsigned Opc;
3893 
3894   if (IsTyped) {
3895     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3896                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3897   } else if (IsFormat) {
3898     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3899                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3900   } else {
3901     switch (MemSize) {
3902     case 1:
3903       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3904       break;
3905     case 2:
3906       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3907       break;
3908     default:
3909       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3910       break;
3911     }
3912   }
3913 
3914   Register LoadDstReg;
3915 
3916   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
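  // Sub-dword results are loaded into a 32-bit register and truncated back
  // afterwards; unpacked d16 vector results are instead repacked below.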
3917   LLT UnpackedTy = Ty.changeElementSize(32);
3918 
3919   if (IsExtLoad)
3920     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3921   else if (Unpacked && IsD16 && Ty.isVector())
3922     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3923   else
3924     LoadDstReg = Dst;
3925 
3926   if (!VIndex)
3927     VIndex = B.buildConstant(S32, 0).getReg(0);
3928 
3929   auto MIB = B.buildInstr(Opc)
3930     .addDef(LoadDstReg)         // vdata
3931     .addUse(RSrc)               // rsrc
3932     .addUse(VIndex)             // vindex
3933     .addUse(VOffset)            // voffset
3934     .addUse(SOffset)            // soffset
3935     .addImm(ImmOffset);         // offset(imm)
3936 
3937   if (IsTyped)
3938     MIB.addImm(Format);
3939 
3940   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3941      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3942      .addMemOperand(MMO);
3943 
3944   if (LoadDstReg != Dst) {
3945     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3946 
3947     // The result of an extending load was widened; truncate it back down.
3948     if (IsExtLoad)
3949       B.buildTrunc(Dst, LoadDstReg);
3950     else {
3951       // Repack to original 16-bit vector result
3952       // FIXME: G_TRUNC should work, but legalization currently fails
3953       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3954       SmallVector<Register, 4> Repack;
3955       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3956         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3957       B.buildMerge(Dst, Repack);
3958     }
3959   }
3960 
3961   MI.eraseFromParent();
3962   return true;
3963 }
3964 
3965 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3966                                                MachineIRBuilder &B,
3967                                                bool IsInc) const {
3968   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3969                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3970   B.buildInstr(Opc)
3971     .addDef(MI.getOperand(0).getReg())
3972     .addUse(MI.getOperand(2).getReg())
3973     .addUse(MI.getOperand(3).getReg())
3974     .cloneMemRefs(MI);
3975   MI.eraseFromParent();
3976   return true;
3977 }
3978 
3979 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3980   switch (IntrID) {
3981   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3982   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3983     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3984   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3985   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3986     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3987   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3988   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3989     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3990   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3991   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3992     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3993   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3994   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3995     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3996   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3997   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3998     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3999   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4000   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4001     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4002   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4003   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4004     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4005   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4006   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4007     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4008   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4009   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4011   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4012   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4013     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4014   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4015   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4016     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4017   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4019     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4020   case Intrinsic::amdgcn_buffer_atomic_fadd:
4021   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4022   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4023     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4024   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4025   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4026     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4027   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4028   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4029     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4030   default:
4031     llvm_unreachable("unhandled atomic opcode");
4032   }
4033 }
4034 
4035 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4036                                                MachineIRBuilder &B,
4037                                                Intrinsic::ID IID) const {
4038   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4039                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4040   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4041 
4042   Register Dst;
4043 
4044   int OpOffset = 0;
4045   if (HasReturn) {
4046     // A few FP atomics do not support return values.
4047     Dst = MI.getOperand(0).getReg();
4048   } else {
4049     OpOffset = -1;
4050   }
4051 
4052   Register VData = MI.getOperand(2 + OpOffset).getReg();
4053   Register CmpVal;
4054 
4055   if (IsCmpSwap) {
4056     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4057     ++OpOffset;
4058   }
4059 
4060   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4061   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4062 
4063   // The struct intrinsic variants add one additional operand over raw.
4064   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4065   Register VIndex;
4066   if (HasVIndex) {
4067     VIndex = MI.getOperand(4 + OpOffset).getReg();
4068     ++OpOffset;
4069   }
4070 
4071   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4072   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4073   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4074 
4075   MachineMemOperand *MMO = *MI.memoperands_begin();
4076 
4077   unsigned ImmOffset;
4078   unsigned TotalOffset;
4079   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4080   if (TotalOffset != 0)
4081     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4082 
4083   if (!VIndex)
4084     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4085 
4086   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4087 
4088   if (HasReturn)
4089     MIB.addDef(Dst);
4090 
4091   MIB.addUse(VData); // vdata
4092 
4093   if (IsCmpSwap)
4094     MIB.addReg(CmpVal);
4095 
4096   MIB.addUse(RSrc)               // rsrc
4097      .addUse(VIndex)             // vindex
4098      .addUse(VOffset)            // voffset
4099      .addUse(SOffset)            // soffset
4100      .addImm(ImmOffset)          // offset(imm)
4101      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4102      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4103      .addMemOperand(MMO);
4104 
4105   MI.eraseFromParent();
4106   return true;
4107 }
4108 
4109 /// Pack the s16 typed address operands of \p MI into dword sized vectors with
4110 /// s16 typed elements, appending the results to \p PackedAddrs.
4111 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4112                                       SmallVectorImpl<Register> &PackedAddrs,
4113                                       unsigned ArgOffset,
4114                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4115                                       bool IsA16, bool IsG16) {
4116   const LLT S16 = LLT::scalar(16);
4117   const LLT V2S16 = LLT::vector(2, 16);
4118   auto EndIdx = Intr->VAddrEnd;
4119 
4120   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4121     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4122     if (!SrcOp.isReg())
4123       continue; // _L to _LZ may have eliminated this.
4124 
4125     Register AddrReg = SrcOp.getReg();
4126 
4127     if (I < Intr->GradientStart) {
4128       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4129       PackedAddrs.push_back(AddrReg);
4130     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4131                (I >= Intr->CoordStart && !IsA16)) {
4132       // Handle any gradient or coordinate operands that should not be packed
4133       PackedAddrs.push_back(AddrReg);
4134     } else {
4135       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4136       // derivatives dx/dh and dx/dv are packed with undef.
4137       if (((I + 1) >= EndIdx) ||
4138           ((Intr->NumGradients / 2) % 2 == 1 &&
4139            (I == static_cast<unsigned>(Intr->GradientStart +
4140                                        (Intr->NumGradients / 2) - 1) ||
4141             I == static_cast<unsigned>(Intr->GradientStart +
4142                                        Intr->NumGradients - 1))) ||
4143           // Check for _L to _LZ optimization
4144           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4145         PackedAddrs.push_back(
4146             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4147                 .getReg(0));
4148       } else {
4149         PackedAddrs.push_back(
4150             B.buildBuildVector(
4151                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4152                 .getReg(0));
4153         ++I;
4154       }
4155     }
4156   }
4157 }
4158 
4159 /// Convert from separate vaddr components to a single vector address register,
4160 /// and replace the remaining operands with $noreg.
4161 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4162                                      int DimIdx, int NumVAddrs) {
4163   const LLT S32 = LLT::scalar(32);
4164 
4165   SmallVector<Register, 8> AddrRegs;
4166   for (int I = 0; I != NumVAddrs; ++I) {
4167     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4168     if (SrcOp.isReg()) {
4169       AddrRegs.push_back(SrcOp.getReg());
4170       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4171     }
4172   }
4173 
4174   int NumAddrRegs = AddrRegs.size();
4175   if (NumAddrRegs != 1) {
4176     // Round up to 8 elements for v5-v7
4177     // FIXME: Missing intermediate sized register classes and instructions.
4178     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4179       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4180       auto Undef = B.buildUndef(S32);
4181       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4182       NumAddrRegs = RoundedNumRegs;
4183     }
4184 
4185     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4186     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4187   }
4188 
4189   for (int I = 1; I != NumVAddrs; ++I) {
4190     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4191     if (SrcOp.isReg())
4192       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4193   }
4194 }
4195 
4196 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4197 ///
4198 /// Depending on the subtarget, load/store with 16-bit element data need to be
4199 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4200 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4201 /// registers.
4202 ///
4203 /// We don't want to directly select image instructions just yet, but also want
4204 /// to expose all register repacking to the legalizer/combiners. We also don't
4205 /// want a selected instruction entering RegBankSelect. In order to avoid
4206 /// defining a multitude of intermediate image instructions, directly hack on
4207 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4208 /// now unnecessary arguments with $noreg.
4209 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4210     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4211     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4212 
4213   const unsigned NumDefs = MI.getNumExplicitDefs();
4214   const unsigned ArgOffset = NumDefs + 1;
4215   bool IsTFE = NumDefs == 2;
4216   // We are only processing the operands of d16 image operations on subtargets
4217   // that use the unpacked register layout, or need to repack the TFE result.
4218 
4219   // TODO: Do we need to guard against already legalized intrinsics?
4220   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4221       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4222 
4223   MachineRegisterInfo *MRI = B.getMRI();
4224   const LLT S32 = LLT::scalar(32);
4225   const LLT S16 = LLT::scalar(16);
4226   const LLT V2S16 = LLT::vector(2, 16);
4227 
4228   unsigned DMask = 0;
4229 
4230   // Check for 16 bit addresses and pack if true.
4231   LLT GradTy =
4232       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4233   LLT AddrTy =
4234       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4235   const bool IsG16 = GradTy == S16;
4236   const bool IsA16 = AddrTy == S16;
4237 
4238   int DMaskLanes = 0;
4239   if (!BaseOpcode->Atomic) {
4240     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4241     if (BaseOpcode->Gather4) {
4242       DMaskLanes = 4;
4243     } else if (DMask != 0) {
4244       DMaskLanes = countPopulation(DMask);
4245     } else if (!IsTFE && !BaseOpcode->Store) {
4246       // If dmask is 0, this is a no-op load. This can be eliminated.
4247       B.buildUndef(MI.getOperand(0));
4248       MI.eraseFromParent();
4249       return true;
4250     }
4251   }
4252 
4253   Observer.changingInstr(MI);
4254   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4255 
4256   unsigned NewOpcode = NumDefs == 0 ?
4257     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4258 
4259   // Track that we legalized this
4260   MI.setDesc(B.getTII().get(NewOpcode));
4261 
4262   // TFE is enabled and dmask is 0, but an error flag is still expected. Force
4263   // dmask to be at least 1, otherwise the instruction will fail.
4264   if (IsTFE && DMask == 0) {
4265     DMask = 0x1;
4266     DMaskLanes = 1;
4267     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4268   }
4269 
4270   if (BaseOpcode->Atomic) {
4271     Register VData0 = MI.getOperand(2).getReg();
4272     LLT Ty = MRI->getType(VData0);
4273 
4274     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4275     if (Ty.isVector())
4276       return false;
4277 
4278     if (BaseOpcode->AtomicX2) {
4279       Register VData1 = MI.getOperand(3).getReg();
4280       // The two values are packed in one register.
4281       LLT PackedTy = LLT::vector(2, Ty);
4282       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4283       MI.getOperand(2).setReg(Concat.getReg(0));
4284       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4285     }
4286   }
4287 
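  // Track how many vaddr operands remain after the LOD folding below; this
  // count feeds the NSA decision for the non-16-bit path.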
4288   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4289 
  // Optimize _L to _LZ when the LOD argument is constant zero (or negative).
4291   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4292           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4293     const ConstantFP *ConstantLod;
4294 
4295     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4296                  m_GFCst(ConstantLod))) {
4297       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4298         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4299         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4300             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4301                                                       Intr->Dim);
4302 
4303         // The starting indexes should remain in the same place.
4304         --CorrectedNumVAddrs;
4305 
4306         MI.getOperand(MI.getNumExplicitDefs())
4307             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4308         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4309         Intr = NewImageDimIntr;
4310       }
4311     }
4312   }
4313 
  // Optimize _mip away when 'lod' is zero.
4315   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4316     int64_t ConstantLod;
4317     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4318                  m_ICst(ConstantLod))) {
4319       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4322         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4323         --CorrectedNumVAddrs;
4324       }
4325     }
4326   }
4327 
4328   // Rewrite the addressing register layout before doing anything else.
4329   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both the gradients and the addresses must be 16-bit.
4332     return false;
4333   }
4334 
4335   if (IsA16 && !ST.hasA16()) {
4336     // A16 not supported
4337     return false;
4338   }
4339 
4340   if (IsA16 || IsG16) {
4341     if (Intr->NumVAddrs > 1) {
4342       SmallVector<Register, 4> PackedRegs;
4343 
4344       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4345                                 IsG16);
4346 
4347       // See also below in the non-a16 branch
4348       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4349 
4350       if (!UseNSA && PackedRegs.size() > 1) {
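        // Without the NSA encoding, all address dwords must be contiguous, so
        // concatenate the packed pieces into a single vector register.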
4351         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4352         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4353         PackedRegs[0] = Concat.getReg(0);
4354         PackedRegs.resize(1);
4355       }
4356 
4357       const unsigned NumPacked = PackedRegs.size();
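      // Rewrite the vaddr operands to use the packed registers, and clear out
      // any trailing operands that are no longer needed.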
4358       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4359         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4360         if (!SrcOp.isReg()) {
4361           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4362           continue;
4363         }
4364 
4365         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4366 
4367         if (I - Intr->VAddrStart < NumPacked)
4368           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4369         else
4370           SrcOp.setReg(AMDGPU::NoRegister);
4371       }
4372     }
4373   } else {
4374     // If the register allocator cannot place the address registers contiguously
4375     // without introducing moves, then using the non-sequential address encoding
4376     // is always preferable, since it saves VALU instructions and is usually a
4377     // wash in terms of code size or even better.
4378     //
4379     // However, we currently have no way of hinting to the register allocator
4380     // that MIMG addresses should be placed contiguously when it is possible to
4381     // do so, so force non-NSA for the common 2-address case as a heuristic.
4382     //
4383     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4384     // allocation when possible.
4385     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4386 
4387     if (!UseNSA && Intr->NumVAddrs > 1)
4388       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4389                                Intr->NumVAddrs);
4390   }
4391 
4392   int Flags = 0;
4393   if (IsA16)
4394     Flags |= 1;
4395   if (IsG16)
4396     Flags |= 2;
4397   MI.addOperand(MachineOperand::CreateImm(Flags));
4398 
4399   if (BaseOpcode->Store) { // No TFE for stores?
4400     // TODO: Handle dmask trim
4401     Register VData = MI.getOperand(1).getReg();
4402     LLT Ty = MRI->getType(VData);
4403     if (!Ty.isVector() || Ty.getElementType() != S16)
4404       return true;
4405 
4406     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4407     if (RepackedReg != VData) {
4408       MI.getOperand(1).setReg(RepackedReg);
4409     }
4410 
4411     return true;
4412   }
4413 
4414   Register DstReg = MI.getOperand(0).getReg();
4415   LLT Ty = MRI->getType(DstReg);
4416   const LLT EltTy = Ty.getScalarType();
4417   const bool IsD16 = Ty.getScalarType() == S16;
4418   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4419 
4420   // Confirm that the return type is large enough for the dmask specified
4421   if (NumElts < DMaskLanes)
4422     return false;
4423 
4424   if (NumElts > 4 || DMaskLanes > 4)
4425     return false;
4426 
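  // Only the lanes selected by the dmask are actually loaded, but at least
  // one element is always read.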
4427   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4428   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4429 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4433   LLT RoundedTy;
4434 
  // S32 vector to cover all the data, plus the TFE result element.
4436   LLT TFETy;
4437 
4438   // Register type to use for each loaded component. Will be S32 or V2S16.
4439   LLT RegTy;
4440 
4441   if (IsD16 && ST.hasUnpackedD16VMem()) {
4442     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4443     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4444     RegTy = S32;
4445   } else {
4446     unsigned EltSize = EltTy.getSizeInBits();
4447     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4448     unsigned RoundedSize = 32 * RoundedElts;
4449     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4450     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4451     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4452   }
4453 
4454   // The return type does not need adjustment.
4455   // TODO: Should we change s16 case to s32 or <2 x s16>?
4456   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4457     return true;
4458 
4459   Register Dst1Reg;
4460 
4461   // Insert after the instruction.
4462   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4463 
4464   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4465   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4466   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4467   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4468 
4469   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4470 
4471   MI.getOperand(0).setReg(NewResultReg);
4472 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4477 
4478   if (IsTFE) {
4479     Dst1Reg = MI.getOperand(1).getReg();
4480     if (MRI->getType(Dst1Reg) != S32)
4481       return false;
4482 
4483     // TODO: Make sure the TFE operand bit is set.
4484     MI.RemoveOperand(1);
4485 
4486     // Handle the easy case that requires no repack instructions.
4487     if (Ty == S32) {
4488       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4489       return true;
4490     }
4491   }
4492 
4493   // Now figure out how to copy the new result register back into the old
4494   // result.
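  // Pre-fill the slots with the TFE status register (if any); the data slots
  // are overwritten below.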
4495   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4496 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4498 
4499   if (ResultNumRegs == 1) {
4500     assert(!IsTFE);
4501     ResultRegs[0] = NewResultReg;
4502   } else {
4503     // We have to repack into a new vector of some kind.
4504     for (int I = 0; I != NumDataRegs; ++I)
4505       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4506     B.buildUnmerge(ResultRegs, NewResultReg);
4507 
4508     // Drop the final TFE element to get the data part. The TFE result is
4509     // directly written to the right place already.
4510     if (IsTFE)
4511       ResultRegs.resize(NumDataRegs);
4512   }
4513 
4514   // For an s16 scalar result, we form an s32 result with a truncate regardless
4515   // of packed vs. unpacked.
4516   if (IsD16 && !Ty.isVector()) {
4517     B.buildTrunc(DstReg, ResultRegs[0]);
4518     return true;
4519   }
4520 
4521   // Avoid a build/concat_vector of 1 entry.
4522   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4523     B.buildBitcast(DstReg, ResultRegs[0]);
4524     return true;
4525   }
4526 
4527   assert(Ty.isVector());
4528 
4529   if (IsD16) {
4530     // For packed D16 results with TFE enabled, all the data components are
4531     // S32. Cast back to the expected type.
4532     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4535     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4536       for (Register &Reg : ResultRegs)
4537         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4538     } else if (ST.hasUnpackedD16VMem()) {
4539       for (Register &Reg : ResultRegs)
4540         Reg = B.buildTrunc(S16, Reg).getReg(0);
4541     }
4542   }
4543 
4544   auto padWithUndef = [&](LLT Ty, int NumElts) {
4545     if (NumElts == 0)
4546       return;
4547     Register Undef = B.buildUndef(Ty).getReg(0);
4548     for (int I = 0; I != NumElts; ++I)
4549       ResultRegs.push_back(Undef);
4550   };
4551 
4552   // Pad out any elements eliminated due to the dmask.
4553   LLT ResTy = MRI->getType(ResultRegs[0]);
4554   if (!ResTy.isVector()) {
4555     padWithUndef(ResTy, NumElts - ResultRegs.size());
4556     B.buildBuildVector(DstReg, ResultRegs);
4557     return true;
4558   }
4559 
4560   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4561   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4562 
4563   // Deal with the one annoying legal case.
4564   const LLT V3S16 = LLT::vector(3, 16);
4565   if (Ty == V3S16) {
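    // Pad to a <6 x s16>, then unmerge to peel off the <3 x s16> result.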
4566     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4567     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4568     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4569     return true;
4570   }
4571 
4572   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4573   B.buildConcatVectors(DstReg, ResultRegs);
4574   return true;
4575 }
4576 
4577 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4578   LegalizerHelper &Helper, MachineInstr &MI) const {
4579   MachineIRBuilder &B = Helper.MIRBuilder;
4580   GISelChangeObserver &Observer = Helper.Observer;
4581 
4582   Register Dst = MI.getOperand(0).getReg();
4583   LLT Ty = B.getMRI()->getType(Dst);
4584   unsigned Size = Ty.getSizeInBits();
4585   MachineFunction &MF = B.getMF();
4586 
4587   Observer.changingInstr(MI);
4588 
4589   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4590     Ty = getBitcastRegisterType(Ty);
4591     Helper.bitcastDst(MI, Ty, 0);
4592     Dst = MI.getOperand(0).getReg();
4593     B.setInsertPt(B.getMBB(), MI);
4594   }
4595 
4596   // FIXME: We don't really need this intermediate instruction. The intrinsic
4597   // should be fixed to have a memory operand. Since it's readnone, we're not
4598   // allowed to add one.
4599   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4600   MI.RemoveOperand(1); // Remove intrinsic ID
4601 
4602   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4603   // TODO: Should this use datalayout alignment?
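  // Round the result size in bits up to a whole number of bytes.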
4604   const unsigned MemSize = (Size + 7) / 8;
4605   const Align MemAlign(4);
4606   MachineMemOperand *MMO = MF.getMachineMemOperand(
4607       MachinePointerInfo(),
4608       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4609           MachineMemOperand::MOInvariant,
4610       MemSize, MemAlign);
4611   MI.addMemOperand(MF, MMO);
4612 
4613   // There are no 96-bit result scalar loads, but widening to 128-bit should
4614   // always be legal. We may need to restore this to a 96-bit result if it turns
4615   // out this needs to be converted to a vector load during RegBankSelect.
4616   if (!isPowerOf2_32(Size)) {
4617     if (Ty.isVector())
4618       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4619     else
4620       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4621   }
4622 
4623   Observer.changedInstr(MI);
4624   return true;
4625 }
4626 
4627 // TODO: Move to selection
4628 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4629                                                 MachineRegisterInfo &MRI,
4630                                                 MachineIRBuilder &B) const {
4631   if (!ST.isTrapHandlerEnabled() ||
4632       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4633     return legalizeTrapEndpgm(MI, MRI, B);
4634 
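  // ABI v2 and v3 always pass the queue pointer to the trap handler; v4 only
  // needs it when the doorbell ID cannot be retrieved directly.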
4635   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4636     switch (*HsaAbiVer) {
4637     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4638     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4639       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4640     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4641       return ST.supportsGetDoorbellID() ?
4642           legalizeTrapHsa(MI, MRI, B) :
4643           legalizeTrapHsaQueuePtr(MI, MRI, B);
4644     }
4645   }
4646 
4647   llvm_unreachable("Unknown trap handler");
4648 }
4649 
4650 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4651     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4652   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4653   MI.eraseFromParent();
4654   return true;
4655 }
4656 
4657 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4658     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input, and insert the
  // trap instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4661   Register LiveIn =
4662     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4663   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4664     return false;
4665 
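  // The queue pointer is passed to the trap handler in SGPR0-SGPR1 as an
  // implicit operand of S_TRAP.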
4666   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4667   B.buildCopy(SGPR01, LiveIn);
4668   B.buildInstr(AMDGPU::S_TRAP)
4669       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4670       .addReg(SGPR01, RegState::Implicit);
4671 
4672   MI.eraseFromParent();
4673   return true;
4674 }
4675 
4676 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4677     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4678   B.buildInstr(AMDGPU::S_TRAP)
4679       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4680   MI.eraseFromParent();
4681   return true;
4682 }
4683 
4684 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4685     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4688   if (!ST.isTrapHandlerEnabled() ||
4689       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4690     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4691                                      "debugtrap handler not supported",
4692                                      MI.getDebugLoc(), DS_Warning);
4693     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4694     Ctx.diagnose(NoTrap);
4695   } else {
4696     // Insert debug-trap instruction
4697     B.buildInstr(AMDGPU::S_TRAP)
4698         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4699   }
4700 
4701   MI.eraseFromParent();
4702   return true;
4703 }
4704 
4705 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4706                                                MachineIRBuilder &B) const {
4707   MachineRegisterInfo &MRI = *B.getMRI();
4708   const LLT S16 = LLT::scalar(16);
4709   const LLT S32 = LLT::scalar(32);
4710 
4711   Register DstReg = MI.getOperand(0).getReg();
4712   Register NodePtr = MI.getOperand(2).getReg();
4713   Register RayExtent = MI.getOperand(3).getReg();
4714   Register RayOrigin = MI.getOperand(4).getReg();
4715   Register RayDir = MI.getOperand(5).getReg();
4716   Register RayInvDir = MI.getOperand(6).getReg();
4717   Register TDescr = MI.getOperand(7).getReg();
4718 
4719   if (!ST.hasGFX10_AEncoding()) {
4720     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4721                                         "intrinsic not supported on subtarget",
4722                                         MI.getDebugLoc());
4723     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4724     return false;
4725   }
4726 
4727   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4729   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4730                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4731                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4732                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4733 
4734   SmallVector<Register, 12> Ops;
4735   if (Is64) {
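    // Split the 64-bit node pointer into two 32-bit dwords.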
4736     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4737     Ops.push_back(Unmerge.getReg(0));
4738     Ops.push_back(Unmerge.getReg(1));
4739   } else {
4740     Ops.push_back(NodePtr);
4741   }
4742   Ops.push_back(RayExtent);
4743 
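  // Push the first three 32-bit lanes of a four-element vector operand; the
  // last lane is unused.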
4744   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4745     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4746     Ops.push_back(Unmerge.getReg(0));
4747     Ops.push_back(Unmerge.getReg(1));
4748     Ops.push_back(Unmerge.getReg(2));
4749   };
4750 
4751   packLanes(RayOrigin);
4752   if (IsA16) {
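    // Pack the 16-bit ray direction and inverse direction components pairwise
    // into three 32-bit registers: {dir.xy}, {dir.z, invdir.x}, {invdir.yz}.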
4753     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4754     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4755     Register R1 = MRI.createGenericVirtualRegister(S32);
4756     Register R2 = MRI.createGenericVirtualRegister(S32);
4757     Register R3 = MRI.createGenericVirtualRegister(S32);
4758     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4759     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4760     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4761     Ops.push_back(R1);
4762     Ops.push_back(R2);
4763     Ops.push_back(R3);
4764   } else {
4765     packLanes(RayDir);
4766     packLanes(RayInvDir);
4767   }
4768 
4769   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4770     .addDef(DstReg)
4771     .addImm(Opcode);
4772 
4773   for (Register R : Ops) {
4774     MIB.addUse(R);
4775   }
4776 
4777   MIB.addUse(TDescr)
4778      .addImm(IsA16 ? 1 : 0)
4779      .cloneMemRefs(MI);
4780 
4781   MI.eraseFromParent();
4782   return true;
4783 }
4784 
4785 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4786                                             MachineInstr &MI) const {
4787   MachineIRBuilder &B = Helper.MIRBuilder;
4788   MachineRegisterInfo &MRI = *B.getMRI();
4789 
  // Replace the G_BRCOND that uses this intrinsic with the exec-manipulating
  // branch pseudos.
4791   auto IntrID = MI.getIntrinsicID();
4792   switch (IntrID) {
4793   case Intrinsic::amdgcn_if:
4794   case Intrinsic::amdgcn_else: {
4795     MachineInstr *Br = nullptr;
4796     MachineBasicBlock *UncondBrTarget = nullptr;
4797     bool Negated = false;
4798     if (MachineInstr *BrCond =
4799             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4800       const SIRegisterInfo *TRI
4801         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4802 
4803       Register Def = MI.getOperand(1).getReg();
4804       Register Use = MI.getOperand(3).getReg();
4805 
4806       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4807 
4808       if (Negated)
4809         std::swap(CondBrTarget, UncondBrTarget);
4810 
4811       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4812       if (IntrID == Intrinsic::amdgcn_if) {
4813         B.buildInstr(AMDGPU::SI_IF)
4814           .addDef(Def)
4815           .addUse(Use)
4816           .addMBB(UncondBrTarget);
4817       } else {
4818         B.buildInstr(AMDGPU::SI_ELSE)
4819             .addDef(Def)
4820             .addUse(Use)
4821             .addMBB(UncondBrTarget);
4822       }
4823 
4824       if (Br) {
4825         Br->getOperand(0).setMBB(CondBrTarget);
4826       } else {
4827         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4828         // since we're swapping branch targets it needs to be reinserted.
4829         // FIXME: IRTranslator should probably not do this
4830         B.buildBr(*CondBrTarget);
4831       }
4832 
4833       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4834       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4835       MI.eraseFromParent();
4836       BrCond->eraseFromParent();
4837       return true;
4838     }
4839 
4840     return false;
4841   }
4842   case Intrinsic::amdgcn_loop: {
4843     MachineInstr *Br = nullptr;
4844     MachineBasicBlock *UncondBrTarget = nullptr;
4845     bool Negated = false;
4846     if (MachineInstr *BrCond =
4847             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4848       const SIRegisterInfo *TRI
4849         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4850 
4851       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4852       Register Reg = MI.getOperand(2).getReg();
4853 
4854       if (Negated)
4855         std::swap(CondBrTarget, UncondBrTarget);
4856 
4857       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4858       B.buildInstr(AMDGPU::SI_LOOP)
4859         .addUse(Reg)
4860         .addMBB(UncondBrTarget);
4861 
4862       if (Br)
4863         Br->getOperand(0).setMBB(CondBrTarget);
4864       else
4865         B.buildBr(*CondBrTarget);
4866 
4867       MI.eraseFromParent();
4868       BrCond->eraseFromParent();
4869       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4870       return true;
4871     }
4872 
4873     return false;
4874   }
4875   case Intrinsic::amdgcn_kernarg_segment_ptr:
4876     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4877       // This only makes sense to call in a kernel, so just lower to null.
4878       B.buildConstant(MI.getOperand(0).getReg(), 0);
4879       MI.eraseFromParent();
4880       return true;
4881     }
4882 
4883     return legalizePreloadedArgIntrin(
4884       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4885   case Intrinsic::amdgcn_implicitarg_ptr:
4886     return legalizeImplicitArgPtr(MI, MRI, B);
4887   case Intrinsic::amdgcn_workitem_id_x:
4888     return legalizePreloadedArgIntrin(MI, MRI, B,
4889                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4890   case Intrinsic::amdgcn_workitem_id_y:
4891     return legalizePreloadedArgIntrin(MI, MRI, B,
4892                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4893   case Intrinsic::amdgcn_workitem_id_z:
4894     return legalizePreloadedArgIntrin(MI, MRI, B,
4895                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4896   case Intrinsic::amdgcn_workgroup_id_x:
4897     return legalizePreloadedArgIntrin(MI, MRI, B,
4898                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4899   case Intrinsic::amdgcn_workgroup_id_y:
4900     return legalizePreloadedArgIntrin(MI, MRI, B,
4901                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4902   case Intrinsic::amdgcn_workgroup_id_z:
4903     return legalizePreloadedArgIntrin(MI, MRI, B,
4904                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4905   case Intrinsic::amdgcn_dispatch_ptr:
4906     return legalizePreloadedArgIntrin(MI, MRI, B,
4907                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4908   case Intrinsic::amdgcn_queue_ptr:
4909     return legalizePreloadedArgIntrin(MI, MRI, B,
4910                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4911   case Intrinsic::amdgcn_implicit_buffer_ptr:
4912     return legalizePreloadedArgIntrin(
4913       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4914   case Intrinsic::amdgcn_dispatch_id:
4915     return legalizePreloadedArgIntrin(MI, MRI, B,
4916                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4917   case Intrinsic::amdgcn_fdiv_fast:
4918     return legalizeFDIVFastIntrin(MI, MRI, B);
4919   case Intrinsic::amdgcn_is_shared:
4920     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4921   case Intrinsic::amdgcn_is_private:
4922     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4923   case Intrinsic::amdgcn_wavefrontsize: {
4924     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4925     MI.eraseFromParent();
4926     return true;
4927   }
4928   case Intrinsic::amdgcn_s_buffer_load:
4929     return legalizeSBufferLoad(Helper, MI);
4930   case Intrinsic::amdgcn_raw_buffer_store:
4931   case Intrinsic::amdgcn_struct_buffer_store:
4932     return legalizeBufferStore(MI, MRI, B, false, false);
4933   case Intrinsic::amdgcn_raw_buffer_store_format:
4934   case Intrinsic::amdgcn_struct_buffer_store_format:
4935     return legalizeBufferStore(MI, MRI, B, false, true);
4936   case Intrinsic::amdgcn_raw_tbuffer_store:
4937   case Intrinsic::amdgcn_struct_tbuffer_store:
4938     return legalizeBufferStore(MI, MRI, B, true, true);
4939   case Intrinsic::amdgcn_raw_buffer_load:
4940   case Intrinsic::amdgcn_struct_buffer_load:
4941     return legalizeBufferLoad(MI, MRI, B, false, false);
4942   case Intrinsic::amdgcn_raw_buffer_load_format:
4943   case Intrinsic::amdgcn_struct_buffer_load_format:
4944     return legalizeBufferLoad(MI, MRI, B, true, false);
4945   case Intrinsic::amdgcn_raw_tbuffer_load:
4946   case Intrinsic::amdgcn_struct_tbuffer_load:
4947     return legalizeBufferLoad(MI, MRI, B, true, true);
4948   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4949   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4950   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4951   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4952   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4953   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4954   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4955   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4956   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4957   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4958   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4959   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4960   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4961   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4962   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4963   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4964   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4965   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4966   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4967   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4968   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4969   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4970   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4971   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4972   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4973   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4974   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4975   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4976   case Intrinsic::amdgcn_buffer_atomic_fadd:
4977   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4978   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4979   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4980   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4981     return legalizeBufferAtomic(MI, B, IntrID);
4982   case Intrinsic::amdgcn_atomic_inc:
4983     return legalizeAtomicIncDec(MI, B, true);
4984   case Intrinsic::amdgcn_atomic_dec:
4985     return legalizeAtomicIncDec(MI, B, false);
4986   case Intrinsic::trap:
4987     return legalizeTrapIntrinsic(MI, MRI, B);
4988   case Intrinsic::debugtrap:
4989     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4990   case Intrinsic::amdgcn_rsq_clamp:
4991     return legalizeRsqClampIntrinsic(MI, MRI, B);
4992   case Intrinsic::amdgcn_ds_fadd:
4993   case Intrinsic::amdgcn_ds_fmin:
4994   case Intrinsic::amdgcn_ds_fmax:
4995     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4996   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4997     return legalizeBVHIntrinsic(MI, B);
4998   default: {
4999     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5000             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5001       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5002     return true;
5003   }
5004   }
5005 
5006   return true;
5007 }
5008