1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
// Round the number of bits up to the next power of two
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
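// True for vectors of 16-bit elements with more than two elements (i.e. wider
// than v2s16).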
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
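// Reduce the element count so that each resulting piece fits in 64 bits, e.g.
// <4 x s32> is broken into <2 x s32> pieces.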
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
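// Pick a register-like type to bitcast to: at most 32 bits becomes a scalar of
// that size; anything larger is coerced to 32-bit elements.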
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
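// Bitcast to a vector of 32-bit elements; the total size must already be a
// multiple of 32 bits.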
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
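// Return true for vectors whose element type is s16 or at least 32 bits wide.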
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 // If we have a truncating store or an extending load with a data size larger
226 // than 32-bits, we need to reduce to a 32-bit type.
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
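// Check whether a load or store with this register type, memory size, address
// space and alignment is directly legal for the subtarget.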
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // Do not handle extending vector loads.
280   if (Ty.isVector() && MemSize != RegSize)
281     return false;
282 
283   // TODO: We should be able to widen loads if the alignment is high enough, but
284   // we also need to modify the memory access size.
285 #if 0
286   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
289 #endif
290 
291   // Only 1-byte and 2-byte to 32-bit extloads are valid.
292   if (MemSize != RegSize && RegSize != 32)
293     return false;
294 
295   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296     return false;
297 
298   switch (MemSize) {
299   case 8:
300   case 16:
301   case 32:
302   case 64:
303   case 128:
304     break;
305   case 96:
306     if (!ST.hasDwordx3LoadStores())
307       return false;
308     break;
309   case 256:
310   case 512:
311     // These may contextually need to be broken down.
312     break;
313   default:
314     return false;
315   }
316 
317   assert(RegSize >= MemSize);
318 
319   if (AlignBits < MemSize) {
320     const SITargetLowering *TLI = ST.getTargetLowering();
321     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322                                                  Align(AlignBits / 8)))
323       return false;
324   }
325 
326   return true;
327 }
328 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334   if (EnableNewLegality)
335     return false;
336 
337   const unsigned Size = Ty.getSizeInBits();
338   if (Size <= 64)
339     return false;
340   if (!Ty.isVector())
341     return true;
342 
343   LLT EltTy = Ty.getElementType();
344   if (EltTy.isPointer())
345     return true;
346 
347   unsigned EltSize = EltTy.getSizeInBits();
348   return EltSize != 32 && EltSize != 64;
349 }
350 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
352   const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360                                        const unsigned MemSizeInBits) {
361   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375                             unsigned AlignInBits, unsigned AddrSpace,
376                             unsigned Opcode) {
377   // We don't want to widen cases that are naturally legal.
378   if (isPowerOf2_32(SizeInBits))
379     return false;
380 
381   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382   // end up widening these for a scalar load during RegBankSelect, since there
383   // aren't 96-bit scalar loads.
384   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385     return false;
386 
387   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388     return false;
389 
390   // A load is known dereferenceable up to the alignment, so it's legal to widen
391   // to it.
392   //
393   // TODO: Could check dereferenceable for less aligned cases.
394   unsigned RoundedSize = NextPowerOf2(SizeInBits);
395   if (AlignInBits < RoundedSize)
396     return false;
397 
398   // Do not widen if it would introduce a slow unaligned load.
399   const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402              RoundedSize, AddrSpace, Align(AlignInBits / 8),
403              MachineMemOperand::MOLoad, &Fast) &&
404          Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408                             unsigned Opcode) {
409   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410     return false;
411 
412   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413                          Query.MMODescrs[0].AlignInBits,
414                          Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
  : ST(ST_) {
420   using namespace TargetOpcode;
421 
422   auto GetAddrSpacePtr = [&TM](unsigned AS) {
423     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424   };
425 
426   const LLT S1 = LLT::scalar(1);
427   const LLT S8 = LLT::scalar(8);
428   const LLT S16 = LLT::scalar(16);
429   const LLT S32 = LLT::scalar(32);
430   const LLT S64 = LLT::scalar(64);
431   const LLT S128 = LLT::scalar(128);
432   const LLT S256 = LLT::scalar(256);
433   const LLT S512 = LLT::scalar(512);
434   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436   const LLT V2S8 = LLT::vector(2, 8);
437   const LLT V2S16 = LLT::vector(2, 16);
438   const LLT V4S16 = LLT::vector(4, 16);
439 
440   const LLT V2S32 = LLT::vector(2, 32);
441   const LLT V3S32 = LLT::vector(3, 32);
442   const LLT V4S32 = LLT::vector(4, 32);
443   const LLT V5S32 = LLT::vector(5, 32);
444   const LLT V6S32 = LLT::vector(6, 32);
445   const LLT V7S32 = LLT::vector(7, 32);
446   const LLT V8S32 = LLT::vector(8, 32);
447   const LLT V9S32 = LLT::vector(9, 32);
448   const LLT V10S32 = LLT::vector(10, 32);
449   const LLT V11S32 = LLT::vector(11, 32);
450   const LLT V12S32 = LLT::vector(12, 32);
451   const LLT V13S32 = LLT::vector(13, 32);
452   const LLT V14S32 = LLT::vector(14, 32);
453   const LLT V15S32 = LLT::vector(15, 32);
454   const LLT V16S32 = LLT::vector(16, 32);
455   const LLT V32S32 = LLT::vector(32, 32);
456 
457   const LLT V2S64 = LLT::vector(2, 64);
458   const LLT V3S64 = LLT::vector(3, 64);
459   const LLT V4S64 = LLT::vector(4, 64);
460   const LLT V5S64 = LLT::vector(5, 64);
461   const LLT V6S64 = LLT::vector(6, 64);
462   const LLT V7S64 = LLT::vector(7, 64);
463   const LLT V8S64 = LLT::vector(8, 64);
464   const LLT V16S64 = LLT::vector(16, 64);
465 
466   std::initializer_list<LLT> AllS32Vectors =
467     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469   std::initializer_list<LLT> AllS64Vectors =
470     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
480   const LLT CodePtr = FlatPtr;
481 
482   const std::initializer_list<LLT> AddrSpaces64 = {
483     GlobalPtr, ConstantPtr, FlatPtr
484   };
485 
486   const std::initializer_list<LLT> AddrSpaces32 = {
487     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488   };
489 
490   const std::initializer_list<LLT> FPTypesBase = {
491     S32, S64
492   };
493 
494   const std::initializer_list<LLT> FPTypes16 = {
495     S32, S64, S16
496   };
497 
498   const std::initializer_list<LLT> FPTypesPK16 = {
499     S32, S64, S16, V2S16
500   };
501 
502   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504   auto &LegacyInfo = getLegacyLegalizerInfo();
505   LegacyInfo.setAction({G_BRCOND, S1},
506                        LegacyLegalizeActions::Legal); // VCC branches
507   LegacyInfo.setAction({G_BRCOND, S32},
508                        LegacyLegalizeActions::Legal); // SCC branches
509 
510   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511   // elements for v3s16
512   getActionDefinitionsBuilder(G_PHI)
513     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514     .legalFor(AllS32Vectors)
515     .legalFor(AllS64Vectors)
516     .legalFor(AddrSpaces64)
517     .legalFor(AddrSpaces32)
518     .legalIf(isPointer(0))
519     .clampScalar(0, S16, S256)
520     .widenScalarToNextPow2(0, 32)
521     .clampMaxNumElements(0, S32, 16)
522     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523     .scalarize(0);
524 
525   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526     // Full set of gfx9 features.
527     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528       .legalFor({S32, S16, V2S16})
529       .clampScalar(0, S16, S32)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32);
533 
534     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535       .legalFor({S32, S16, V2S16}) // Clamp modifier
536       .minScalarOrElt(0, S16)
537       .clampMaxNumElements(0, S16, 2)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32)
540       .lower();
541   } else if (ST.has16BitInsts()) {
542     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543       .legalFor({S32, S16})
544       .clampScalar(0, S16, S32)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547 
548     // Technically the saturating operations require clamp bit support, but this
549     // was introduced at the same time as 16-bit operations.
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551       .legalFor({S32, S16}) // Clamp modifier
552       .minScalar(0, S16)
553       .scalarize(0)
554       .widenScalarToNextPow2(0, 16)
555       .lower();
556 
557     // We're just lowering this, but it helps get a better result to try to
558     // coerce to the desired type first.
559     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560       .minScalar(0, S16)
561       .scalarize(0)
562       .lower();
563   } else {
564     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565       .legalFor({S32})
566       .clampScalar(0, S32, S32)
567       .scalarize(0);
568 
569     if (ST.hasIntClamp()) {
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .legalFor({S32}) // Clamp modifier.
572         .scalarize(0)
573         .minScalarOrElt(0, S32)
574         .lower();
575     } else {
576       // Clamp bit support was added in VI, along with 16-bit operations.
577       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578         .minScalar(0, S32)
579         .scalarize(0)
580         .lower();
581     }
582 
583     // FIXME: DAG expansion gets better results. The widening uses the smaller
584     // range values and goes for the min/max lowering directly.
585     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586       .minScalar(0, S32)
587       .scalarize(0)
588       .lower();
589   }
590 
591   getActionDefinitionsBuilder(
592       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593       .customFor({S32, S64})
594       .clampScalar(0, S32, S64)
595       .widenScalarToNextPow2(0, 32)
596       .scalarize(0);
597 
598   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599                    .legalFor({S32})
600                    .maxScalarOrElt(0, S32);
601 
602   if (ST.hasVOP3PInsts()) {
603     Mulh
604       .clampMaxNumElements(0, S8, 2)
605       .lowerFor({V2S8});
606   }
607 
608   Mulh
609     .scalarize(0)
610     .lower();
611 
612   // Report legal for any types we can handle anywhere. For the cases only legal
613   // on the SALU, RegBankSelect will be able to re-legalize.
614   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616     .clampScalar(0, S32, S64)
617     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619     .widenScalarToNextPow2(0)
620     .scalarize(0);
621 
622   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624     .legalFor({{S32, S1}, {S32, S32}})
625     .minScalar(0, S32)
626     // TODO: .scalarize(0)
627     .lower();
628 
629   getActionDefinitionsBuilder(G_BITCAST)
630     // Don't worry about the size constraint.
631     .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
636     .legalFor({S1, S32, S64, S16, GlobalPtr,
637                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638     .legalIf(isPointer(0))
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(0);
641 
642   getActionDefinitionsBuilder(G_FCONSTANT)
643     .legalFor({S32, S64, S16})
644     .clampScalar(0, S16, S64);
645 
646   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647       .legalIf(isRegisterType(0))
648       // s1 and s16 are special cases because they have legal operations on
649       // them, but don't really occupy registers in the normal way.
650       .legalFor({S1, S16})
651       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652       .clampScalarOrElt(0, S32, MaxScalar)
653       .widenScalarToNextPow2(0, 32)
654       .clampMaxNumElements(0, S32, 16);
655 
656   LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657                        LegacyLegalizeActions::Legal);
658 
659   // If the amount is divergent, we have to do a wave reduction to get the
660   // maximum value, so this is expanded during RegBankSelect.
661   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662     .legalFor({{PrivatePtr, S32}});
663 
664   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665     .customIf(typeIsNot(0, PrivatePtr));
666 
667   LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668 
669   auto &FPOpActions = getActionDefinitionsBuilder(
670     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671     .legalFor({S32, S64});
672   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673     .customFor({S32, S64});
674   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675     .customFor({S32, S64});
676 
677   if (ST.has16BitInsts()) {
678     if (ST.hasVOP3PInsts())
679       FPOpActions.legalFor({S16, V2S16});
680     else
681       FPOpActions.legalFor({S16});
682 
683     TrigActions.customFor({S16});
684     FDIVActions.customFor({S16});
685   }
686 
687   auto &MinNumMaxNum = getActionDefinitionsBuilder({
688       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690   if (ST.hasVOP3PInsts()) {
691     MinNumMaxNum.customFor(FPTypesPK16)
692       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693       .clampMaxNumElements(0, S16, 2)
694       .clampScalar(0, S16, S64)
695       .scalarize(0);
696   } else if (ST.has16BitInsts()) {
697     MinNumMaxNum.customFor(FPTypes16)
698       .clampScalar(0, S16, S64)
699       .scalarize(0);
700   } else {
701     MinNumMaxNum.customFor(FPTypesBase)
702       .clampScalar(0, S32, S64)
703       .scalarize(0);
704   }
705 
706   if (ST.hasVOP3PInsts())
707     FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709   FPOpActions
710     .scalarize(0)
711     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713   TrigActions
714     .scalarize(0)
715     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717   FDIVActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   getActionDefinitionsBuilder({G_FNEG, G_FABS})
722     .legalFor(FPTypesPK16)
723     .clampMaxNumElements(0, S16, 2)
724     .scalarize(0)
725     .clampScalar(0, S16, S64);
726 
727   if (ST.has16BitInsts()) {
728     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729       .legalFor({S32, S64, S16})
730       .scalarize(0)
731       .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735       .scalarize(0)
736       .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741         .legalFor({S32, S64})
742         .scalarize(0)
743         .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     }
750   }
751 
752   getActionDefinitionsBuilder(G_FPTRUNC)
753     .legalFor({{S32, S64}, {S16, S32}})
754     .scalarize(0)
755     .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760     .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763       // Use actual fsub instruction
764       .legalFor({S32})
765       // Must use fadd + fneg
766       .lowerFor({S64, S16, V2S16})
767       .scalarize(0)
768       .clampScalar(0, S32, S64);
769 
770   // Whether this is legal depends on the floating point mode for the function.
771   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773     FMad.customFor({S32, S16});
774   else if (ST.hasMadMacF32Insts())
775     FMad.customFor({S32});
776   else if (ST.hasMadF16())
777     FMad.customFor({S16});
778   FMad.scalarize(0)
779       .lower();
780 
781   auto &FRem = getActionDefinitionsBuilder(G_FREM);
782   if (ST.has16BitInsts()) {
783     FRem.customFor({S16, S32, S64});
784   } else {
785     FRem.minScalar(0, S32)
786         .customFor({S32, S64});
787   }
788   FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793     .legalFor({{V2S16, V2S32}})
794     .clampMaxNumElements(0, S16, 2)
795     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796     // situations (like an invalid implicit use), we don't want to infinite loop
797     // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803                {S32, S1}, {S64, S1}, {S16, S1}})
804     .scalarize(0)
805     .clampScalar(0, S32, S64)
806     .widenScalarToNextPow2(1, 32);
807 
808   // TODO: Split s1->s64 during regbankselect for VALU.
809   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811     .lowerFor({{S32, S64}})
812     .lowerIf(typeIs(1, S1))
813     .customFor({{S64, S64}});
814   if (ST.has16BitInsts())
815     IToFP.legalFor({{S16, S16}});
816   IToFP.clampScalar(1, S32, S64)
817        .minScalar(0, S32)
818        .scalarize(0)
819        .widenScalarToNextPow2(1);
820 
821   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823     .customFor({{S64, S64}})
824     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825   if (ST.has16BitInsts())
826     FPToI.legalFor({{S16, S16}});
827   else
828     FPToI.minScalar(1, S32);
829 
830   FPToI.minScalar(0, S32)
831        .widenScalarToNextPow2(0, 32)
832        .scalarize(0)
833        .lower();
834 
835   // Lower roundeven into G_FRINT
836   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837     .scalarize(0)
838     .lower();
839 
840   if (ST.has16BitInsts()) {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S16, S32, S64})
843       .clampScalar(0, S16, S64)
844       .scalarize(0);
845   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S32, S64})
848       .clampScalar(0, S32, S64)
849       .scalarize(0);
850   } else {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32})
853       .customFor({S64})
854       .clampScalar(0, S32, S64)
855       .scalarize(0);
856   }
857 
858   getActionDefinitionsBuilder(G_PTR_ADD)
859     .legalIf(all(isPointer(0), sameSize(0, 1)))
860     .scalarize(0)
861     .scalarSameSizeAs(1, 0);
862 
863   getActionDefinitionsBuilder(G_PTRMASK)
864     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865     .scalarSameSizeAs(1, 0)
866     .scalarize(0);
867 
868   auto &CmpBuilder =
869     getActionDefinitionsBuilder(G_ICMP)
870     // The compare output type differs based on the register bank of the output,
871     // so make both s1 and s32 legal.
872     //
873     // Scalar compares producing output in scc will be promoted to s32, as that
874     // is the allocatable register type that will be needed for the copy from
875     // scc. This will be promoted during RegBankSelect, and we assume something
876     // before that won't try to use s32 result types.
877     //
878     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879     // bank.
880     .legalForCartesianProduct(
881       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882     .legalForCartesianProduct(
883       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884   if (ST.has16BitInsts()) {
885     CmpBuilder.legalFor({{S1, S16}});
886   }
887 
888   CmpBuilder
889     .widenScalarToNextPow2(1)
890     .clampScalar(1, S32, S64)
891     .scalarize(0)
892     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894   getActionDefinitionsBuilder(G_FCMP)
895     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0);
899 
900   // FIXME: fpow has a selection pattern that should move to custom lowering.
901   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902   if (ST.has16BitInsts())
903     Exp2Ops.legalFor({S32, S16});
904   else
905     Exp2Ops.legalFor({S32});
906   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907   Exp2Ops.scalarize(0);
908 
909   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910   if (ST.has16BitInsts())
911     ExpOps.customFor({{S32}, {S16}});
912   else
913     ExpOps.customFor({S32});
914   ExpOps.clampScalar(0, MinScalarFPTy, S32)
915         .scalarize(0);
916 
917   getActionDefinitionsBuilder(G_FPOWI)
918     .clampScalar(0, MinScalarFPTy, S32)
919     .lower();
920 
921   // The 64-bit versions produce 32-bit results, but only on the SALU.
922   getActionDefinitionsBuilder(G_CTPOP)
923     .legalFor({{S32, S32}, {S32, S64}})
924     .clampScalar(0, S32, S32)
925     .clampScalar(1, S32, S64)
926     .scalarize(0)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32);
929 
930   // The hardware instructions return a different result on 0 than the generic
931   // instructions expect. The hardware produces -1, but these produce the
932   // bitwidth.
933   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934     .scalarize(0)
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32)
939     .lower();
940 
941   // The 64-bit versions produce 32-bit results, but only on the SALU.
942   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943     .legalFor({{S32, S32}, {S32, S64}})
944     .clampScalar(0, S32, S32)
945     .clampScalar(1, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0, 32)
948     .widenScalarToNextPow2(1, 32);
949 
950   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951   // RegBankSelect.
952   getActionDefinitionsBuilder(G_BITREVERSE)
953     .legalFor({S32, S64})
954     .clampScalar(0, S32, S64)
955     .scalarize(0)
956     .widenScalarToNextPow2(0);
957 
958   if (ST.has16BitInsts()) {
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S16, S32, V2S16})
961       .clampMaxNumElements(0, S16, 2)
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .clampScalar(0, S16, S32)
966       .scalarize(0);
967 
968     if (ST.hasVOP3PInsts()) {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970         .legalFor({S32, S16, V2S16})
971         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972         .clampMaxNumElements(0, S16, 2)
973         .minScalar(0, S16)
974         .widenScalarToNextPow2(0)
975         .scalarize(0)
976         .lower();
977     } else {
978       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979         .legalFor({S32, S16})
980         .widenScalarToNextPow2(0)
981         .minScalar(0, S16)
982         .scalarize(0)
983         .lower();
984     }
985   } else {
986     // TODO: Should have same legality without v_perm_b32
987     getActionDefinitionsBuilder(G_BSWAP)
988       .legalFor({S32})
989       .lowerIf(scalarNarrowerThan(0, 32))
990       // FIXME: Fixing non-power-of-2 before clamp is workaround for
991       // narrowScalar limitation.
992       .widenScalarToNextPow2(0)
993       .maxScalar(0, S32)
994       .scalarize(0)
995       .lower();
996 
997     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998       .legalFor({S32})
999       .minScalar(0, S32)
1000       .widenScalarToNextPow2(0)
1001       .scalarize(0)
1002       .lower();
1003   }
1004 
1005   getActionDefinitionsBuilder(G_INTTOPTR)
1006     // List the common cases
1007     .legalForCartesianProduct(AddrSpaces64, {S64})
1008     .legalForCartesianProduct(AddrSpaces32, {S32})
1009     .scalarize(0)
1010     // Accept any address space as long as the size matches
1011     .legalIf(sameSize(0, 1))
1012     .widenScalarIf(smallerThan(1, 0),
1013       [](const LegalityQuery &Query) {
1014         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015       })
1016     .narrowScalarIf(largerThan(1, 0),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019       });
1020 
1021   getActionDefinitionsBuilder(G_PTRTOINT)
1022     // List the common cases
1023     .legalForCartesianProduct(AddrSpaces64, {S64})
1024     .legalForCartesianProduct(AddrSpaces32, {S32})
1025     .scalarize(0)
1026     // Accept any address space as long as the size matches
1027     .legalIf(sameSize(0, 1))
1028     .widenScalarIf(smallerThan(0, 1),
1029       [](const LegalityQuery &Query) {
1030         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031       })
1032     .narrowScalarIf(
1033       largerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       });
1037 
1038   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039     .scalarize(0)
1040     .custom();
1041 
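  // Decide whether a load or store must be split into multiple operations
  // because of its memory size, the address space's limits, or insufficient
  // alignment.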
1042   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043                                     bool IsLoad) -> bool {
1044     const LLT DstTy = Query.Types[0];
1045 
1046     // Split vector extloads.
1047     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050     if (MemSize < DstTy.getSizeInBits())
1051       MemSize = std::max(MemSize, AlignBits);
1052 
1053     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054       return true;
1055 
1056     const LLT PtrTy = Query.Types[1];
1057     unsigned AS = PtrTy.getAddressSpace();
1058     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059       return true;
1060 
1061     // Catch weird sized loads that don't evenly divide into the access sizes
1062     // TODO: May be able to widen depending on alignment etc.
1063     unsigned NumRegs = (MemSize + 31) / 32;
1064     if (NumRegs == 3) {
1065       if (!ST.hasDwordx3LoadStores())
1066         return true;
1067     } else {
1068       // If the alignment allows, these should have been widened.
1069       if (!isPowerOf2_32(NumRegs))
1070         return true;
1071     }
1072 
1073     if (AlignBits < MemSize) {
1074       const SITargetLowering *TLI = ST.getTargetLowering();
1075       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076                                                       Align(AlignBits / 8));
1077     }
1078 
1079     return false;
1080   };
1081 
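  // Minimum alignment to require for global/constant accesses of each width;
  // 0 (no restriction) when the subtarget allows unaligned buffer access.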
1082   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085 
1086   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087   // LDS
1088   // TODO: Unsupported flat for SI.
1089 
1090   for (unsigned Op : {G_LOAD, G_STORE}) {
1091     const bool IsStore = Op == G_STORE;
1092 
1093     auto &Actions = getActionDefinitionsBuilder(Op);
1094     // Explicitly list some common cases.
1095     // TODO: Does this help compile time at all?
1096     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1098                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1099                                       {S64, GlobalPtr, 64, GlobalAlign32},
1100                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1101                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1102                                       {S32, GlobalPtr, 8, GlobalAlign8},
1103                                       {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105                                       {S32, LocalPtr, 32, 32},
1106                                       {S64, LocalPtr, 64, 32},
1107                                       {V2S32, LocalPtr, 64, 32},
1108                                       {S32, LocalPtr, 8, 8},
1109                                       {S32, LocalPtr, 16, 16},
1110                                       {V2S16, LocalPtr, 32, 32},
1111 
1112                                       {S32, PrivatePtr, 32, 32},
1113                                       {S32, PrivatePtr, 8, 8},
1114                                       {S32, PrivatePtr, 16, 16},
1115                                       {V2S16, PrivatePtr, 32, 32},
1116 
1117                                       {S32, ConstantPtr, 32, GlobalAlign32},
1118                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1119                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1120                                       {S64, ConstantPtr, 64, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122     Actions.legalIf(
1123       [=](const LegalityQuery &Query) -> bool {
1124         return isLoadStoreLegal(ST, Query);
1125       });
1126 
1127     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128     // 64-bits.
1129     //
1130     // TODO: Should generalize bitcast action into coerce, which will also cover
1131     // inserting addrspacecasts.
1132     Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134     // Turn any illegal element vectors into something easier to deal
1135     // with. These will ultimately produce 32-bit scalar shifts to extract the
1136     // parts anyway.
1137     //
1138     // For odd 16-bit element vectors, prefer to split those into pieces with
1139     // 16-bit vector parts.
1140     Actions.bitcastIf(
1141       [=](const LegalityQuery &Query) -> bool {
1142         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143                                           Query.MMODescrs[0].SizeInBits);
1144       }, bitcastToRegisterType(0));
1145 
1146     if (!IsStore) {
1147       // Widen suitably aligned loads by loading extra bytes. The standard
1148       // legalization actions can't properly express widening memory operands.
1149       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150         return shouldWidenLoad(ST, Query, G_LOAD);
1151       });
1152     }
1153 
1154     // FIXME: load/store narrowing should be moved to lower action
1155     Actions
1156         .narrowScalarIf(
1157             [=](const LegalityQuery &Query) -> bool {
1158               return !Query.Types[0].isVector() &&
1159                      needToSplitMemOp(Query, Op == G_LOAD);
1160             },
1161             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162               const LLT DstTy = Query.Types[0];
1163               const LLT PtrTy = Query.Types[1];
1164 
1165               const unsigned DstSize = DstTy.getSizeInBits();
1166               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168               // Split extloads.
1169               if (DstSize > MemSize)
1170                 return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172               if (!isPowerOf2_32(DstSize)) {
1173                 // We're probably decomposing an odd sized store. Try to split
1174                 // to the widest type. TODO: Account for alignment. As-is it
1175                 // should be OK, since the new parts will be further legalized.
1176                 unsigned FloorSize = PowerOf2Floor(DstSize);
1177                 return std::make_pair(0, LLT::scalar(FloorSize));
1178               }
1179 
1180               if (DstSize > 32 && (DstSize % 32 != 0)) {
1181                 // FIXME: Need a way to specify non-extload of larger size if
1182                 // suitably aligned.
1183                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184               }
1185 
1186               unsigned MaxSize = maxSizeForAddrSpace(ST,
1187                                                      PtrTy.getAddressSpace(),
1188                                                      Op == G_LOAD);
1189               if (MemSize > MaxSize)
1190                 return std::make_pair(0, LLT::scalar(MaxSize));
1191 
1192               unsigned Align = Query.MMODescrs[0].AlignInBits;
1193               return std::make_pair(0, LLT::scalar(Align));
1194             })
1195         .fewerElementsIf(
1196             [=](const LegalityQuery &Query) -> bool {
1197               return Query.Types[0].isVector() &&
1198                      needToSplitMemOp(Query, Op == G_LOAD);
1199             },
1200             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201               const LLT DstTy = Query.Types[0];
1202               const LLT PtrTy = Query.Types[1];
1203 
1204               LLT EltTy = DstTy.getElementType();
1205               unsigned MaxSize = maxSizeForAddrSpace(ST,
1206                                                      PtrTy.getAddressSpace(),
1207                                                      Op == G_LOAD);
1208 
1209               // FIXME: Handle widened to power of 2 results better. This ends
1210               // up scalarizing.
1211               // FIXME: 3 element stores scalarized on SI
1212 
1213               // Split if it's too large for the address space.
1214               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215                 unsigned NumElts = DstTy.getNumElements();
1216                 unsigned EltSize = EltTy.getSizeInBits();
1217 
1218                 if (MaxSize % EltSize == 0) {
1219                   return std::make_pair(
1220                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221                 }
1222 
1223                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224 
1225                 // FIXME: Refine when odd breakdowns handled
1226                 // The scalars will need to be re-legalized.
1227                 if (NumPieces == 1 || NumPieces >= NumElts ||
1228                     NumElts % NumPieces != 0)
1229                   return std::make_pair(0, EltTy);
1230 
1231                 return std::make_pair(0,
1232                                       LLT::vector(NumElts / NumPieces, EltTy));
1233               }
1234 
1235               // FIXME: We could probably handle weird extending loads better.
1236               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237               if (DstTy.getSizeInBits() > MemSize)
1238                 return std::make_pair(0, EltTy);
1239 
1240               unsigned EltSize = EltTy.getSizeInBits();
1241               unsigned DstSize = DstTy.getSizeInBits();
1242               if (!isPowerOf2_32(DstSize)) {
1243                 // We're probably decomposing an odd sized store. Try to split
1244                 // to the widest type. TODO: Account for alignment. As-is it
1245                 // should be OK, since the new parts will be further legalized.
1246                 unsigned FloorSize = PowerOf2Floor(DstSize);
1247                 return std::make_pair(
1248                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249               }
1250 
1251               // Need to split because of alignment.
1252               unsigned Align = Query.MMODescrs[0].AlignInBits;
1253               if (EltSize > Align &&
1254                   (EltSize / Align < DstTy.getNumElements())) {
1255                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256               }
1257 
1258               // May need relegalization for the scalars.
1259               return std::make_pair(0, EltTy);
1260             })
1261     .lowerIfMemSizeNotPow2()
1262     .minScalar(0, S32)
1263     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264     .widenScalarToNextPow2(0)
1265     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266     .lower();
1267   }
1268 
1269   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271                                                   {S32, GlobalPtr, 16, 2 * 8},
1272                                                   {S32, LocalPtr, 8, 8},
1273                                                   {S32, LocalPtr, 16, 16},
1274                                                   {S32, PrivatePtr, 8, 8},
1275                                                   {S32, PrivatePtr, 16, 16},
1276                                                   {S32, ConstantPtr, 8, 8},
1277                                                   {S32, ConstantPtr, 16, 2 * 8}})
1278                        .legalIf(
1279                          [=](const LegalityQuery &Query) -> bool {
1280                            return isLoadStoreLegal(ST, Query);
1281                          });
1282 
1283   if (ST.hasFlatAddressSpace()) {
1284     ExtLoads.legalForTypesWithMemDesc(
1285         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286   }
1287 
1288   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1289   // 64-bits.
1290   //
1291   // TODO: Should generalize bitcast action into coerce, which will also cover
1292   // inserting addrspacecasts.
1293   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294 
1295   ExtLoads.clampScalar(0, S32, S32)
1296           .widenScalarToNextPow2(0)
1297           .unsupportedIfMemSizeNotPow2()
1298           .lower();
1299 
1300   auto &Atomics = getActionDefinitionsBuilder(
1301     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304      G_ATOMICRMW_UMIN})
1305     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306                {S64, GlobalPtr}, {S64, LocalPtr},
1307                {S32, RegionPtr}, {S64, RegionPtr}});
1308   if (ST.hasFlatAddressSpace()) {
1309     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310   }
1311 
1312   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313   if (ST.hasLDSFPAtomics()) {
1314     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315     if (ST.hasGFX90AInsts())
1316       Atomic.legalFor({{S64, LocalPtr}});
1317   }
1318   if (ST.hasAtomicFaddInsts())
1319     Atomic.legalFor({{S32, GlobalPtr}});
1320 
1321   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1322   // demarshalling
1323   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325                 {S32, FlatPtr}, {S64, FlatPtr}})
1326     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327                {S32, RegionPtr}, {S64, RegionPtr}});
1328   // TODO: Pointer types, any 32-bit or 64-bit vector
1329 
1330   // Condition should be s32 for scalar, s1 for vector.
1331   getActionDefinitionsBuilder(G_SELECT)
1332     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335     .clampScalar(0, S16, S64)
1336     .scalarize(1)
1337     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339     .clampMaxNumElements(0, S32, 2)
1340     .clampMaxNumElements(0, LocalPtr, 2)
1341     .clampMaxNumElements(0, PrivatePtr, 2)
1342     .scalarize(0)
1343     .widenScalarToNextPow2(0)
1344     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 
1346   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347   // be more flexible with the shift amount type.
1348   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349     .legalFor({{S32, S32}, {S64, S32}});
1350   if (ST.has16BitInsts()) {
1351     if (ST.hasVOP3PInsts()) {
1352       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353             .clampMaxNumElements(0, S16, 2);
1354     } else
1355       Shifts.legalFor({{S16, S16}});
1356 
1357     // TODO: Support 16-bit shift amounts for all types
1358     Shifts.widenScalarIf(
1359       [=](const LegalityQuery &Query) {
1360         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361         // 32-bit amount.
1362         const LLT ValTy = Query.Types[0];
1363         const LLT AmountTy = Query.Types[1];
1364         return ValTy.getSizeInBits() <= 16 &&
1365                AmountTy.getSizeInBits() < 16;
1366       }, changeTo(1, S16));
1367     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368     Shifts.clampScalar(1, S32, S32);
1369     Shifts.clampScalar(0, S16, S64);
1370     Shifts.widenScalarToNextPow2(0, 16);
1371 
1372     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373       .minScalar(0, S16)
1374       .scalarize(0)
1375       .lower();
1376   } else {
1377     // Make sure we legalize the shift amount type first, as the general
1378     // expansion for the shifted type will produce much worse code if it hasn't
1379     // been truncated already.
1380     Shifts.clampScalar(1, S32, S32);
1381     Shifts.clampScalar(0, S32, S64);
1382     Shifts.widenScalarToNextPow2(0, 32);
1383 
1384     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385       .minScalar(0, S32)
1386       .scalarize(0)
1387       .lower();
1388   }
1389   Shifts.scalarize(0);
1390 
1391   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394     unsigned IdxTypeIdx = 2;
1395 
1396     getActionDefinitionsBuilder(Op)
1397       .customIf([=](const LegalityQuery &Query) {
1398           const LLT EltTy = Query.Types[EltTypeIdx];
1399           const LLT VecTy = Query.Types[VecTypeIdx];
1400           const LLT IdxTy = Query.Types[IdxTypeIdx];
1401           const unsigned EltSize = EltTy.getSizeInBits();
1402           return (EltSize == 32 || EltSize == 64) &&
1403                   VecTy.getSizeInBits() % 32 == 0 &&
1404                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1405                   IdxTy.getSizeInBits() == 32;
1406         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1409       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410       .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
1412         [=](const LegalityQuery &Query) {
1413           // For > 64-bit element types, try to turn this into a 64-bit
1414           // element vector since we may be able to do better indexing
1415           // if this is scalar. If not, fall back to 32.
1416           const LLT EltTy = Query.Types[EltTypeIdx];
1417           const LLT VecTy = Query.Types[VecTypeIdx];
1418           const unsigned DstEltSize = EltTy.getSizeInBits();
1419           const unsigned VecSize = VecTy.getSizeInBits();
1420 
1421           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422           return std::make_pair(
1423             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424         })
1425       .clampScalar(EltTypeIdx, S32, S64)
1426       .clampScalar(VecTypeIdx, S32, S64)
1427       .clampScalar(IdxTypeIdx, S32, S32)
1428       .clampMaxNumElements(VecTypeIdx, S32, 32)
1429       // TODO: Clamp elements for 64-bit vectors?
1430       // It should only be necessary with variable indexes.
1431       // As a last resort, lower to the stack
1432       .lower();
1433   }
1434 
1435   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436     .unsupportedIf([=](const LegalityQuery &Query) {
1437         const LLT &EltTy = Query.Types[1].getElementType();
1438         return Query.Types[0] != EltTy;
1439       });
1440 
1441   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445     // FIXME: Doesn't handle extract of illegal sizes.
1446     getActionDefinitionsBuilder(Op)
1447       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448       // FIXME: Multiples of 16 should not be legal.
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (BigTy.getSizeInBits() % 32 == 0) &&
1453                  (LitTy.getSizeInBits() % 16 == 0);
1454         })
1455       .widenScalarIf(
1456         [=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           return (BigTy.getScalarSizeInBits() < 16);
1459         },
1460         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (LitTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468       .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470   }
1471 
1472   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473     .legalForCartesianProduct(AllS32Vectors, {S32})
1474     .legalForCartesianProduct(AllS64Vectors, {S64})
1475     .clampNumElements(0, V16S32, V32S32)
1476     .clampNumElements(0, V2S64, V16S64)
1477     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479   if (ST.hasScalarPackInsts()) {
1480     BuildVector
1481       // FIXME: Should probably widen s1 vectors straight to s32
1482       .minScalarOrElt(0, S16)
1483       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484       .minScalar(1, S32);
1485 
1486     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487       .legalFor({V2S16, S32})
1488       .lower();
1490   } else {
1491     BuildVector.customFor({V2S16, S16});
1492     BuildVector.minScalarOrElt(0, S32);
1493 
1494     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495       .customFor({V2S16, S32})
1496       .lower();
1497   }
1498 
1499   BuildVector.legalIf(isRegisterType(0));
1500 
1501   // FIXME: Clamp maximum size
1502   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504     .clampMaxNumElements(0, S32, 32)
1505     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506     .clampMaxNumElements(0, S16, 64);
1507 
1508   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509   // pre-legalize.
1510   if (ST.hasVOP3PInsts()) {
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512       .customFor({V2S16, V2S16})
1513       .lower();
1514   } else
1515     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517   // Merge/Unmerge
1518   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523       const LLT Ty = Query.Types[TypeIdx];
1524       if (Ty.isVector()) {
1525         const LLT &EltTy = Ty.getElementType();
1526         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527           return true;
1528         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529           return true;
1530       }
1531       return false;
1532     };
1533 
1534     auto &Builder = getActionDefinitionsBuilder(Op)
1535       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536       .lowerFor({{S16, V2S16}})
1537       .lowerIf([=](const LegalityQuery &Query) {
1538           const LLT BigTy = Query.Types[BigTyIdx];
1539           return BigTy.getSizeInBits() == 32;
1540         })
1541       // Try to widen to s16 first for small types.
1542       // TODO: Only do this on targets with legal s16 shifts
1543       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547                            elementTypeIs(1, S16)),
1548                        changeTo(1, V2S16))
1549       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1550       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551       // valid.
1552       .clampScalar(LitTyIdx, S32, S512)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554       // Break up vectors with weird elements into scalars
1555       .fewerElementsIf(
1556         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557         scalarize(0))
1558       .fewerElementsIf(
1559         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560         scalarize(1))
1561       .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563     if (Op == G_MERGE_VALUES) {
1564       Builder.widenScalarIf(
1565         // TODO: Use 16-bit shifts if legal for 8-bit values?
1566         [=](const LegalityQuery &Query) {
1567           const LLT Ty = Query.Types[LitTyIdx];
1568           return Ty.getSizeInBits() < 32;
1569         },
1570         changeTo(LitTyIdx, S32));
1571     }
1572 
1573     Builder.widenScalarIf(
1574       [=](const LegalityQuery &Query) {
1575         const LLT Ty = Query.Types[BigTyIdx];
1576         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577           Ty.getSizeInBits() % 16 != 0;
1578       },
1579       [=](const LegalityQuery &Query) {
1580         // Pick the next power of 2, or a multiple of 64 over 128,
1581         // whichever is smaller.
1582         const LLT &Ty = Query.Types[BigTyIdx];
1583         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584         if (NewSizeInBits >= 256) {
1585           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586           if (RoundedTo < NewSizeInBits)
1587             NewSizeInBits = RoundedTo;
1588         }
1589         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590       })
1591       // Any vectors left are the wrong size. Scalarize them.
1592       .scalarize(0)
1593       .scalarize(1);
1594   }
1595 
1596   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597   // RegBankSelect.
1598   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599     .legalFor({{S32}, {S64}});
1600 
1601   if (ST.hasVOP3PInsts()) {
1602     SextInReg.lowerFor({{V2S16}})
1603       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604       // get more vector shift opportunities, since we'll get those when
1605       // expanded.
1606       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607   } else if (ST.has16BitInsts()) {
1608     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609   } else {
1610     // Prefer to promote to s32 before lowering if we don't have 16-bit
1611     // shifts. This avoids a lot of intermediate truncate and extend operations.
1612     SextInReg.lowerFor({{S32}, {S64}});
1613   }
1614 
1615   SextInReg
1616     .scalarize(0)
1617     .clampScalar(0, S32, S64)
1618     .lower();
1619 
1620   // TODO: Only try to form v2s16 with legal packed instructions.
1621   getActionDefinitionsBuilder(G_FSHR)
1622     .legalFor({{S32, S32}})
1623     .lowerFor({{V2S16, V2S16}})
1624     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625     .scalarize(0)
1626     .lower();
1627 
1628   if (ST.hasVOP3PInsts()) {
1629     getActionDefinitionsBuilder(G_FSHL)
1630       .lowerFor({{V2S16, V2S16}})
1631       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632       .scalarize(0)
1633       .lower();
1634   } else {
1635     getActionDefinitionsBuilder(G_FSHL)
1636       .scalarize(0)
1637       .lower();
1638   }
1639 
1640   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641     .legalFor({S64});
1642 
1643   getActionDefinitionsBuilder(G_FENCE)
1644     .alwaysLegal();
1645 
1646   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647       .scalarize(0)
1648       .minScalar(0, S32)
1649       .lower();
1650 
1651   getActionDefinitionsBuilder({
1652       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653       G_FCOPYSIGN,
1654 
1655       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656       G_ATOMICRMW_NAND,
1657       G_ATOMICRMW_FSUB,
1658       G_READ_REGISTER,
1659       G_WRITE_REGISTER,
1660 
1661       G_SADDO, G_SSUBO,
1662 
1663        // TODO: Implement
1664       G_FMINIMUM, G_FMAXIMUM}).lower();
1665 
1666   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669     .unsupported();
1670 
1671   getLegacyLegalizerInfo().computeTables();
1672   verify(*ST.getInstrInfo());
1673 }
1674 
1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676                                          MachineInstr &MI) const {
1677   MachineIRBuilder &B = Helper.MIRBuilder;
1678   MachineRegisterInfo &MRI = *B.getMRI();
1679 
1680   switch (MI.getOpcode()) {
1681   case TargetOpcode::G_ADDRSPACE_CAST:
1682     return legalizeAddrSpaceCast(MI, MRI, B);
1683   case TargetOpcode::G_FRINT:
1684     return legalizeFrint(MI, MRI, B);
1685   case TargetOpcode::G_FCEIL:
1686     return legalizeFceil(MI, MRI, B);
1687   case TargetOpcode::G_FREM:
1688     return legalizeFrem(MI, MRI, B);
1689   case TargetOpcode::G_INTRINSIC_TRUNC:
1690     return legalizeIntrinsicTrunc(MI, MRI, B);
1691   case TargetOpcode::G_SITOFP:
1692     return legalizeITOFP(MI, MRI, B, true);
1693   case TargetOpcode::G_UITOFP:
1694     return legalizeITOFP(MI, MRI, B, false);
1695   case TargetOpcode::G_FPTOSI:
1696     return legalizeFPTOI(MI, MRI, B, true);
1697   case TargetOpcode::G_FPTOUI:
1698     return legalizeFPTOI(MI, MRI, B, false);
1699   case TargetOpcode::G_FMINNUM:
1700   case TargetOpcode::G_FMAXNUM:
1701   case TargetOpcode::G_FMINNUM_IEEE:
1702   case TargetOpcode::G_FMAXNUM_IEEE:
1703     return legalizeMinNumMaxNum(Helper, MI);
1704   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705     return legalizeExtractVectorElt(MI, MRI, B);
1706   case TargetOpcode::G_INSERT_VECTOR_ELT:
1707     return legalizeInsertVectorElt(MI, MRI, B);
1708   case TargetOpcode::G_SHUFFLE_VECTOR:
1709     return legalizeShuffleVector(MI, MRI, B);
1710   case TargetOpcode::G_FSIN:
1711   case TargetOpcode::G_FCOS:
1712     return legalizeSinCos(MI, MRI, B);
1713   case TargetOpcode::G_GLOBAL_VALUE:
1714     return legalizeGlobalValue(MI, MRI, B);
1715   case TargetOpcode::G_LOAD:
1716   case TargetOpcode::G_SEXTLOAD:
1717   case TargetOpcode::G_ZEXTLOAD:
1718     return legalizeLoad(Helper, MI);
1719   case TargetOpcode::G_FMAD:
1720     return legalizeFMad(MI, MRI, B);
1721   case TargetOpcode::G_FDIV:
1722     return legalizeFDIV(MI, MRI, B);
1723   case TargetOpcode::G_UDIV:
1724   case TargetOpcode::G_UREM:
1725   case TargetOpcode::G_UDIVREM:
1726     return legalizeUnsignedDIV_REM(MI, MRI, B);
1727   case TargetOpcode::G_SDIV:
1728   case TargetOpcode::G_SREM:
1729   case TargetOpcode::G_SDIVREM:
1730     return legalizeSignedDIV_REM(MI, MRI, B);
1731   case TargetOpcode::G_ATOMIC_CMPXCHG:
1732     return legalizeAtomicCmpXChg(MI, MRI, B);
1733   case TargetOpcode::G_FLOG:
1734     return legalizeFlog(MI, B, numbers::ln2f);
1735   case TargetOpcode::G_FLOG10:
1736     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737   case TargetOpcode::G_FEXP:
1738     return legalizeFExp(MI, B);
1739   case TargetOpcode::G_FPOW:
1740     return legalizeFPow(MI, B);
1741   case TargetOpcode::G_FFLOOR:
1742     return legalizeFFloor(MI, MRI, B);
1743   case TargetOpcode::G_BUILD_VECTOR:
1744     return legalizeBuildVector(MI, MRI, B);
1745   default:
1746     return false;
1747   }
1748 
1749   llvm_unreachable("expected switch to return");
1750 }
1751 
1752 Register AMDGPULegalizerInfo::getSegmentAperture(
1753   unsigned AS,
1754   MachineRegisterInfo &MRI,
1755   MachineIRBuilder &B) const {
1756   MachineFunction &MF = B.getMF();
1757   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758   const LLT S32 = LLT::scalar(32);
1759 
1760   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761 
1762   if (ST.hasApertureRegs()) {
1763     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764     // getreg.
1765     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1766         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1767         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1768     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1769         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1770         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1771     unsigned Encoding =
1772         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1773         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1774         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775 
1776     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777 
1778     B.buildInstr(AMDGPU::S_GETREG_B32)
1779       .addDef(GetReg)
1780       .addImm(Encoding);
1781     MRI.setType(GetReg, S32);
1782 
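    // s_getreg returns the field right-justified; shift it left by the field
    // width to reconstruct the 32-bit aperture base, i.e. the high half of the
    // 64-bit segment address.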
1783     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785   }
1786 
1787   Register QueuePtr = MRI.createGenericVirtualRegister(
1788     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789 
1790   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791     return Register();
1792 
1793   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794   // private_segment_aperture_base_hi.
1795   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796 
1797   // TODO: can we be smarter about machine pointer info?
1798   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799   MachineMemOperand *MMO = MF.getMachineMemOperand(
1800       PtrInfo,
1801       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802           MachineMemOperand::MOInvariant,
1803       4, commonAlignment(Align(64), StructOffset));
1804 
1805   Register LoadAddr;
1806 
1807   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809 }
1810 
1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1812   MachineInstr &MI, MachineRegisterInfo &MRI,
1813   MachineIRBuilder &B) const {
1814   MachineFunction &MF = B.getMF();
1815 
1816   const LLT S32 = LLT::scalar(32);
1817   Register Dst = MI.getOperand(0).getReg();
1818   Register Src = MI.getOperand(1).getReg();
1819 
1820   LLT DstTy = MRI.getType(Dst);
1821   LLT SrcTy = MRI.getType(Src);
1822   unsigned DestAS = DstTy.getAddressSpace();
1823   unsigned SrcAS = SrcTy.getAddressSpace();
1824 
1825   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826   // vector element.
1827   assert(!DstTy.isVector());
1828 
1829   const AMDGPUTargetMachine &TM
1830     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831 
1832   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834     return true;
1835   }
1836 
1837   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838     // Truncate.
1839     B.buildExtract(Dst, Src, 0);
1840     MI.eraseFromParent();
1841     return true;
1842   }
1843 
1844   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847 
1848     // FIXME: This is a bit ugly due to building a merge of two pointers to
1849     // produce another pointer. Merge operands are required to be the same
1850     // type, but creating an extra ptrtoint would be kind of pointless.
1851     auto HighAddr = B.buildConstant(
1852       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853     B.buildMerge(Dst, {Src, HighAddr});
1854     MI.eraseFromParent();
1855     return true;
1856   }
1857 
1858   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1861     unsigned NullVal = TM.getNullPointerValue(DestAS);
1862 
1863     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864     auto FlatNull = B.buildConstant(SrcTy, 0);
1865 
1866     // Extract low 32-bits of the pointer.
1867     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868 
1869     auto CmpRes =
1870         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872 
1873     MI.eraseFromParent();
1874     return true;
1875   }
1876 
1877   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878     return false;
1879 
1880   if (!ST.hasFlatAddressSpace())
1881     return false;
1882 
1883   auto SegmentNull =
1884       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885   auto FlatNull =
1886       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887 
1888   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889   if (!ApertureReg.isValid())
1890     return false;
1891 
1892   auto CmpRes =
1893       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894 
1895   // Coerce the type of the low half of the result so we can use merge_values.
1896   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897 
1898   // TODO: Should we allow mismatched types but matching sizes in merges to
1899   // avoid the ptrtoint?
1900   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902 
1903   MI.eraseFromParent();
1904   return true;
1905 }
1906 
1907 bool AMDGPULegalizerInfo::legalizeFrint(
1908   MachineInstr &MI, MachineRegisterInfo &MRI,
1909   MachineIRBuilder &B) const {
1910   Register Src = MI.getOperand(1).getReg();
1911   LLT Ty = MRI.getType(Src);
1912   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913 
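  // Adding 2^52 (with the sign of the source) and subtracting it again forces
  // the FPU to round the value to an integer. Sufficiently large magnitudes
  // are already integral and are passed through unchanged.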
1914   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1916 
1917   auto C1 = B.buildFConstant(Ty, C1Val);
1918   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919 
1920   // TODO: Should this propagate fast-math-flags?
1921   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923 
1924   auto C2 = B.buildFConstant(Ty, C2Val);
1925   auto Fabs = B.buildFAbs(Ty, Src);
1926 
1927   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929   MI.eraseFromParent();
1930   return true;
1931 }
1932 
1933 bool AMDGPULegalizerInfo::legalizeFceil(
1934   MachineInstr &MI, MachineRegisterInfo &MRI,
1935   MachineIRBuilder &B) const {
1936 
1937   const LLT S1 = LLT::scalar(1);
1938   const LLT S64 = LLT::scalar(64);
1939 
1940   Register Src = MI.getOperand(1).getReg();
1941   assert(MRI.getType(Src) == S64);
1942 
1943   // result = trunc(src)
1944   // if (src > 0.0 && src != result)
1945   //   result += 1.0
1946 
1947   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948 
1949   const auto Zero = B.buildFConstant(S64, 0.0);
1950   const auto One = B.buildFConstant(S64, 1.0);
1951   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1954   auto Add = B.buildSelect(S64, And, One, Zero);
1955 
1956   // TODO: Should this propagate fast-math-flags?
1957   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1958   return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeFrem(
1962   MachineInstr &MI, MachineRegisterInfo &MRI,
1963   MachineIRBuilder &B) const {
1964   Register DstReg = MI.getOperand(0).getReg();
1965   Register Src0Reg = MI.getOperand(1).getReg();
1966   Register Src1Reg = MI.getOperand(2).getReg();
1967   auto Flags = MI.getFlags();
1968   LLT Ty = MRI.getType(DstReg);
1969 
1970   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974   MI.eraseFromParent();
1975   return true;
1976 }
1977 
1978 static MachineInstrBuilder extractF64Exponent(Register Hi,
1979                                               MachineIRBuilder &B) {
1980   const unsigned FractBits = 52;
1981   const unsigned ExpBits = 11;
1982   LLT S32 = LLT::scalar(32);
1983 
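  // The exponent occupies the 11 bits above the 52 fraction bits; extract it
  // from the high word with ubfe and subtract the IEEE-754 double bias (1023).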
1984   auto Const0 = B.buildConstant(S32, FractBits - 32);
1985   auto Const1 = B.buildConstant(S32, ExpBits);
1986 
1987   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988     .addUse(Hi)
1989     .addUse(Const0.getReg(0))
1990     .addUse(Const1.getReg(0));
1991 
1992   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B) const {
1998   const LLT S1 = LLT::scalar(1);
1999   const LLT S32 = LLT::scalar(32);
2000   const LLT S64 = LLT::scalar(64);
2001 
2002   Register Src = MI.getOperand(1).getReg();
2003   assert(MRI.getType(Src) == S64);
2004 
2005   // TODO: Should this use extract since the low half is unused?
2006   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007   Register Hi = Unmerge.getReg(1);
2008 
2009   // Extract the upper half, since this is where we will find the sign and
2010   // exponent.
2011   auto Exp = extractF64Exponent(Hi, B);
2012 
2013   const unsigned FractBits = 52;
2014 
2015   // Extract the sign bit.
2016   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018 
2019   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020 
2021   const auto Zero32 = B.buildConstant(S32, 0);
2022 
2023   // Extend back to 64-bits.
2024   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025 
2026   auto Shr = B.buildAShr(S64, FractMask, Exp);
2027   auto Not = B.buildNot(S64, Shr);
2028   auto Tmp0 = B.buildAnd(S64, Src, Not);
2029   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030 
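  // A negative exponent means |Src| < 1, so the result is just the sign
  // (i.e. +/-0.0). An exponent greater than 51 means there are no fractional
  // bits and the value is already an integer.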
2031   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033 
2034   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeITOFP(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B, bool Signed) const {
2043 
2044   Register Dst = MI.getOperand(0).getReg();
2045   Register Src = MI.getOperand(1).getReg();
2046 
2047   const LLT S64 = LLT::scalar(64);
2048   const LLT S32 = LLT::scalar(32);
2049 
2050   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051 
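  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 with ldexp, then the unsigned
  // low half is added in.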
2052   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053 
2054   auto CvtHi = Signed ?
2055     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056     B.buildUITOFP(S64, Unmerge.getReg(1));
2057 
2058   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059 
2060   auto ThirtyTwo = B.buildConstant(S32, 32);
2061   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062     .addUse(CvtHi.getReg(0))
2063     .addUse(ThirtyTwo.getReg(0));
2064 
2065   // TODO: Should this propagate fast-math-flags?
2066   B.buildFAdd(Dst, LdExp, CvtLo);
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
2071 // TODO: Copied from DAG implementation. Verify logic and document how this
2072 // actually works.
2073 bool AMDGPULegalizerInfo::legalizeFPTOI(
2074   MachineInstr &MI, MachineRegisterInfo &MRI,
2075   MachineIRBuilder &B, bool Signed) const {
2076 
2077   Register Dst = MI.getOperand(0).getReg();
2078   Register Src = MI.getOperand(1).getReg();
2079 
2080   const LLT S64 = LLT::scalar(64);
2081   const LLT S32 = LLT::scalar(32);
2082 
2083   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2084 
2085   unsigned Flags = MI.getFlags();
2086 
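  // Split the truncated value into 32-bit halves: scaling by 2^-32 (K0) and
  // flooring gives the high word, and an fma with -2^32 (K1) recovers the
  // remaining low word.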
2087   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2088   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2089   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2090 
2091   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2092   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2093   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2094 
2095   auto Hi = Signed ?
2096     B.buildFPTOSI(S32, FloorMul) :
2097     B.buildFPTOUI(S32, FloorMul);
2098   auto Lo = B.buildFPTOUI(S32, Fma);
2099 
2100   B.buildMerge(Dst, { Lo, Hi });
2101   MI.eraseFromParent();
2102 
2103   return true;
2104 }
2105 
2106 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2107                                                MachineInstr &MI) const {
2108   MachineFunction &MF = Helper.MIRBuilder.getMF();
2109   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2110 
2111   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2112                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2113 
2114   // With ieee_mode disabled, the instructions have the correct behavior
2115   // already for G_FMINNUM/G_FMAXNUM
2116   if (!MFI->getMode().IEEE)
2117     return !IsIEEEOp;
2118 
2119   if (IsIEEEOp)
2120     return true;
2121 
2122   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2123 }
2124 
2125 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2126   MachineInstr &MI, MachineRegisterInfo &MRI,
2127   MachineIRBuilder &B) const {
2128   // TODO: Should move some of this into LegalizerHelper.
2129 
2130   // TODO: Promote dynamic indexing of s16 to s32
2131 
2132   // FIXME: Artifact combiner probably should have replaced the truncated
2133   // constant before this, so we shouldn't need
2134   // getConstantVRegValWithLookThrough.
2135   Optional<ValueAndVReg> MaybeIdxVal =
2136       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2137   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2138     return true;
2139   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2140 
2141   Register Dst = MI.getOperand(0).getReg();
2142   Register Vec = MI.getOperand(1).getReg();
2143 
2144   LLT VecTy = MRI.getType(Vec);
2145   LLT EltTy = VecTy.getElementType();
2146   assert(EltTy == MRI.getType(Dst));
2147 
2148   if (IdxVal < VecTy.getNumElements())
2149     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2150   else
2151     B.buildUndef(Dst);
2152 
2153   MI.eraseFromParent();
2154   return true;
2155 }
2156 
2157 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2158   MachineInstr &MI, MachineRegisterInfo &MRI,
2159   MachineIRBuilder &B) const {
2160   // TODO: Should move some of this into LegalizerHelper.
2161 
2162   // TODO: Promote dynamic indexing of s16 to s32
2163 
2164   // FIXME: Artifact combiner probably should have replaced the truncated
2165   // constant before this, so we shouldn't need
2166   // getConstantVRegValWithLookThrough.
2167   Optional<ValueAndVReg> MaybeIdxVal =
2168       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2169   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2170     return true;
2171 
2172   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2173   Register Dst = MI.getOperand(0).getReg();
2174   Register Vec = MI.getOperand(1).getReg();
2175   Register Ins = MI.getOperand(2).getReg();
2176 
2177   LLT VecTy = MRI.getType(Vec);
2178   LLT EltTy = VecTy.getElementType();
2179   assert(EltTy == MRI.getType(Ins));
2180 
2181   if (IdxVal < VecTy.getNumElements())
2182     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2183   else
2184     B.buildUndef(Dst);
2185 
2186   MI.eraseFromParent();
2187   return true;
2188 }
2189 
2190 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2191   MachineInstr &MI, MachineRegisterInfo &MRI,
2192   MachineIRBuilder &B) const {
2193   const LLT V2S16 = LLT::vector(2, 16);
2194 
2195   Register Dst = MI.getOperand(0).getReg();
2196   Register Src0 = MI.getOperand(1).getReg();
2197   LLT DstTy = MRI.getType(Dst);
2198   LLT SrcTy = MRI.getType(Src0);
2199 
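  // Leave v2s16 shuffles whose mask the VOP3P selection can handle directly;
  // everything else goes through the generic lowering.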
2200   if (SrcTy == V2S16 && DstTy == V2S16 &&
2201       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2202     return true;
2203 
2204   MachineIRBuilder HelperBuilder(MI);
2205   GISelObserverWrapper DummyObserver;
2206   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2207   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeSinCos(
2211   MachineInstr &MI, MachineRegisterInfo &MRI,
2212   MachineIRBuilder &B) const {
2213 
2214   Register DstReg = MI.getOperand(0).getReg();
2215   Register SrcReg = MI.getOperand(1).getReg();
2216   LLT Ty = MRI.getType(DstReg);
2217   unsigned Flags = MI.getFlags();
2218 
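  // The hardware sin/cos intrinsics expect the input pre-scaled by 1/(2*pi).
  // Subtargets with a reduced trig range additionally need the operand brought
  // into range with fract.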
2219   Register TrigVal;
2220   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2221   if (ST.hasTrigReducedRange()) {
2222     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2223     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2224       .addUse(MulVal.getReg(0))
2225       .setMIFlags(Flags).getReg(0);
2226   } else
2227     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2228 
2229   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2230     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2231   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2232     .addUse(TrigVal)
2233     .setMIFlags(Flags);
2234   MI.eraseFromParent();
2235   return true;
2236 }
2237 
2238 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2239                                                   MachineIRBuilder &B,
2240                                                   const GlobalValue *GV,
2241                                                   int64_t Offset,
2242                                                   unsigned GAFlags) const {
2243   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2244   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2245   // to the following code sequence:
2246   //
2247   // For constant address space:
2248   //   s_getpc_b64 s[0:1]
2249   //   s_add_u32 s0, s0, $symbol
2250   //   s_addc_u32 s1, s1, 0
2251   //
2252   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2253   //   a fixup or relocation is emitted to replace $symbol with a literal
2254   //   constant, which is a pc-relative offset from the encoding of the $symbol
2255   //   operand to the global variable.
2256   //
2257   // For global address space:
2258   //   s_getpc_b64 s[0:1]
2259   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2260   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2261   //
2262   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2263   //   fixups or relocations are emitted to replace $symbol@*@lo and
2264   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2265   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2266   //   operand to the global variable.
2267   //
2268   // What we want here is an offset from the value returned by s_getpc
2269   // (which is the address of the s_add_u32 instruction) to the global
2270   // variable, but since the encoding of $symbol starts 4 bytes after the start
2271   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2272   // small. This requires us to add 4 to the global variable offset in order to
2273   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2274   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2275   // instruction.
2276 
2277   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2278 
2279   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2280     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2281 
2282   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2283     .addDef(PCReg);
2284 
2285   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2286   if (GAFlags == SIInstrInfo::MO_NONE)
2287     MIB.addImm(0);
2288   else
2289     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2290 
2291   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2292 
2293   if (PtrTy.getSizeInBits() == 32)
2294     B.buildExtract(DstReg, PCReg, 0);
2295   return true;
2296 }
2297 
2298 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2299   MachineInstr &MI, MachineRegisterInfo &MRI,
2300   MachineIRBuilder &B) const {
2301   Register DstReg = MI.getOperand(0).getReg();
2302   LLT Ty = MRI.getType(DstReg);
2303   unsigned AS = Ty.getAddressSpace();
2304 
2305   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2306   MachineFunction &MF = B.getMF();
2307   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2308 
2309   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2310     if (!MFI->isModuleEntryFunction() &&
2311         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2312       const Function &Fn = MF.getFunction();
2313       DiagnosticInfoUnsupported BadLDSDecl(
2314         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2315         DS_Warning);
2316       Fn.getContext().diagnose(BadLDSDecl);
2317 
2318       // We currently don't have a way to correctly allocate LDS objects that
2319       // aren't directly associated with a kernel. We do force inlining of
2320       // functions that use local objects. However, if these dead functions are
2321       // not eliminated, we don't want a compile time error. Just emit a warning
2322       // and a trap, since there should be no callable path here.
2323       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2324       B.buildUndef(DstReg);
2325       MI.eraseFromParent();
2326       return true;
2327     }
2328 
2329     // TODO: We could emit code to handle the initialization somewhere.
2330     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2331       const SITargetLowering *TLI = ST.getTargetLowering();
2332       if (!TLI->shouldUseLDSConstAddress(GV)) {
2333         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2334         return true; // Leave in place.
2335       }
2336 
2337       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2338         Type *Ty = GV->getValueType();
2339         // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2340         // zero-sized type in other languages) to declare dynamic shared memory
2341         // whose size is not known at compile time. Such variables are allocated
2342         // by the runtime and placed directly after the statically allocated
2343         // ones, so they all share the same offset.
2344         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2345           // Adjust alignment for that dynamic shared memory array.
2346           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2347           LLT S32 = LLT::scalar(32);
2348           auto Sz =
2349               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2350           B.buildIntToPtr(DstReg, Sz);
2351           MI.eraseFromParent();
2352           return true;
2353         }
2354       }
2355 
2356       B.buildConstant(
2357           DstReg,
2358           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2359       MI.eraseFromParent();
2360       return true;
2361     }
2362 
2363     const Function &Fn = MF.getFunction();
2364     DiagnosticInfoUnsupported BadInit(
2365       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2366     Fn.getContext().diagnose(BadInit);
2367     return true;
2368   }
2369 
2370   const SITargetLowering *TLI = ST.getTargetLowering();
2371 
2372   if (TLI->shouldEmitFixup(GV)) {
2373     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2374     MI.eraseFromParent();
2375     return true;
2376   }
2377 
2378   if (TLI->shouldEmitPCReloc(GV)) {
2379     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2380     MI.eraseFromParent();
2381     return true;
2382   }
2383 
2384   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2385   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2386 
2387   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2388       MachinePointerInfo::getGOT(MF),
2389       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2390           MachineMemOperand::MOInvariant,
2391       8 /*Size*/, Align(8));
2392 
2393   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2394 
2395   if (Ty.getSizeInBits() == 32) {
2396     // Truncate if this is a 32-bit constant address.
2397     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2398     B.buildExtract(DstReg, Load, 0);
2399   } else
2400     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2401 
2402   MI.eraseFromParent();
2403   return true;
2404 }
2405 
2406 static LLT widenToNextPowerOf2(LLT Ty) {
2407   if (Ty.isVector())
2408     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2409   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2410 }
2411 
2412 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2413                                        MachineInstr &MI) const {
2414   MachineIRBuilder &B = Helper.MIRBuilder;
2415   MachineRegisterInfo &MRI = *B.getMRI();
2416   GISelChangeObserver &Observer = Helper.Observer;
2417 
2418   Register PtrReg = MI.getOperand(1).getReg();
2419   LLT PtrTy = MRI.getType(PtrReg);
2420   unsigned AddrSpace = PtrTy.getAddressSpace();
2421 
2422   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2423     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2424     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2425     Observer.changingInstr(MI);
2426     MI.getOperand(1).setReg(Cast.getReg(0));
2427     Observer.changedInstr(MI);
2428     return true;
2429   }
2430 
2431   if (MI.getOpcode() != AMDGPU::G_LOAD)
2432     return false;
2433 
2434   Register ValReg = MI.getOperand(0).getReg();
2435   LLT ValTy = MRI.getType(ValReg);
2436 
2437   MachineMemOperand *MMO = *MI.memoperands_begin();
2438   const unsigned ValSize = ValTy.getSizeInBits();
2439   const unsigned MemSize = 8 * MMO->getSize();
2440   const Align MemAlign = MMO->getAlign();
2441   const unsigned AlignInBits = 8 * MemAlign.value();
2442 
2443   // Widen non-power-of-2 loads to the alignment if needed
2444   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2445     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2446 
2447     // This was already the correct extending load result type, so just adjust
2448     // the memory type.
2449     if (WideMemSize == ValSize) {
2450       MachineFunction &MF = B.getMF();
2451 
2452       MachineMemOperand *WideMMO =
2453           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2454       Observer.changingInstr(MI);
2455       MI.setMemRefs(MF, {WideMMO});
2456       Observer.changedInstr(MI);
2457       return true;
2458     }
2459 
2460     // Don't bother handling an edge case that should probably never be produced.
2461     if (ValSize > WideMemSize)
2462       return false;
2463 
2464     LLT WideTy = widenToNextPowerOf2(ValTy);
2465 
2466     Register WideLoad;
2467     if (!WideTy.isVector()) {
2468       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2469       B.buildTrunc(ValReg, WideLoad).getReg(0);
2470     } else {
2471       // Extract the subvector.
2472 
2473       if (isRegisterType(ValTy)) {
2474         // If this a case where G_EXTRACT is legal, use it.
2475         // (e.g. <3 x s32> -> <4 x s32>)
2476         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2477         B.buildExtract(ValReg, WideLoad, 0);
2478       } else {
2479         // For cases where the widened type isn't a nice register value, unmerge
2480         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2481         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2482         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2483         B.setInsertPt(B.getMBB(), MI.getIterator());
2484         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2485       }
2486     }
2487 
2488     MI.eraseFromParent();
2489     return true;
2490   }
2491 
2492   return false;
2493 }
2494 
2495 bool AMDGPULegalizerInfo::legalizeFMad(
2496   MachineInstr &MI, MachineRegisterInfo &MRI,
2497   MachineIRBuilder &B) const {
2498   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2499   assert(Ty.isScalar());
2500 
2501   MachineFunction &MF = B.getMF();
2502   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2503 
2504   // TODO: Always legal with future ftz flag.
2505   // FIXME: Do we need just output?
2506   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2507     return true;
2508   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2509     return true;
2510 
2511   MachineIRBuilder HelperBuilder(MI);
2512   GISelObserverWrapper DummyObserver;
2513   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2514   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2515 }
2516 
2517 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2518   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2519   Register DstReg = MI.getOperand(0).getReg();
2520   Register PtrReg = MI.getOperand(1).getReg();
2521   Register CmpVal = MI.getOperand(2).getReg();
2522   Register NewVal = MI.getOperand(3).getReg();
2523 
2524   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2525          "this should not have been custom lowered");
2526 
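  // The target's cmpxchg pseudo takes the new value and the compare value
  // packed together as a two-element vector operand.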
2527   LLT ValTy = MRI.getType(CmpVal);
2528   LLT VecTy = LLT::vector(2, ValTy);
2529 
2530   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2531 
2532   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2533     .addDef(DstReg)
2534     .addUse(PtrReg)
2535     .addUse(PackedVal)
2536     .setMemRefs(MI.memoperands());
2537 
2538   MI.eraseFromParent();
2539   return true;
2540 }
2541 
2542 bool AMDGPULegalizerInfo::legalizeFlog(
2543   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2544   Register Dst = MI.getOperand(0).getReg();
2545   Register Src = MI.getOperand(1).getReg();
2546   LLT Ty = B.getMRI()->getType(Dst);
2547   unsigned Flags = MI.getFlags();
2548 
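  // log_b(x) = log2(x) * (1 / log2(b)); the caller supplies the precomputed
  // 1/log2(b) constant.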
2549   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2550   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2551 
2552   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2553   MI.eraseFromParent();
2554   return true;
2555 }
2556 
2557 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2558                                        MachineIRBuilder &B) const {
2559   Register Dst = MI.getOperand(0).getReg();
2560   Register Src = MI.getOperand(1).getReg();
2561   unsigned Flags = MI.getFlags();
2562   LLT Ty = B.getMRI()->getType(Dst);
2563 
2564   auto K = B.buildFConstant(Ty, numbers::log2e);
2565   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2566   B.buildFExp2(Dst, Mul, Flags);
2567   MI.eraseFromParent();
2568   return true;
2569 }
2570 
2571 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2572                                        MachineIRBuilder &B) const {
2573   Register Dst = MI.getOperand(0).getReg();
2574   Register Src0 = MI.getOperand(1).getReg();
2575   Register Src1 = MI.getOperand(2).getReg();
2576   unsigned Flags = MI.getFlags();
2577   LLT Ty = B.getMRI()->getType(Dst);
2578   const LLT S16 = LLT::scalar(16);
2579   const LLT S32 = LLT::scalar(32);
2580 
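  // Expand pow(x, y) as exp2(y * log2(x)), using the legacy multiply (which
  // treats 0 * anything as 0).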
2581   if (Ty == S32) {
2582     auto Log = B.buildFLog2(S32, Src0, Flags);
2583     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2584       .addUse(Log.getReg(0))
2585       .addUse(Src1)
2586       .setMIFlags(Flags);
2587     B.buildFExp2(Dst, Mul, Flags);
2588   } else if (Ty == S16) {
2589     // There's no f16 fmul_legacy, so we need to convert for it.
2590     auto Log = B.buildFLog2(S16, Src0, Flags);
2591     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2592     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2593     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2594       .addUse(Ext0.getReg(0))
2595       .addUse(Ext1.getReg(0))
2596       .setMIFlags(Flags);
2597 
2598     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2599   } else
2600     return false;
2601 
2602   MI.eraseFromParent();
2603   return true;
2604 }
2605 
2606 // Find a source register, ignoring any possible source modifiers.
2607 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2608   Register ModSrc = OrigSrc;
2609   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2610     ModSrc = SrcFNeg->getOperand(1).getReg();
2611     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2612       ModSrc = SrcFAbs->getOperand(1).getReg();
2613   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2614     ModSrc = SrcFAbs->getOperand(1).getReg();
2615   return ModSrc;
2616 }
2617 
2618 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2619                                          MachineRegisterInfo &MRI,
2620                                          MachineIRBuilder &B) const {
2621 
2622   const LLT S1 = LLT::scalar(1);
2623   const LLT S64 = LLT::scalar(64);
2624   Register Dst = MI.getOperand(0).getReg();
2625   Register OrigSrc = MI.getOperand(1).getReg();
2626   unsigned Flags = MI.getFlags();
2627   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2628          "this should not have been custom lowered");
2629 
2630   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2631   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2632   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2633   // V_FRACT bug is:
2634   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2635   //
2636   // Convert floor(x) to (x - fract(x))
2637 
2638   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2639     .addUse(OrigSrc)
2640     .setMIFlags(Flags);
2641 
2642   // Give source modifier matching some assistance before obscuring a foldable
2643   // pattern.
2644 
2645   // TODO: We can avoid the neg on the fract? The input sign to fract
2646   // shouldn't matter?
2647   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2648 
2649   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2650 
2651   Register Min = MRI.createGenericVirtualRegister(S64);
2652 
2653   // We don't need to concern ourselves with the snan handling difference, so
2654   // use the one which will directly select.
2655   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2656   if (MFI->getMode().IEEE)
2657     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2658   else
2659     B.buildFMinNum(Min, Fract, Const, Flags);
2660 
2661   Register CorrectedFract = Min;
2662   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2663     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2664     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2665   }
2666 
2667   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2668   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2669 
2670   MI.eraseFromParent();
2671   return true;
2672 }
2673 
2674 // Turn an illegal packed v2s16 build vector into bit operations.
2675 // TODO: This should probably be a bitcast action in LegalizerHelper.
2676 bool AMDGPULegalizerInfo::legalizeBuildVector(
2677   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2678   Register Dst = MI.getOperand(0).getReg();
2679   const LLT S32 = LLT::scalar(32);
2680   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2681 
2682   Register Src0 = MI.getOperand(1).getReg();
2683   Register Src1 = MI.getOperand(2).getReg();
2684   assert(MRI.getType(Src0) == LLT::scalar(16));
2685 
2686   auto Merge = B.buildMerge(S32, {Src0, Src1});
2687   B.buildBitcast(Dst, Merge);
2688 
2689   MI.eraseFromParent();
2690   return true;
2691 }
2692 
2693 // Check that this is a G_XOR x, -1
2694 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2695   if (MI.getOpcode() != TargetOpcode::G_XOR)
2696     return false;
2697   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2698   return ConstVal && *ConstVal == -1;
2699 }
2700 
2701 // Return the use branch instruction, otherwise null if the usage is invalid.
2702 static MachineInstr *
2703 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2704                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2705   Register CondDef = MI.getOperand(0).getReg();
2706   if (!MRI.hasOneNonDBGUse(CondDef))
2707     return nullptr;
2708 
2709   MachineBasicBlock *Parent = MI.getParent();
2710   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2711 
2712   if (isNot(MRI, *UseMI)) {
2713     Register NegatedCond = UseMI->getOperand(0).getReg();
2714     if (!MRI.hasOneNonDBGUse(NegatedCond))
2715       return nullptr;
2716 
2717     // We're deleting the def of this value, so we need to remove it.
2718     UseMI->eraseFromParent();
2719 
2720     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2721     Negated = true;
2722   }
2723 
2724   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2725     return nullptr;
2726 
2727   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2728   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2729   if (Next == Parent->end()) {
2730     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2731     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2732       return nullptr;
2733     UncondBrTarget = &*NextMBB;
2734   } else {
2735     if (Next->getOpcode() != AMDGPU::G_BR)
2736       return nullptr;
2737     Br = &*Next;
2738     UncondBrTarget = Br->getOperand(0).getMBB();
2739   }
2740 
2741   return UseMI;
2742 }
2743 
2744 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2745                                          const ArgDescriptor *Arg,
2746                                          const TargetRegisterClass *ArgRC,
2747                                          LLT ArgTy) const {
2748   MCRegister SrcReg = Arg->getRegister();
2749   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2750   assert(DstReg.isVirtual() && "Virtual register expected");
2751 
2752   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2753                                              ArgTy);
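  // Some inputs (e.g. the packed workitem IDs) occupy only a bitfield of the
  // incoming physical register; shift and mask to extract the field.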
2754   if (Arg->isMasked()) {
2755     // TODO: Should we try to emit this once in the entry block?
2756     const LLT S32 = LLT::scalar(32);
2757     const unsigned Mask = Arg->getMask();
2758     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2759 
2760     Register AndMaskSrc = LiveIn;
2761 
2762     if (Shift != 0) {
2763       auto ShiftAmt = B.buildConstant(S32, Shift);
2764       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2765     }
2766 
2767     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2768   } else {
2769     B.buildCopy(DstReg, LiveIn);
2770   }
2771 
2772   return true;
2773 }
2774 
2775 bool AMDGPULegalizerInfo::loadInputValue(
2776     Register DstReg, MachineIRBuilder &B,
2777     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2778   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2779   const ArgDescriptor *Arg;
2780   const TargetRegisterClass *ArgRC;
2781   LLT ArgTy;
2782   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2783 
2784   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2785     return false; // TODO: Handle these
2786   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2787 }
2788 
2789 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2790     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2791     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2792   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2793     return false;
2794 
2795   MI.eraseFromParent();
2796   return true;
2797 }
2798 
2799 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2800                                        MachineRegisterInfo &MRI,
2801                                        MachineIRBuilder &B) const {
2802   Register Dst = MI.getOperand(0).getReg();
2803   LLT DstTy = MRI.getType(Dst);
2804   LLT S16 = LLT::scalar(16);
2805   LLT S32 = LLT::scalar(32);
2806   LLT S64 = LLT::scalar(64);
2807 
2808   if (DstTy == S16)
2809     return legalizeFDIV16(MI, MRI, B);
2810   if (DstTy == S32)
2811     return legalizeFDIV32(MI, MRI, B);
2812   if (DstTy == S64)
2813     return legalizeFDIV64(MI, MRI, B);
2814 
2815   return false;
2816 }
2817 
2818 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2819                                                         Register DstDivReg,
2820                                                         Register DstRemReg,
2821                                                         Register X,
2822                                                         Register Y) const {
2823   const LLT S1 = LLT::scalar(1);
2824   const LLT S32 = LLT::scalar(32);
2825 
2826   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2827   // algorithm used here.
2828 
2829   // Initial estimate of inv(y).
2830   auto FloatY = B.buildUITOFP(S32, Y);
2831   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2832   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2833   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2834   auto Z = B.buildFPTOUI(S32, ScaledY);
2835 
2836   // One round of UNR (unsigned Newton-Raphson) to refine the estimate.
2837   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2838   auto NegYZ = B.buildMul(S32, NegY, Z);
2839   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2840 
2841   // Quotient/remainder estimate.
2842   auto Q = B.buildUMulH(S32, X, Z);
2843   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2844 
2845   // First quotient/remainder refinement.
2846   auto One = B.buildConstant(S32, 1);
2847   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2848   if (DstDivReg)
2849     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2850   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2851 
2852   // Second quotient/remainder refinement.
2853   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2854   if (DstDivReg)
2855     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2856 
2857   if (DstRemReg)
2858     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2859 }
2860 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2862 //
2863 // Return lo, hi of result
2864 //
2865 // %cvt.lo = G_UITOFP Val.lo
2866 // %cvt.hi = G_UITOFP Val.hi
2867 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2868 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2869 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2870 // %mul2 = G_FMUL %mul1, 2**(-32)
2871 // %trunc = G_INTRINSIC_TRUNC %mul2
2872 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2873 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
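//
// In effect this approximates 2**64 / Val, returned split into 32-bit halves;
// the caller refines the estimate further. For reference, the float constants
// below are the single-precision bit patterns for 2**32 (0x4f800000), a value
// just under 2**64 (0x5f7ffffc), 2**(-32) (0x2f800000) and -(2**32)
// (0xcf800000).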
2874 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2875                                                        Register Val) {
2876   const LLT S32 = LLT::scalar(32);
2877   auto Unmerge = B.buildUnmerge(S32, Val);
2878 
2879   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2880   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2881 
2882   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2883                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2884 
2885   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2886   auto Mul1 =
2887       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2888 
2889   // 2**(-32)
2890   auto Mul2 =
2891       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2892   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2893 
2894   // -(2**32)
2895   auto Mad2 = B.buildFMAD(S32, Trunc,
2896                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2897 
2898   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2899   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2900 
2901   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2902 }
2903 
2904 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2905                                                         Register DstDivReg,
2906                                                         Register DstRemReg,
2907                                                         Register Numer,
2908                                                         Register Denom) const {
2909   const LLT S32 = LLT::scalar(32);
2910   const LLT S64 = LLT::scalar(64);
2911   const LLT S1 = LLT::scalar(1);
2912   Register RcpLo, RcpHi;
2913 
2914   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2915 
2916   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2917 
2918   auto Zero64 = B.buildConstant(S64, 0);
2919   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2920 
2921   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2922   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2923 
2924   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2925   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2926   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2927 
2928   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2929   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2930   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2931   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2932 
2933   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2934   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2935   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2936   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2937   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2938 
2939   auto Zero32 = B.buildConstant(S32, 0);
2940   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2941   auto Add2_HiC =
2942       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2943   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2944   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2945 
2946   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2947   Register NumerLo = UnmergeNumer.getReg(0);
2948   Register NumerHi = UnmergeNumer.getReg(1);
2949 
2950   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2951   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2952   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2953   Register Mul3_Lo = UnmergeMul3.getReg(0);
2954   Register Mul3_Hi = UnmergeMul3.getReg(1);
2955   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2956   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2957   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2958   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2959 
2960   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2961   Register DenomLo = UnmergeDenom.getReg(0);
2962   Register DenomHi = UnmergeDenom.getReg(1);
2963 
2964   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2965   auto C1 = B.buildSExt(S32, CmpHi);
2966 
2967   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2968   auto C2 = B.buildSExt(S32, CmpLo);
2969 
2970   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2971   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2972 
2973   // TODO: Here and below portions of the code can be enclosed into if/endif.
2974   // Currently control flow is unconditional and we have 4 selects after
2975   // potential endif to substitute PHIs.
2976 
2977   // if C3 != 0 ...
2978   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2979   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2980   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2981   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2982 
2983   auto One64 = B.buildConstant(S64, 1);
2984   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2985 
2986   auto C4 =
2987       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2988   auto C5 =
2989       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2990   auto C6 = B.buildSelect(
2991       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2992 
2993   // if (C6 != 0)
2994   auto Add4 = B.buildAdd(S64, Add3, One64);
2995   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2996 
2997   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2998   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2999   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3000 
3001   // endif C6
3002   // endif C3
3003 
3004   if (DstDivReg) {
3005     auto Sel1 = B.buildSelect(
3006         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3007     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3008                   Sel1, MulHi3);
3009   }
3010 
3011   if (DstRemReg) {
3012     auto Sel2 = B.buildSelect(
3013         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3014     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3015                   Sel2, Sub1);
3016   }
3017 }
3018 
3019 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3020                                                   MachineRegisterInfo &MRI,
3021                                                   MachineIRBuilder &B) const {
3022   Register DstDivReg, DstRemReg;
3023   switch (MI.getOpcode()) {
3024   default:
3025     llvm_unreachable("Unexpected opcode!");
3026   case AMDGPU::G_UDIV: {
3027     DstDivReg = MI.getOperand(0).getReg();
3028     break;
3029   }
3030   case AMDGPU::G_UREM: {
3031     DstRemReg = MI.getOperand(0).getReg();
3032     break;
3033   }
3034   case AMDGPU::G_UDIVREM: {
3035     DstDivReg = MI.getOperand(0).getReg();
3036     DstRemReg = MI.getOperand(1).getReg();
3037     break;
3038   }
3039   }
3040 
3041   const LLT S64 = LLT::scalar(64);
3042   const LLT S32 = LLT::scalar(32);
3043   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3044   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3045   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3046   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3047 
3048   if (Ty == S32)
3049     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3050   else if (Ty == S64)
3051     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3052   else
3053     return false;
3054 
3055   MI.eraseFromParent();
3056   return true;
3057 }
3058 
3059 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3060                                                 MachineRegisterInfo &MRI,
3061                                                 MachineIRBuilder &B) const {
3062   const LLT S64 = LLT::scalar(64);
3063   const LLT S32 = LLT::scalar(32);
3064 
3065   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3066   if (Ty != S32 && Ty != S64)
3067     return false;
3068 
3069   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3070   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3071   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3072 
3073   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3074   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3075   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
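  // LHSign/RHSign are all-ones for negative inputs and zero otherwise. The
  // (x + sign) ^ sign trick below takes absolute values; the signs are
  // re-applied to the unsigned division results afterwards.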
3076 
3077   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3078   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3079 
3080   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3081   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3082 
3083   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3084   switch (MI.getOpcode()) {
3085   default:
3086     llvm_unreachable("Unexpected opcode!");
3087   case AMDGPU::G_SDIV: {
3088     DstDivReg = MI.getOperand(0).getReg();
3089     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3090     break;
3091   }
3092   case AMDGPU::G_SREM: {
3093     DstRemReg = MI.getOperand(0).getReg();
3094     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3095     break;
3096   }
3097   case AMDGPU::G_SDIVREM: {
3098     DstDivReg = MI.getOperand(0).getReg();
3099     DstRemReg = MI.getOperand(1).getReg();
3100     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3101     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3102     break;
3103   }
3104   }
3105 
3106   if (Ty == S32)
3107     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3108   else
3109     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3110 
3111   if (DstDivReg) {
3112     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3113     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3114     B.buildSub(DstDivReg, SignXor, Sign);
3115   }
3116 
3117   if (DstRemReg) {
3118     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3119     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3120     B.buildSub(DstRemReg, SignXor, Sign);
3121   }
3122 
3123   MI.eraseFromParent();
3124   return true;
3125 }
3126 
3127 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3128                                                  MachineRegisterInfo &MRI,
3129                                                  MachineIRBuilder &B) const {
3130   Register Res = MI.getOperand(0).getReg();
3131   Register LHS = MI.getOperand(1).getReg();
3132   Register RHS = MI.getOperand(2).getReg();
3133   uint16_t Flags = MI.getFlags();
3134   LLT ResTy = MRI.getType(Res);
3135 
3136   const MachineFunction &MF = B.getMF();
3137   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3138                             MI.getFlag(MachineInstr::FmAfn);
3139 
3140   if (!AllowInaccurateRcp)
3141     return false;
3142 
3143   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3144     // 1 / x -> RCP(x)
3145     if (CLHS->isExactlyValue(1.0)) {
3146       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3147         .addUse(RHS)
3148         .setMIFlags(Flags);
3149 
3150       MI.eraseFromParent();
3151       return true;
3152     }
3153 
3154     // -1 / x -> RCP( FNEG(x) )
3155     if (CLHS->isExactlyValue(-1.0)) {
3156       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3157       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3158         .addUse(FNeg.getReg(0))
3159         .setMIFlags(Flags);
3160 
3161       MI.eraseFromParent();
3162       return true;
3163     }
3164   }
3165 
3166   // x / y -> x * (1.0 / y)
3167   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3168     .addUse(RHS)
3169     .setMIFlags(Flags);
3170   B.buildFMul(Res, LHS, RCP, Flags);
3171 
3172   MI.eraseFromParent();
3173   return true;
3174 }
3175 
3176 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3177                                                    MachineRegisterInfo &MRI,
3178                                                    MachineIRBuilder &B) const {
3179   Register Res = MI.getOperand(0).getReg();
3180   Register X = MI.getOperand(1).getReg();
3181   Register Y = MI.getOperand(2).getReg();
3182   uint16_t Flags = MI.getFlags();
3183   LLT ResTy = MRI.getType(Res);
3184 
3185   const MachineFunction &MF = B.getMF();
3186   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3187                             MI.getFlag(MachineInstr::FmAfn);
3188 
3189   if (!AllowInaccurateRcp)
3190     return false;
3191 
3192   auto NegY = B.buildFNeg(ResTy, Y);
3193   auto One = B.buildFConstant(ResTy, 1.0);
3194 
3195   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3196     .addUse(Y)
3197     .setMIFlags(Flags);
3198 
3199   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3200   R = B.buildFMA(ResTy, Tmp0, R, R);
3201 
3202   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3203   R = B.buildFMA(ResTy, Tmp1, R, R);
3204 
3205   auto Ret = B.buildFMul(ResTy, X, R);
3206   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3207 
3208   B.buildFMA(Res, Tmp2, R, Ret);
3209   MI.eraseFromParent();
3210   return true;
3211 }
3212 
3213 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3214                                          MachineRegisterInfo &MRI,
3215                                          MachineIRBuilder &B) const {
3216   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3217     return true;
3218 
3219   Register Res = MI.getOperand(0).getReg();
3220   Register LHS = MI.getOperand(1).getReg();
3221   Register RHS = MI.getOperand(2).getReg();
3222 
3223   uint16_t Flags = MI.getFlags();
3224 
3225   LLT S16 = LLT::scalar(16);
3226   LLT S32 = LLT::scalar(32);
3227 
3228   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3229   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3230 
3231   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3232     .addUse(RHSExt.getReg(0))
3233     .setMIFlags(Flags);
3234 
3235   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3236   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3237 
3238   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3239     .addUse(RDst.getReg(0))
3240     .addUse(RHS)
3241     .addUse(LHS)
3242     .setMIFlags(Flags);
3243 
3244   MI.eraseFromParent();
3245   return true;
3246 }
3247 
3248 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3249 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
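//
// Two encodings are used below (described from the operands built here, not a
// full ISA reference): with S_DENORM_MODE, the 4-bit immediate holds the FP32
// denorm field in bits [1:0] and the FP64/FP16 field in bits [3:2]; otherwise
// the FP32 field of the MODE hardware register (bit offset 4, width 2) is
// written with S_SETREG_IMM32_B32.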
3250 static void toggleSPDenormMode(bool Enable,
3251                                MachineIRBuilder &B,
3252                                const GCNSubtarget &ST,
3253                                AMDGPU::SIModeRegisterDefaults Mode) {
3254   // Set SP denorm mode to this value.
3255   unsigned SPDenormMode =
3256     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3257 
3258   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating FP32 mode.
3260     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3261 
3262     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3263     B.buildInstr(AMDGPU::S_DENORM_MODE)
3264       .addImm(NewDenormModeValue);
3265 
3266   } else {
3267     // Select FP32 bit field in mode register.
3268     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3269                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3270                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3271 
3272     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3273       .addImm(SPDenormMode)
3274       .addImm(SPDenormModeBitField);
3275   }
3276 }
3277 
3278 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3279                                          MachineRegisterInfo &MRI,
3280                                          MachineIRBuilder &B) const {
3281   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3282     return true;
3283 
3284   Register Res = MI.getOperand(0).getReg();
3285   Register LHS = MI.getOperand(1).getReg();
3286   Register RHS = MI.getOperand(2).getReg();
3287   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3288   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3289 
3290   uint16_t Flags = MI.getFlags();
3291 
3292   LLT S32 = LLT::scalar(32);
3293   LLT S1 = LLT::scalar(1);
3294 
3295   auto One = B.buildFConstant(S32, 1.0f);
3296 
3297   auto DenominatorScaled =
3298     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3299       .addUse(LHS)
3300       .addUse(RHS)
3301       .addImm(0)
3302       .setMIFlags(Flags);
3303   auto NumeratorScaled =
3304     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3305       .addUse(LHS)
3306       .addUse(RHS)
3307       .addImm(1)
3308       .setMIFlags(Flags);
3309 
3310   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3311     .addUse(DenominatorScaled.getReg(0))
3312     .setMIFlags(Flags);
3313   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3314 
3315   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3316   // aren't modeled as reading it.
3317   if (!Mode.allFP32Denormals())
3318     toggleSPDenormMode(true, B, ST, Mode);
3319 
3320   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3321   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3322   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3323   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3324   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3325   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3326 
3327   if (!Mode.allFP32Denormals())
3328     toggleSPDenormMode(false, B, ST, Mode);
3329 
3330   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3331     .addUse(Fma4.getReg(0))
3332     .addUse(Fma1.getReg(0))
3333     .addUse(Fma3.getReg(0))
3334     .addUse(NumeratorScaled.getReg(1))
3335     .setMIFlags(Flags);
3336 
3337   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3338     .addUse(Fmas.getReg(0))
3339     .addUse(RHS)
3340     .addUse(LHS)
3341     .setMIFlags(Flags);
3342 
3343   MI.eraseFromParent();
3344   return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3348                                          MachineRegisterInfo &MRI,
3349                                          MachineIRBuilder &B) const {
3350   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3351     return true;
3352 
3353   Register Res = MI.getOperand(0).getReg();
3354   Register LHS = MI.getOperand(1).getReg();
3355   Register RHS = MI.getOperand(2).getReg();
3356 
3357   uint16_t Flags = MI.getFlags();
3358 
3359   LLT S64 = LLT::scalar(64);
3360   LLT S1 = LLT::scalar(1);
3361 
3362   auto One = B.buildFConstant(S64, 1.0);
3363 
3364   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3365     .addUse(LHS)
3366     .addUse(RHS)
3367     .addImm(0)
3368     .setMIFlags(Flags);
3369 
3370   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3371 
3372   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3373     .addUse(DivScale0.getReg(0))
3374     .setMIFlags(Flags);
3375 
3376   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3377   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3378   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3379 
3380   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3381     .addUse(LHS)
3382     .addUse(RHS)
3383     .addImm(1)
3384     .setMIFlags(Flags);
3385 
3386   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3387   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3388   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3389 
3390   Register Scale;
3391   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3394 
3395     LLT S32 = LLT::scalar(32);
3396 
3397     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3398     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3399     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3400     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3401 
3402     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3403                               Scale1Unmerge.getReg(1));
3404     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3405                               Scale0Unmerge.getReg(1));
3406     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3407   } else {
3408     Scale = DivScale1.getReg(1);
3409   }
3410 
3411   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3412     .addUse(Fma4.getReg(0))
3413     .addUse(Fma3.getReg(0))
3414     .addUse(Mul.getReg(0))
3415     .addUse(Scale)
3416     .setMIFlags(Flags);
3417 
3418   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3419     .addUse(Fmas.getReg(0))
3420     .addUse(RHS)
3421     .addUse(LHS)
3422     .setMIFlags(Flags);
3423 
3424   MI.eraseFromParent();
3425   return true;
3426 }
3427 
3428 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3429                                                  MachineRegisterInfo &MRI,
3430                                                  MachineIRBuilder &B) const {
3431   Register Res = MI.getOperand(0).getReg();
3432   Register LHS = MI.getOperand(2).getReg();
3433   Register RHS = MI.getOperand(3).getReg();
3434   uint16_t Flags = MI.getFlags();
3435 
3436   LLT S32 = LLT::scalar(32);
3437   LLT S1 = LLT::scalar(1);
3438 
3439   auto Abs = B.buildFAbs(S32, RHS, Flags);
3440   const APFloat C0Val(1.0f);
3441 
3442   auto C0 = B.buildConstant(S32, 0x6f800000);
3443   auto C1 = B.buildConstant(S32, 0x2f800000);
3444   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3445 
3446   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3447   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3448 
3449   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3450 
3451   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3452     .addUse(Mul0.getReg(0))
3453     .setMIFlags(Flags);
3454 
3455   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3456 
3457   B.buildFMul(Res, Sel, Mul1, Flags);
3458 
3459   MI.eraseFromParent();
3460   return true;
3461 }
3462 
3463 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3464 // FIXME: Why do we handle this one but not other removed instructions?
3465 //
3466 // Reciprocal square root.  The clamp prevents infinite results, clamping
3467 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3468 // +-max_float.
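//
// The expansion below is effectively (a sketch):
//   %rsq    = llvm.amdgcn.rsq(%src)
//   %lo     = fminnum(%rsq, +max_float)   ; *_IEEE variants if IEEE mode is on
//   %result = fmaxnum(%lo, -max_float)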
3469 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3470                                                     MachineRegisterInfo &MRI,
3471                                                     MachineIRBuilder &B) const {
3472   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3473     return true;
3474 
3475   Register Dst = MI.getOperand(0).getReg();
3476   Register Src = MI.getOperand(2).getReg();
3477   auto Flags = MI.getFlags();
3478 
3479   LLT Ty = MRI.getType(Dst);
3480 
3481   const fltSemantics *FltSemantics;
3482   if (Ty == LLT::scalar(32))
3483     FltSemantics = &APFloat::IEEEsingle();
3484   else if (Ty == LLT::scalar(64))
3485     FltSemantics = &APFloat::IEEEdouble();
3486   else
3487     return false;
3488 
3489   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3490     .addUse(Src)
3491     .setMIFlags(Flags);
3492 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq has already quieted the result (or not); just use the
  // min/max variant which will select directly.
3495   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3496   const bool UseIEEE = MFI->getMode().IEEE;
3497 
3498   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3499   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3500                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3501 
3502   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3503 
3504   if (UseIEEE)
3505     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3506   else
3507     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3508   MI.eraseFromParent();
3509   return true;
3510 }
3511 
3512 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3513   switch (IID) {
3514   case Intrinsic::amdgcn_ds_fadd:
3515     return AMDGPU::G_ATOMICRMW_FADD;
3516   case Intrinsic::amdgcn_ds_fmin:
3517     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3518   case Intrinsic::amdgcn_ds_fmax:
3519     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3520   default:
3521     llvm_unreachable("not a DS FP intrinsic");
3522   }
3523 }
3524 
3525 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3526                                                       MachineInstr &MI,
3527                                                       Intrinsic::ID IID) const {
3528   GISelChangeObserver &Observer = Helper.Observer;
3529   Observer.changingInstr(MI);
3530 
3531   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3532 
3533   // The remaining operands were used to set fields in the MemOperand on
3534   // construction.
3535   for (int I = 6; I > 3; --I)
3536     MI.RemoveOperand(I);
3537 
3538   MI.RemoveOperand(1); // Remove the intrinsic ID.
3539   Observer.changedInstr(MI);
3540   return true;
3541 }
3542 
3543 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3544                                             MachineRegisterInfo &MRI,
3545                                             MachineIRBuilder &B) const {
3546   uint64_t Offset =
3547     ST.getTargetLowering()->getImplicitParameterOffset(
3548       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3549   LLT DstTy = MRI.getType(DstReg);
3550   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3551 
3552   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3553   if (!loadInputValue(KernargPtrReg, B,
3554                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3555     return false;
3556 
3557   // FIXME: This should be nuw
3558   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3559   return true;
3560 }
3561 
3562 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3563                                                  MachineRegisterInfo &MRI,
3564                                                  MachineIRBuilder &B) const {
3565   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3566   if (!MFI->isEntryFunction()) {
3567     return legalizePreloadedArgIntrin(MI, MRI, B,
3568                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3569   }
3570 
3571   Register DstReg = MI.getOperand(0).getReg();
3572   if (!getImplicitArgPtr(DstReg, MRI, B))
3573     return false;
3574 
3575   MI.eraseFromParent();
3576   return true;
3577 }
3578 
3579 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3580                                               MachineRegisterInfo &MRI,
3581                                               MachineIRBuilder &B,
3582                                               unsigned AddrSpace) const {
3583   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3584   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3585   Register Hi32 = Unmerge.getReg(1);
3586 
3587   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3588   MI.eraseFromParent();
3589   return true;
3590 }
3591 
3592 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3593 // offset (the offset that is included in bounds checking and swizzling, to be
3594 // split between the instruction's voffset and immoffset fields) and soffset
3595 // (the offset that is excluded from bounds checking and swizzling, to go in
3596 // the instruction's soffset field).  This function takes the first kind of
3597 // offset and figures out how to split it between voffset and immoffset.
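//
// For example (a hypothetical input, given the 4095 limit used below), an
// incoming offset of %base + 4100 is split into a voffset of %base + 4096 and
// an immoffset of 4, i.e. the returned tuple is {%base + 4096, 4, 4100}.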
3598 std::tuple<Register, unsigned, unsigned>
3599 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3600                                         Register OrigOffset) const {
3601   const unsigned MaxImm = 4095;
3602   Register BaseReg;
3603   unsigned TotalConstOffset;
3604   const LLT S32 = LLT::scalar(32);
3605   MachineRegisterInfo &MRI = *B.getMRI();
3606 
3607   std::tie(BaseReg, TotalConstOffset) =
3608       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3609 
3610   unsigned ImmOffset = TotalConstOffset;
3611 
3612   // If BaseReg is a pointer, convert it to int.
3613   if (MRI.getType(BaseReg).isPointer())
3614     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3615 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits in the immoffset field (i.e. subtract a multiple of 4096) so
  // that the value that is copied/added for the voffset field is a multiple
  // of 4096, and it stands more chance of being CSEd with the copy/add for
  // another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
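  // For instance (a hypothetical value), a constant offset of -16 stays
  // entirely in the voffset and the returned immoffset is 0.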
3623   unsigned Overflow = ImmOffset & ~MaxImm;
3624   ImmOffset -= Overflow;
3625   if ((int32_t)Overflow < 0) {
3626     Overflow += ImmOffset;
3627     ImmOffset = 0;
3628   }
3629 
3630   if (Overflow != 0) {
3631     if (!BaseReg) {
3632       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3633     } else {
3634       auto OverflowVal = B.buildConstant(S32, Overflow);
3635       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3636     }
3637   }
3638 
3639   if (!BaseReg)
3640     BaseReg = B.buildConstant(S32, 0).getReg(0);
3641 
3642   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3643 }
3644 
3645 /// Handle register layout difference for f16 images for some subtargets.
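///
/// For example, on a subtarget with unpacked d16 VMEM, a <4 x s16> source is
/// rewritten as a <4 x s32> whose elements are the any-extended s16 values (a
/// sketch of the common case; the image-store-bug paths below differ).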
3646 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3647                                              MachineRegisterInfo &MRI,
3648                                              Register Reg,
3649                                              bool ImageStore) const {
3650   const LLT S16 = LLT::scalar(16);
3651   const LLT S32 = LLT::scalar(32);
3652   LLT StoreVT = MRI.getType(Reg);
3653   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3654 
3655   if (ST.hasUnpackedD16VMem()) {
3656     auto Unmerge = B.buildUnmerge(S16, Reg);
3657 
3658     SmallVector<Register, 4> WideRegs;
3659     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3660       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3661 
3662     int NumElts = StoreVT.getNumElements();
3663 
3664     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3665   }
3666 
3667   if (ImageStore && ST.hasImageStoreD16Bug()) {
3668     if (StoreVT.getNumElements() == 2) {
3669       SmallVector<Register, 4> PackedRegs;
3670       Reg = B.buildBitcast(S32, Reg).getReg(0);
3671       PackedRegs.push_back(Reg);
3672       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3673       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3674     }
3675 
3676     if (StoreVT.getNumElements() == 3) {
3677       SmallVector<Register, 4> PackedRegs;
3678       auto Unmerge = B.buildUnmerge(S16, Reg);
3679       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3680         PackedRegs.push_back(Unmerge.getReg(I));
3681       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3682       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3683       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3684     }
3685 
3686     if (StoreVT.getNumElements() == 4) {
3687       SmallVector<Register, 4> PackedRegs;
3688       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3689       auto Unmerge = B.buildUnmerge(S32, Reg);
3690       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3691         PackedRegs.push_back(Unmerge.getReg(I));
3692       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3693       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3694     }
3695 
3696     llvm_unreachable("invalid data type");
3697   }
3698 
3699   return Reg;
3700 }
3701 
3702 Register AMDGPULegalizerInfo::fixStoreSourceType(
3703   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3704   MachineRegisterInfo *MRI = B.getMRI();
3705   LLT Ty = MRI->getType(VData);
3706 
3707   const LLT S16 = LLT::scalar(16);
3708 
3709   // Fixup illegal register types for i8 stores.
3710   if (Ty == LLT::scalar(8) || Ty == S16) {
3711     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3712     return AnyExt;
3713   }
3714 
3715   if (Ty.isVector()) {
3716     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3717       if (IsFormat)
3718         return handleD16VData(B, *MRI, VData);
3719     }
3720   }
3721 
3722   return VData;
3723 }
3724 
3725 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3726                                               MachineRegisterInfo &MRI,
3727                                               MachineIRBuilder &B,
3728                                               bool IsTyped,
3729                                               bool IsFormat) const {
3730   Register VData = MI.getOperand(1).getReg();
3731   LLT Ty = MRI.getType(VData);
3732   LLT EltTy = Ty.getScalarType();
3733   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3734   const LLT S32 = LLT::scalar(32);
3735 
3736   VData = fixStoreSourceType(B, VData, IsFormat);
3737   Register RSrc = MI.getOperand(2).getReg();
3738 
3739   MachineMemOperand *MMO = *MI.memoperands_begin();
3740   const int MemSize = MMO->getSize();
3741 
3742   unsigned ImmOffset;
3743   unsigned TotalOffset;
3744 
3745   // The typed intrinsics add an immediate after the registers.
3746   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3747 
3748   // The struct intrinsic variants add one additional operand over raw.
3749   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3750   Register VIndex;
3751   int OpOffset = 0;
3752   if (HasVIndex) {
3753     VIndex = MI.getOperand(3).getReg();
3754     OpOffset = 1;
3755   }
3756 
3757   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3758   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3759 
3760   unsigned Format = 0;
3761   if (IsTyped) {
3762     Format = MI.getOperand(5 + OpOffset).getImm();
3763     ++OpOffset;
3764   }
3765 
3766   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3767 
3768   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3769   if (TotalOffset != 0)
3770     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3771 
3772   unsigned Opc;
3773   if (IsTyped) {
3774     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3775                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3776   } else if (IsFormat) {
3777     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3778                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3779   } else {
3780     switch (MemSize) {
3781     case 1:
3782       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3783       break;
3784     case 2:
3785       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3786       break;
3787     default:
3788       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3789       break;
3790     }
3791   }
3792 
3793   if (!VIndex)
3794     VIndex = B.buildConstant(S32, 0).getReg(0);
3795 
3796   auto MIB = B.buildInstr(Opc)
3797     .addUse(VData)              // vdata
3798     .addUse(RSrc)               // rsrc
3799     .addUse(VIndex)             // vindex
3800     .addUse(VOffset)            // voffset
3801     .addUse(SOffset)            // soffset
3802     .addImm(ImmOffset);         // offset(imm)
3803 
3804   if (IsTyped)
3805     MIB.addImm(Format);
3806 
3807   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3808      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3809      .addMemOperand(MMO);
3810 
3811   MI.eraseFromParent();
3812   return true;
3813 }
3814 
3815 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3816                                              MachineRegisterInfo &MRI,
3817                                              MachineIRBuilder &B,
3818                                              bool IsFormat,
3819                                              bool IsTyped) const {
3820   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3821   MachineMemOperand *MMO = *MI.memoperands_begin();
3822   const int MemSize = MMO->getSize();
3823   const LLT S32 = LLT::scalar(32);
3824 
3825   Register Dst = MI.getOperand(0).getReg();
3826   Register RSrc = MI.getOperand(2).getReg();
3827 
3828   // The typed intrinsics add an immediate after the registers.
3829   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3830 
3831   // The struct intrinsic variants add one additional operand over raw.
3832   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3833   Register VIndex;
3834   int OpOffset = 0;
3835   if (HasVIndex) {
3836     VIndex = MI.getOperand(3).getReg();
3837     OpOffset = 1;
3838   }
3839 
3840   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3841   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3842 
3843   unsigned Format = 0;
3844   if (IsTyped) {
3845     Format = MI.getOperand(5 + OpOffset).getImm();
3846     ++OpOffset;
3847   }
3848 
3849   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3850   unsigned ImmOffset;
3851   unsigned TotalOffset;
3852 
3853   LLT Ty = MRI.getType(Dst);
3854   LLT EltTy = Ty.getScalarType();
3855   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3856   const bool Unpacked = ST.hasUnpackedD16VMem();
3857 
3858   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3859   if (TotalOffset != 0)
3860     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3861 
3862   unsigned Opc;
3863 
3864   if (IsTyped) {
3865     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3866                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3867   } else if (IsFormat) {
3868     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3869                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3870   } else {
3871     switch (MemSize) {
3872     case 1:
3873       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3874       break;
3875     case 2:
3876       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3877       break;
3878     default:
3879       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3880       break;
3881     }
3882   }
3883 
3884   Register LoadDstReg;
3885 
3886   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3887   LLT UnpackedTy = Ty.changeElementSize(32);
3888 
3889   if (IsExtLoad)
3890     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3891   else if (Unpacked && IsD16 && Ty.isVector())
3892     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3893   else
3894     LoadDstReg = Dst;
3895 
3896   if (!VIndex)
3897     VIndex = B.buildConstant(S32, 0).getReg(0);
3898 
3899   auto MIB = B.buildInstr(Opc)
3900     .addDef(LoadDstReg)         // vdata
3901     .addUse(RSrc)               // rsrc
3902     .addUse(VIndex)             // vindex
3903     .addUse(VOffset)            // voffset
3904     .addUse(SOffset)            // soffset
3905     .addImm(ImmOffset);         // offset(imm)
3906 
3907   if (IsTyped)
3908     MIB.addImm(Format);
3909 
3910   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3911      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3912      .addMemOperand(MMO);
3913 
3914   if (LoadDstReg != Dst) {
3915     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3916 
    // The result of an extending load was widened; truncate it back down.
3918     if (IsExtLoad)
3919       B.buildTrunc(Dst, LoadDstReg);
3920     else {
3921       // Repack to original 16-bit vector result
3922       // FIXME: G_TRUNC should work, but legalization currently fails
3923       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3924       SmallVector<Register, 4> Repack;
3925       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3926         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3927       B.buildMerge(Dst, Repack);
3928     }
3929   }
3930 
3931   MI.eraseFromParent();
3932   return true;
3933 }
3934 
3935 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3936                                                MachineIRBuilder &B,
3937                                                bool IsInc) const {
3938   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3939                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3940   B.buildInstr(Opc)
3941     .addDef(MI.getOperand(0).getReg())
3942     .addUse(MI.getOperand(2).getReg())
3943     .addUse(MI.getOperand(3).getReg())
3944     .cloneMemRefs(MI);
3945   MI.eraseFromParent();
3946   return true;
3947 }
3948 
3949 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3950   switch (IntrID) {
3951   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3952   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3953     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3954   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3955   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3956     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3957   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3958   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3959     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3960   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3961   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3962     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3963   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3964   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3965     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3966   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3967   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3968     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3969   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3970   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3971     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3972   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3973   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3974     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3975   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3976   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3977     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3978   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3979   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3980     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3981   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3982   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3983     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3984   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3985   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3986     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3987   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3988   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3989     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3990   case Intrinsic::amdgcn_buffer_atomic_fadd:
3991   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3992   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3993     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3994   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3995   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3996     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3997   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3998   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3999     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4000   default:
4001     llvm_unreachable("unhandled atomic opcode");
4002   }
4003 }
4004 
4005 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4006                                                MachineIRBuilder &B,
4007                                                Intrinsic::ID IID) const {
4008   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4009                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4010   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4011 
4012   Register Dst;
4013 
4014   int OpOffset = 0;
4015   if (HasReturn) {
4016     // A few FP atomics do not support return values.
4017     Dst = MI.getOperand(0).getReg();
4018   } else {
4019     OpOffset = -1;
4020   }
4021 
4022   Register VData = MI.getOperand(2 + OpOffset).getReg();
4023   Register CmpVal;
4024 
4025   if (IsCmpSwap) {
4026     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4027     ++OpOffset;
4028   }
4029 
4030   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4031   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4032 
4033   // The struct intrinsic variants add one additional operand over raw.
4034   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4035   Register VIndex;
4036   if (HasVIndex) {
4037     VIndex = MI.getOperand(4 + OpOffset).getReg();
4038     ++OpOffset;
4039   }
4040 
4041   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4042   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4043   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4044 
4045   MachineMemOperand *MMO = *MI.memoperands_begin();
4046 
4047   unsigned ImmOffset;
4048   unsigned TotalOffset;
4049   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4050   if (TotalOffset != 0)
4051     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4052 
4053   if (!VIndex)
4054     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4055 
4056   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4057 
4058   if (HasReturn)
4059     MIB.addDef(Dst);
4060 
4061   MIB.addUse(VData); // vdata
4062 
4063   if (IsCmpSwap)
4064     MIB.addReg(CmpVal);
4065 
4066   MIB.addUse(RSrc)               // rsrc
4067      .addUse(VIndex)             // vindex
4068      .addUse(VOffset)            // voffset
4069      .addUse(SOffset)            // soffset
4070      .addImm(ImmOffset)          // offset(imm)
4071      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4072      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4073      .addMemOperand(MMO);
4074 
4075   MI.eraseFromParent();
4076   return true;
4077 }
4078 
/// Turn the set of s16 typed address operands of \p MI into dword sized
/// vectors with s16 typed elements, collected in \p PackedAddrs.
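///
/// For example (a sketch), three s16 coordinates x, y, z become the two dwords
/// <2 x s16>(x, y) and <2 x s16>(z, undef), while gradient or coordinate
/// operands excluded by \p IsG16 / \p IsA16 are passed through unpacked.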
4081 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4082                                       SmallVectorImpl<Register> &PackedAddrs,
4083                                       unsigned ArgOffset,
4084                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4085                                       bool IsA16, bool IsG16) {
4086   const LLT S16 = LLT::scalar(16);
4087   const LLT V2S16 = LLT::vector(2, 16);
4088   auto EndIdx = Intr->VAddrEnd;
4089 
4090   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4091     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4092     if (!SrcOp.isReg())
4093       continue; // _L to _LZ may have eliminated this.
4094 
4095     Register AddrReg = SrcOp.getReg();
4096 
4097     if (I < Intr->GradientStart) {
4098       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4099       PackedAddrs.push_back(AddrReg);
4100     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4101                (I >= Intr->CoordStart && !IsA16)) {
4102       // Handle any gradient or coordinate operands that should not be packed
4103       PackedAddrs.push_back(AddrReg);
4104     } else {
      // dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, the derivatives dx/dh and dx/dv are packed with undef.
4107       if (((I + 1) >= EndIdx) ||
4108           ((Intr->NumGradients / 2) % 2 == 1 &&
4109            (I == static_cast<unsigned>(Intr->GradientStart +
4110                                        (Intr->NumGradients / 2) - 1) ||
4111             I == static_cast<unsigned>(Intr->GradientStart +
4112                                        Intr->NumGradients - 1))) ||
4113           // Check for _L to _LZ optimization
4114           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4115         PackedAddrs.push_back(
4116             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4117                 .getReg(0));
4118       } else {
4119         PackedAddrs.push_back(
4120             B.buildBuildVector(
4121                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4122                 .getReg(0));
4123         ++I;
4124       }
4125     }
4126   }
4127 }
4128 
4129 /// Convert from separate vaddr components to a single vector address register,
4130 /// and replace the remaining operands with $noreg.
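///
/// For example (a sketch), five s32 vaddr operands are padded with undef to
/// eight and combined into a single <8 x s32> build_vector, after which the
/// trailing operands are set to $noreg.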
4131 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4132                                      int DimIdx, int NumVAddrs) {
4133   const LLT S32 = LLT::scalar(32);
4134 
4135   SmallVector<Register, 8> AddrRegs;
4136   for (int I = 0; I != NumVAddrs; ++I) {
4137     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4138     if (SrcOp.isReg()) {
4139       AddrRegs.push_back(SrcOp.getReg());
4140       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4141     }
4142   }
4143 
4144   int NumAddrRegs = AddrRegs.size();
4145   if (NumAddrRegs != 1) {
4146     // Round up to 8 elements for v5-v7
4147     // FIXME: Missing intermediate sized register classes and instructions.
4148     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4149       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4150       auto Undef = B.buildUndef(S32);
4151       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4152       NumAddrRegs = RoundedNumRegs;
4153     }
4154 
4155     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4156     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4157   }
4158 
4159   for (int I = 1; I != NumVAddrs; ++I) {
4160     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4161     if (SrcOp.isReg())
4162       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4163   }
4164 }
4165 
4166 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4167 ///
4168 /// Depending on the subtarget, load/store with 16-bit element data need to be
4169 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4170 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4171 /// registers.
4172 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
4179 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4180     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4181     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4182 
4183   const unsigned NumDefs = MI.getNumExplicitDefs();
4184   const unsigned ArgOffset = NumDefs + 1;
4185   bool IsTFE = NumDefs == 2;
4186   // We are only processing the operands of d16 image operations on subtargets
4187   // that use the unpacked register layout, or need to repack the TFE result.
4188 
4189   // TODO: Do we need to guard against already legalized intrinsics?
4190   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4191       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4192 
4193   MachineRegisterInfo *MRI = B.getMRI();
4194   const LLT S32 = LLT::scalar(32);
4195   const LLT S16 = LLT::scalar(16);
4196   const LLT V2S16 = LLT::vector(2, 16);
4197 
4198   unsigned DMask = 0;
4199 
4200   // Check for 16 bit addresses and pack if true.
4201   LLT GradTy =
4202       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4203   LLT AddrTy =
4204       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4205   const bool IsG16 = GradTy == S16;
4206   const bool IsA16 = AddrTy == S16;
4207 
4208   int DMaskLanes = 0;
4209   if (!BaseOpcode->Atomic) {
4210     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4211     if (BaseOpcode->Gather4) {
4212       DMaskLanes = 4;
4213     } else if (DMask != 0) {
4214       DMaskLanes = countPopulation(DMask);
4215     } else if (!IsTFE && !BaseOpcode->Store) {
4216       // If dmask is 0, this is a no-op load. This can be eliminated.
4217       B.buildUndef(MI.getOperand(0));
4218       MI.eraseFromParent();
4219       return true;
4220     }
4221   }
4222 
4223   Observer.changingInstr(MI);
4224   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4225 
4226   unsigned NewOpcode = NumDefs == 0 ?
4227     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4228 
4229   // Track that we legalized this
4230   MI.setDesc(B.getTII().get(NewOpcode));
4231 
  // We expect to get an error flag since TFC is on and dmask is 0. Force the
  // dmask to be at least 1, otherwise the instruction will fail.
4234   if (IsTFE && DMask == 0) {
4235     DMask = 0x1;
4236     DMaskLanes = 1;
4237     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4238   }
4239 
4240   if (BaseOpcode->Atomic) {
4241     Register VData0 = MI.getOperand(2).getReg();
4242     LLT Ty = MRI->getType(VData0);
4243 
4244     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4245     if (Ty.isVector())
4246       return false;
4247 
4248     if (BaseOpcode->AtomicX2) {
4249       Register VData1 = MI.getOperand(3).getReg();
4250       // The two values are packed in one register.
4251       LLT PackedTy = LLT::vector(2, Ty);
4252       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4253       MI.getOperand(2).setReg(Concat.getReg(0));
4254       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4255     }
4256   }
4257 
4258   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4259 
4260   // Optimize _L to _LZ when _L is zero
4261   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4262           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4263     const ConstantFP *ConstantLod;
4264 
4265     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4266                  m_GFCst(ConstantLod))) {
4267       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4268         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4269         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4270             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4271                                                       Intr->Dim);
4272 
4273         // The starting indexes should remain in the same place.
4274         --CorrectedNumVAddrs;
4275 
4276         MI.getOperand(MI.getNumExplicitDefs())
4277             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4278         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4279         Intr = NewImageDimIntr;
4280       }
4281     }
4282   }
4283 
4284   // Optimize _mip away, when 'lod' is zero
4285   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4286     int64_t ConstantLod;
4287     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4288                  m_ICst(ConstantLod))) {
4289       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
4292         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4293         --CorrectedNumVAddrs;
4294       }
4295     }
4296   }
4297 
4298   // Rewrite the addressing register layout before doing anything else.
4299   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
4302     return false;
4303   }
4304 
4305   if (IsA16 && !ST.hasA16()) {
4306     // A16 not supported
4307     return false;
4308   }
4309 
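  // Pack the 16-bit gradient and/or address operands into dwords, then either
  // keep them as separate registers for the NSA encoding or concatenate them
  // into a single vector register.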
4310   if (IsA16 || IsG16) {
4311     if (Intr->NumVAddrs > 1) {
4312       SmallVector<Register, 4> PackedRegs;
4313 
4314       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4315                                 IsG16);
4316 
      // See also the non-A16 branch below.
4318       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4319 
4320       if (!UseNSA && PackedRegs.size() > 1) {
4321         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4322         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4323         PackedRegs[0] = Concat.getReg(0);
4324         PackedRegs.resize(1);
4325       }
4326 
4327       const unsigned NumPacked = PackedRegs.size();
4328       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4329         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4330         if (!SrcOp.isReg()) {
4331           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4332           continue;
4333         }
4334 
4335         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4336 
4337         if (I - Intr->VAddrStart < NumPacked)
4338           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4339         else
4340           SrcOp.setReg(AMDGPU::NoRegister);
4341       }
4342     }
4343   } else {
4344     // If the register allocator cannot place the address registers contiguously
4345     // without introducing moves, then using the non-sequential address encoding
4346     // is always preferable, since it saves VALU instructions and is usually a
4347     // wash in terms of code size or even better.
4348     //
4349     // However, we currently have no way of hinting to the register allocator
4350     // that MIMG addresses should be placed contiguously when it is possible to
4351     // do so, so force non-NSA for the common 2-address case as a heuristic.
4352     //
4353     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4354     // allocation when possible.
4355     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4356 
4357     if (!UseNSA && Intr->NumVAddrs > 1)
4358       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4359                                Intr->NumVAddrs);
4360   }
4361 
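  // Append an immediate operand recording whether A16 (bit 0) and G16 (bit 1)
  // are in use.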
4362   int Flags = 0;
4363   if (IsA16)
4364     Flags |= 1;
4365   if (IsG16)
4366     Flags |= 2;
4367   MI.addOperand(MachineOperand::CreateImm(Flags));
4368 
4369   if (BaseOpcode->Store) { // No TFE for stores?
4370     // TODO: Handle dmask trim
4371     Register VData = MI.getOperand(1).getReg();
4372     LLT Ty = MRI->getType(VData);
4373     if (!Ty.isVector() || Ty.getElementType() != S16)
4374       return true;
4375 
4376     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4377     if (RepackedReg != VData) {
4378       MI.getOperand(1).setReg(RepackedReg);
4379     }
4380 
4381     return true;
4382   }
4383 
4384   Register DstReg = MI.getOperand(0).getReg();
4385   LLT Ty = MRI->getType(DstReg);
4386   const LLT EltTy = Ty.getScalarType();
4387   const bool IsD16 = Ty.getScalarType() == S16;
4388   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4389 
4390   // Confirm that the return type is large enough for the dmask specified
4391   if (NumElts < DMaskLanes)
4392     return false;
4393 
4394   if (NumElts > 4 || DMaskLanes > 4)
4395     return false;
4396 
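  // DMaskLanes can only still be 0 here for atomics; treat that as a single
  // data element.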
4397   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4398   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4399 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4403   LLT RoundedTy;
4404 
  // S32 vector to cover all data, plus the TFE result element.
4406   LLT TFETy;
4407 
4408   // Register type to use for each loaded component. Will be S32 or V2S16.
4409   LLT RegTy;
4410 
4411   if (IsD16 && ST.hasUnpackedD16VMem()) {
4412     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4413     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4414     RegTy = S32;
4415   } else {
4416     unsigned EltSize = EltTy.getSizeInBits();
4417     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4418     unsigned RoundedSize = 32 * RoundedElts;
4419     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4420     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4421     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4422   }
4423 
4424   // The return type does not need adjustment.
4425   // TODO: Should we change s16 case to s32 or <2 x s16>?
4426   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4427     return true;
4428 
4429   Register Dst1Reg;
4430 
4431   // Insert after the instruction.
4432   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4433 
4434   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4435   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4436   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4437   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4438 
4439   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4440 
4441   MI.getOperand(0).setReg(NewResultReg);
4442 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4447 
4448   if (IsTFE) {
4449     Dst1Reg = MI.getOperand(1).getReg();
4450     if (MRI->getType(Dst1Reg) != S32)
4451       return false;
4452 
4453     // TODO: Make sure the TFE operand bit is set.
4454     MI.RemoveOperand(1);
4455 
4456     // Handle the easy case that requires no repack instructions.
4457     if (Ty == S32) {
4458       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4459       return true;
4460     }
4461   }
4462 
4463   // Now figure out how to copy the new result register back into the old
4464   // result.
4465   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4466 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4468 
4469   if (ResultNumRegs == 1) {
4470     assert(!IsTFE);
4471     ResultRegs[0] = NewResultReg;
4472   } else {
4473     // We have to repack into a new vector of some kind.
4474     for (int I = 0; I != NumDataRegs; ++I)
4475       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4476     B.buildUnmerge(ResultRegs, NewResultReg);
4477 
4478     // Drop the final TFE element to get the data part. The TFE result is
4479     // directly written to the right place already.
4480     if (IsTFE)
4481       ResultRegs.resize(NumDataRegs);
4482   }
4483 
4484   // For an s16 scalar result, we form an s32 result with a truncate regardless
4485   // of packed vs. unpacked.
4486   if (IsD16 && !Ty.isVector()) {
4487     B.buildTrunc(DstReg, ResultRegs[0]);
4488     return true;
4489   }
4490 
4491   // Avoid a build/concat_vector of 1 entry.
4492   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4493     B.buildBitcast(DstReg, ResultRegs[0]);
4494     return true;
4495   }
4496 
4497   assert(Ty.isVector());
4498 
4499   if (IsD16) {
4500     // For packed D16 results with TFE enabled, all the data components are
4501     // S32. Cast back to the expected type.
4502     //
    // TODO: We don't really need to use s32 load elements. We would only
    // need one cast for the TFE result if a multiple of v2s16 was used.
4505     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4506       for (Register &Reg : ResultRegs)
4507         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4508     } else if (ST.hasUnpackedD16VMem()) {
4509       for (Register &Reg : ResultRegs)
4510         Reg = B.buildTrunc(S16, Reg).getReg(0);
4511     }
4512   }
4513 
4514   auto padWithUndef = [&](LLT Ty, int NumElts) {
4515     if (NumElts == 0)
4516       return;
4517     Register Undef = B.buildUndef(Ty).getReg(0);
4518     for (int I = 0; I != NumElts; ++I)
4519       ResultRegs.push_back(Undef);
4520   };
4521 
4522   // Pad out any elements eliminated due to the dmask.
4523   LLT ResTy = MRI->getType(ResultRegs[0]);
4524   if (!ResTy.isVector()) {
4525     padWithUndef(ResTy, NumElts - ResultRegs.size());
4526     B.buildBuildVector(DstReg, ResultRegs);
4527     return true;
4528   }
4529 
4530   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4531   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4532 
4533   // Deal with the one annoying legal case.
4534   const LLT V3S16 = LLT::vector(3, 16);
4535   if (Ty == V3S16) {
4536     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4537     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4538     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4539     return true;
4540   }
4541 
4542   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4543   B.buildConcatVectors(DstReg, ResultRegs);
4544   return true;
4545 }
4546 
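// Legalize llvm.amdgcn.s.buffer.load by rewriting it to the
// G_AMDGPU_S_BUFFER_LOAD pseudo, attaching a memory operand, and widening
// results that are not a power-of-2 size.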
4547 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4548   LegalizerHelper &Helper, MachineInstr &MI) const {
4549   MachineIRBuilder &B = Helper.MIRBuilder;
4550   GISelChangeObserver &Observer = Helper.Observer;
4551 
4552   Register Dst = MI.getOperand(0).getReg();
4553   LLT Ty = B.getMRI()->getType(Dst);
4554   unsigned Size = Ty.getSizeInBits();
4555   MachineFunction &MF = B.getMF();
4556 
4557   Observer.changingInstr(MI);
4558 
4559   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4560     Ty = getBitcastRegisterType(Ty);
4561     Helper.bitcastDst(MI, Ty, 0);
4562     Dst = MI.getOperand(0).getReg();
4563     B.setInsertPt(B.getMBB(), MI);
4564   }
4565 
4566   // FIXME: We don't really need this intermediate instruction. The intrinsic
4567   // should be fixed to have a memory operand. Since it's readnone, we're not
4568   // allowed to add one.
4569   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4570   MI.RemoveOperand(1); // Remove intrinsic ID
4571 
4572   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4573   // TODO: Should this use datalayout alignment?
4574   const unsigned MemSize = (Size + 7) / 8;
4575   const Align MemAlign(4);
4576   MachineMemOperand *MMO = MF.getMachineMemOperand(
4577       MachinePointerInfo(),
4578       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4579           MachineMemOperand::MOInvariant,
4580       MemSize, MemAlign);
4581   MI.addMemOperand(MF, MMO);
4582 
4583   // There are no 96-bit result scalar loads, but widening to 128-bit should
4584   // always be legal. We may need to restore this to a 96-bit result if it turns
4585   // out this needs to be converted to a vector load during RegBankSelect.
4586   if (!isPowerOf2_32(Size)) {
4587     if (Ty.isVector())
4588       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4589     else
4590       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4591   }
4592 
4593   Observer.changedInstr(MI);
4594   return true;
4595 }
4596 
4597 // TODO: Move to selection
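// Pick the trap lowering based on whether a trap handler is installed and
// which HSA ABI version is in use.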
4598 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4599                                                 MachineRegisterInfo &MRI,
4600                                                 MachineIRBuilder &B) const {
4601   if (!ST.isTrapHandlerEnabled() ||
4602       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4603     return legalizeTrapEndpgm(MI, MRI, B);
4604 
4605   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4606     switch (*HsaAbiVer) {
4607     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4608     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4609       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4610     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4611       return ST.supportsGetDoorbellID() ?
4612           legalizeTrapHsa(MI, MRI, B) :
4613           legalizeTrapHsaQueuePtr(MI, MRI, B);
4614     }
4615   }
4616 
4617   llvm_unreachable("Unknown trap handler");
4618 }
4619 
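// Without an enabled AMDHSA trap handler, lower a trap to s_endpgm.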
4620 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4621     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4622   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4623   MI.eraseFromParent();
4624   return true;
4625 }
4626 
4627 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4628     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as an input, and insert a
  // trap instruction.
4630   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4631   Register LiveIn =
4632     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4633   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4634     return false;
4635 
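  // Per the trap handler ABI, the queue pointer is passed in SGPR0/SGPR1.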
4636   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4637   B.buildCopy(SGPR01, LiveIn);
4638   B.buildInstr(AMDGPU::S_TRAP)
4639       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4640       .addReg(SGPR01, RegState::Implicit);
4641 
4642   MI.eraseFromParent();
4643   return true;
4644 }
4645 
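// On subtargets that can retrieve the doorbell ID themselves, the queue
// pointer is not needed and a bare s_trap suffices.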
4646 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4647     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4648   B.buildInstr(AMDGPU::S_TRAP)
4649       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4650   MI.eraseFromParent();
4651   return true;
4652 }
4653 
4654 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4655     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4658   if (!ST.isTrapHandlerEnabled() ||
4659       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4660     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4661                                      "debugtrap handler not supported",
4662                                      MI.getDebugLoc(), DS_Warning);
4663     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4664     Ctx.diagnose(NoTrap);
4665   } else {
4666     // Insert debug-trap instruction
4667     B.buildInstr(AMDGPU::S_TRAP)
4668         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4669   }
4670 
4671   MI.eraseFromParent();
4672   return true;
4673 }
4674 
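// Lower llvm.amdgcn.image.bvh.intersect.ray to
// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY, flattening the vector operands into the
// scalar registers expected by the NSA-encoded MIMG instructions.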
4675 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4676                                                MachineIRBuilder &B) const {
4677   MachineRegisterInfo &MRI = *B.getMRI();
4678   const LLT S16 = LLT::scalar(16);
4679   const LLT S32 = LLT::scalar(32);
4680 
4681   Register DstReg = MI.getOperand(0).getReg();
4682   Register NodePtr = MI.getOperand(2).getReg();
4683   Register RayExtent = MI.getOperand(3).getReg();
4684   Register RayOrigin = MI.getOperand(4).getReg();
4685   Register RayDir = MI.getOperand(5).getReg();
4686   Register RayInvDir = MI.getOperand(6).getReg();
4687   Register TDescr = MI.getOperand(7).getReg();
4688 
4689   if (!ST.hasGFX10_AEncoding()) {
4690     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4691                                         "intrinsic not supported on subtarget",
4692                                         MI.getDebugLoc());
4693     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4694     return false;
4695   }
4696 
4697   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4699   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4700                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4701                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4702                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4703 
4704   SmallVector<Register, 12> Ops;
4705   if (Is64) {
4706     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4707     Ops.push_back(Unmerge.getReg(0));
4708     Ops.push_back(Unmerge.getReg(1));
4709   } else {
4710     Ops.push_back(NodePtr);
4711   }
4712   Ops.push_back(RayExtent);
4713 
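  // Only the first three lanes of each vec4 operand are used; the fourth lane
  // is dropped.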
4714   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4715     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4716     Ops.push_back(Unmerge.getReg(0));
4717     Ops.push_back(Unmerge.getReg(1));
4718     Ops.push_back(Unmerge.getReg(2));
4719   };
4720 
4721   packLanes(RayOrigin);
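  // In A16 mode, pack the six half-precision direction and inverse-direction
  // components into three dwords.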
4722   if (IsA16) {
4723     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4724     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4725     Register R1 = MRI.createGenericVirtualRegister(S32);
4726     Register R2 = MRI.createGenericVirtualRegister(S32);
4727     Register R3 = MRI.createGenericVirtualRegister(S32);
4728     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4729     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4730     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4731     Ops.push_back(R1);
4732     Ops.push_back(R2);
4733     Ops.push_back(R3);
4734   } else {
4735     packLanes(RayDir);
4736     packLanes(RayInvDir);
4737   }
4738 
4739   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4740     .addDef(DstReg)
4741     .addImm(Opcode);
4742 
4743   for (Register R : Ops) {
4744     MIB.addUse(R);
4745   }
4746 
4747   MIB.addUse(TDescr)
4748      .addImm(IsA16 ? 1 : 0)
4749      .cloneMemRefs(MI);
4750 
4751   MI.eraseFromParent();
4752   return true;
4753 }
4754 
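// Main dispatch point for intrinsics that need custom legalization. Anything
// not handled here (other than the image intrinsics caught in the default
// case) is already considered legal.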
4755 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4756                                             MachineInstr &MI) const {
4757   MachineIRBuilder &B = Helper.MIRBuilder;
4758   MachineRegisterInfo &MRI = *B.getMRI();
4759 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4761   auto IntrID = MI.getIntrinsicID();
4762   switch (IntrID) {
4763   case Intrinsic::amdgcn_if:
4764   case Intrinsic::amdgcn_else: {
4765     MachineInstr *Br = nullptr;
4766     MachineBasicBlock *UncondBrTarget = nullptr;
4767     bool Negated = false;
4768     if (MachineInstr *BrCond =
4769             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4770       const SIRegisterInfo *TRI
4771         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4772 
4773       Register Def = MI.getOperand(1).getReg();
4774       Register Use = MI.getOperand(3).getReg();
4775 
4776       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4777 
4778       if (Negated)
4779         std::swap(CondBrTarget, UncondBrTarget);
4780 
4781       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4782       if (IntrID == Intrinsic::amdgcn_if) {
4783         B.buildInstr(AMDGPU::SI_IF)
4784           .addDef(Def)
4785           .addUse(Use)
4786           .addMBB(UncondBrTarget);
4787       } else {
4788         B.buildInstr(AMDGPU::SI_ELSE)
4789             .addDef(Def)
4790             .addUse(Use)
4791             .addMBB(UncondBrTarget);
4792       }
4793 
4794       if (Br) {
4795         Br->getOperand(0).setMBB(CondBrTarget);
4796       } else {
4797         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4798         // since we're swapping branch targets it needs to be reinserted.
4799         // FIXME: IRTranslator should probably not do this
4800         B.buildBr(*CondBrTarget);
4801       }
4802 
4803       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4804       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4805       MI.eraseFromParent();
4806       BrCond->eraseFromParent();
4807       return true;
4808     }
4809 
4810     return false;
4811   }
4812   case Intrinsic::amdgcn_loop: {
4813     MachineInstr *Br = nullptr;
4814     MachineBasicBlock *UncondBrTarget = nullptr;
4815     bool Negated = false;
4816     if (MachineInstr *BrCond =
4817             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4818       const SIRegisterInfo *TRI
4819         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4820 
4821       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4822       Register Reg = MI.getOperand(2).getReg();
4823 
4824       if (Negated)
4825         std::swap(CondBrTarget, UncondBrTarget);
4826 
4827       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4828       B.buildInstr(AMDGPU::SI_LOOP)
4829         .addUse(Reg)
4830         .addMBB(UncondBrTarget);
4831 
4832       if (Br)
4833         Br->getOperand(0).setMBB(CondBrTarget);
4834       else
4835         B.buildBr(*CondBrTarget);
4836 
4837       MI.eraseFromParent();
4838       BrCond->eraseFromParent();
4839       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4840       return true;
4841     }
4842 
4843     return false;
4844   }
4845   case Intrinsic::amdgcn_kernarg_segment_ptr:
4846     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4847       // This only makes sense to call in a kernel, so just lower to null.
4848       B.buildConstant(MI.getOperand(0).getReg(), 0);
4849       MI.eraseFromParent();
4850       return true;
4851     }
4852 
4853     return legalizePreloadedArgIntrin(
4854       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4855   case Intrinsic::amdgcn_implicitarg_ptr:
4856     return legalizeImplicitArgPtr(MI, MRI, B);
4857   case Intrinsic::amdgcn_workitem_id_x:
4858     return legalizePreloadedArgIntrin(MI, MRI, B,
4859                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4860   case Intrinsic::amdgcn_workitem_id_y:
4861     return legalizePreloadedArgIntrin(MI, MRI, B,
4862                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4863   case Intrinsic::amdgcn_workitem_id_z:
4864     return legalizePreloadedArgIntrin(MI, MRI, B,
4865                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4866   case Intrinsic::amdgcn_workgroup_id_x:
4867     return legalizePreloadedArgIntrin(MI, MRI, B,
4868                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4869   case Intrinsic::amdgcn_workgroup_id_y:
4870     return legalizePreloadedArgIntrin(MI, MRI, B,
4871                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4872   case Intrinsic::amdgcn_workgroup_id_z:
4873     return legalizePreloadedArgIntrin(MI, MRI, B,
4874                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4875   case Intrinsic::amdgcn_dispatch_ptr:
4876     return legalizePreloadedArgIntrin(MI, MRI, B,
4877                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4878   case Intrinsic::amdgcn_queue_ptr:
4879     return legalizePreloadedArgIntrin(MI, MRI, B,
4880                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4881   case Intrinsic::amdgcn_implicit_buffer_ptr:
4882     return legalizePreloadedArgIntrin(
4883       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4884   case Intrinsic::amdgcn_dispatch_id:
4885     return legalizePreloadedArgIntrin(MI, MRI, B,
4886                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4887   case Intrinsic::amdgcn_fdiv_fast:
4888     return legalizeFDIVFastIntrin(MI, MRI, B);
4889   case Intrinsic::amdgcn_is_shared:
4890     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4891   case Intrinsic::amdgcn_is_private:
4892     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4893   case Intrinsic::amdgcn_wavefrontsize: {
4894     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4895     MI.eraseFromParent();
4896     return true;
4897   }
4898   case Intrinsic::amdgcn_s_buffer_load:
4899     return legalizeSBufferLoad(Helper, MI);
4900   case Intrinsic::amdgcn_raw_buffer_store:
4901   case Intrinsic::amdgcn_struct_buffer_store:
4902     return legalizeBufferStore(MI, MRI, B, false, false);
4903   case Intrinsic::amdgcn_raw_buffer_store_format:
4904   case Intrinsic::amdgcn_struct_buffer_store_format:
4905     return legalizeBufferStore(MI, MRI, B, false, true);
4906   case Intrinsic::amdgcn_raw_tbuffer_store:
4907   case Intrinsic::amdgcn_struct_tbuffer_store:
4908     return legalizeBufferStore(MI, MRI, B, true, true);
4909   case Intrinsic::amdgcn_raw_buffer_load:
4910   case Intrinsic::amdgcn_struct_buffer_load:
4911     return legalizeBufferLoad(MI, MRI, B, false, false);
4912   case Intrinsic::amdgcn_raw_buffer_load_format:
4913   case Intrinsic::amdgcn_struct_buffer_load_format:
4914     return legalizeBufferLoad(MI, MRI, B, true, false);
4915   case Intrinsic::amdgcn_raw_tbuffer_load:
4916   case Intrinsic::amdgcn_struct_tbuffer_load:
4917     return legalizeBufferLoad(MI, MRI, B, true, true);
4918   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4919   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4920   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4921   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4922   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4923   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4924   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4925   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4926   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4927   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4928   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4929   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4930   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4931   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4932   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4933   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4934   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4935   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4936   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4937   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4938   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4939   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4940   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4941   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4942   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4943   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4944   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4945   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4946   case Intrinsic::amdgcn_buffer_atomic_fadd:
4947   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4948   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4949   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4950   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4951     return legalizeBufferAtomic(MI, B, IntrID);
4952   case Intrinsic::amdgcn_atomic_inc:
4953     return legalizeAtomicIncDec(MI, B, true);
4954   case Intrinsic::amdgcn_atomic_dec:
4955     return legalizeAtomicIncDec(MI, B, false);
4956   case Intrinsic::trap:
4957     return legalizeTrapIntrinsic(MI, MRI, B);
4958   case Intrinsic::debugtrap:
4959     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4960   case Intrinsic::amdgcn_rsq_clamp:
4961     return legalizeRsqClampIntrinsic(MI, MRI, B);
4962   case Intrinsic::amdgcn_ds_fadd:
4963   case Intrinsic::amdgcn_ds_fmin:
4964   case Intrinsic::amdgcn_ds_fmax:
4965     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4966   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4967     return legalizeBVHIntrinsic(MI, B);
4968   default: {
4969     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4970             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4971       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4972     return true;
4973   }
4974   }
4975 
4976   return true;
4977 }
4978